gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
go
author Thad Hughes <thad.hughes@gmail.com>
Tue, 22 Nov 2011 04:33:50 +0000 (20:33 -0800)
committer Thad Hughes <thad.hughes@gmail.com>
Tue, 13 Dec 2011 18:39:44 +0000 (10:39 -0800)
13 files changed:
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
src/com/hughes/android/dictionary/engine/IndexBuilder.java
src/com/hughes/android/dictionary/engine/IndexedEntry.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java [moved from src/com/hughes/android/dictionary/WiktionarySplitter.java with 99% similarity]
src/com/hughes/android/dictionary/parser/DictFileParser.java
src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiHeading.java
src/com/hughes/android/dictionary/parser/WikiTokenizer.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiWord.java.old [new file with mode: 0644]

index ad8099491c9360abaa87271ced711715e7bac353..1be2af0064f772ccd7cbd9a65a3f2d511f8aa5ad 100644 (file)
@@ -137,7 +137,8 @@ public class DictionaryBuilder {
           if (enIndex < 0 || enIndex >= 2) {
             fatalError("Must be 1 or 2: " + prefix + "EnIndex");
           }
-          new EnWiktionaryXmlParser(dictionaryBuilder, langPattern, langCodePattern, enIndex).parse(file, Integer.parseInt(pageLimit));
+          new EnWiktionaryXmlParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex),
+              langPattern, langCodePattern, enIndex != 0).parse(file, Integer.parseInt(pageLimit));
         } else {
           fatalError("Invalid or missing input format: " + inputFormat);
         }
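
The parser constructor changed shape here: instead of the whole DictionaryBuilder plus an integer index, it now receives the English IndexBuilder, the other-language IndexBuilder, and a boolean swap flag. A minimal self-contained sketch of the selection logic, using plain strings as stand-ins for the two IndexBuilders (names and values are illustrative only):

import java.util.Arrays;
import java.util.List;

class IndexPickSketch {
  public static void main(String[] args) {
    // Stand-in for dictionaryBuilder.indexBuilders, which always holds exactly two indices.
    final List<String> indexBuilders = Arrays.asList("index0", "index1");
    final int enIndex = 1;                                    // 0 or 1, as validated just above
    final String enSide    = indexBuilders.get(enIndex);      // the English index
    final String otherSide = indexBuilders.get(1 - enIndex);  // the other language's index
    final boolean swap = enIndex != 0;  // presumably controls which side of each Pair the English text lands on
    System.out.println(enSide + " / " + otherSide + " / swap=" + swap);  // index1 / index0 / swap=true
  }
}
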
index c49305e87677ae45cc0c8e8f59e75e7fc72da849..17190a94995afd350c800a7d10d8c9cac37fecc6 100644 (file)
@@ -22,22 +22,10 @@ public class DictionaryBuilderMain extends TestCase {
   
   
   public static void main(final String[] args) throws Exception {
-    DictionaryBuilder.main(new String[] {
-        "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic",
-        "--lang1=DE",
-        "--lang2=EN",
-        "--dictInfo=@dictInputs/de-en_chemnitz.info",
-
-        "--input1=dictInputs/de-en_chemnitz.txt",
-        "--input1Name=chemnitz",
-        "--input1Charset=UTF8",
-        "--input1Format=chemnitz",
-    });
-
 
     Lang[] langs1 = new Lang[] { 
         new Lang("^English$", "EN"),
-        new Lang("^German$", "DE"),
+        //new Lang("^German$", "DE"),
     };
     Lang[] langs2 = new Lang[] { 
         new Lang("^Italian$", "IT"),
@@ -131,6 +119,18 @@ public class DictionaryBuilderMain extends TestCase {
       }  // langs2
     }  // langs1
 
+    DictionaryBuilder.main(new String[] {
+        "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic",
+        "--lang1=DE",
+        "--lang2=EN",
+        "--dictInfo=@dictInputs/de-en_chemnitz.info",
+
+        "--input1=dictInputs/de-en_chemnitz.txt",
+        "--input1Name=chemnitz",
+        "--input1Charset=UTF8",
+        "--input1Format=chemnitz",
+    });
+
     DictionaryBuilder.main(new String[] {
         "--dictOut=dictOutputs/de-en_all.quickdic",
         "--lang1=DE",
index a564b93bf107eadd2bfd2eab736897c0becc9b09..693db6c707b4b8bb93cf041aefc83d62df979534 100644 (file)
@@ -20,14 +20,15 @@ public class DictionaryBuilderTest extends TestCase {
         "--lang2=EN",
         "--dictInfo=SomeWikiData",
 
+        /*
         "--input3=wikiSplit/english.data",
-        "--input3Name=enwiktionary.italian",
+        "--input3Name=enwiktionary.english",
         "--input3Format=enwiktionary",
         "--input3LangPattern=Italian",
         "--input3LangCodePattern=it",
         "--input3EnIndex=2",
         "--input3PageLimit=1000",
-
+*/
         "--input4=wikiSplit/italian.data",
         "--input4Name=enwiktionary.italian",
         "--input4Format=enwiktionary",
index 172be90983e4d4bf22356e7178cf43b4ce6d11c4..cab33187d57f1e9bd6ae60ba6f50c77486d5bfc6 100644 (file)
@@ -12,6 +12,7 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 
 import com.hughes.android.dictionary.engine.Index.IndexEntry;
+import com.hughes.android.dictionary.parser.DictFileParser;
 
 
 public class IndexBuilder {
@@ -40,6 +41,10 @@ public class IndexBuilder {
 //      System.out.println("TOKEN: " + tokenData.token);
       for (final Map.Entry<EntryTypeName, List<IndexedEntry>> typeToEntry : tokenData.typeToEntries.entrySet()) {
         for (final IndexedEntry entryData : typeToEntry.getValue()) {
+          if (entryData.index() == -1) {
+            entryData.addToDictionary(dictionaryBuilder.dictionary);
+            assert entryData.index() >= 0;
+          }
           if (tokenEntryDatas.add(entryData)) {
             rows.add(new PairEntry.Row(entryData.index(), rows.size(), index));
             ++numRows;
@@ -97,12 +102,21 @@ public class IndexBuilder {
     return entries;
   }
 
-  public void addEntryWithTokens(final IndexedEntry entryData, final Set<String> tokens,
+  public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set<String> tokens,
       final EntryTypeName entryTypeName) {
     for (final String token : tokens) {
-      getOrCreateEntries(token, entryTypeName).add(entryData);
+      getOrCreateEntries(token, entryTypeName).add(indexedEntry);
     }    
   }
-  
 
+  public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
+      final EntryTypeName singleTokenEntryTypeName, final EntryTypeName multiTokenEntryTypeName) {
+    final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
+    addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? singleTokenEntryTypeName : multiTokenEntryTypeName);
+  }
+
+  public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
+      final EntryTypeName entryTypeName) {
+    addEntryWithString(indexedEntry, untokenizedString, entryTypeName, entryTypeName);
+  }
 }
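
The new addEntryWithString overloads tokenize the raw string with DictFileParser.NON_CHAR and pick the single-token or multi-token EntryTypeName depending on how many tokens come out. A self-contained sketch of that tokenize-and-dispatch step, assuming (not verified here) that DictFileParser.tokenize simply splits on the pattern and drops empty pieces:

import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;

class TokenizeSketch {
  // Same pattern as DictFileParser.NON_CHAR: runs of anything that is not a letter or digit.
  static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");

  static Set<String> tokenize(final String text) {
    final Set<String> tokens = new LinkedHashSet<String>();
    for (final String token : NON_CHAR.split(text)) {
      if (token.length() > 0) {
        tokens.add(token);
      }
    }
    return tokens;
  }

  public static void main(String[] args) {
    // "free kick" -> two tokens -> the multi-token entry type would be chosen;
    // "libero" alone -> one token -> the single-token entry type.
    final Set<String> tokens = tokenize("free kick");
    System.out.println(tokens + " -> " + (tokens.size() == 1 ? "SINGLE" : "MULTI"));
  }
}
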
diff --git a/src/com/hughes/android/dictionary/engine/IndexedEntry.java b/src/com/hughes/android/dictionary/engine/IndexedEntry.java
new file mode 100644 (file)
index 0000000..dedb679
--- /dev/null
@@ -0,0 +1,19 @@
+/**
+ * 
+ */
+package com.hughes.android.dictionary.engine;
+
+import com.hughes.util.IndexedObject;
+
+public class IndexedEntry extends IndexedObject {
+  public IndexedEntry(final AbstractEntry entry) {
+    super(-1);
+    this.entry = entry;
+  }
+  AbstractEntry entry;
+  
+  public void addToDictionary(Dictionary dictionary) {
+    assert index == -1;
+    index = entry.addToDictionary(dictionary);
+  }
+}
\ No newline at end of file
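
IndexedEntry starts life with a sentinel index of -1 and is only written into the Dictionary the first time a token actually references it (see the index() == -1 check added to IndexBuilder above). A minimal self-contained sketch of that lazy-assignment pattern, with toy stand-ins rather than the project's real classes:

import java.util.ArrayList;
import java.util.List;

class LazyIndexSketch {
  static class Dictionary {                       // toy stand-in for engine.Dictionary
    final List<String> entries = new ArrayList<String>();
    int add(final String entry) { entries.add(entry); return entries.size() - 1; }
  }

  static class Entry {                            // toy stand-in for engine.IndexedEntry
    final String text;
    int index = -1;                               // -1 means "not yet added to the dictionary"
    Entry(final String text) { this.text = text; }
    void addToDictionary(final Dictionary dictionary) {
      assert index == -1;
      index = dictionary.add(text);
    }
  }

  public static void main(String[] args) {
    final Dictionary dict = new Dictionary();
    final Entry e = new Entry("libero {m} :: free");
    if (e.index == -1) {                          // same guard as in IndexBuilder.build()
      e.addToDictionary(dict);
    }
    System.out.println(e.index);                  // 0: assigned on first use, never re-added
  }
}

This presumably keeps entries that are never referenced by any indexed token out of the dictionary file entirely.
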
similarity index 99%
rename from src/com/hughes/android/dictionary/WiktionarySplitter.java
rename to src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
index 89173c5c51da95c87e57f8e83be13b6d4c597a78..685b23867d008846dfcedf67611b45cb265c6fea 100644 (file)
@@ -1,4 +1,4 @@
-package com.hughes.android.dictionary;
+package com.hughes.android.dictionary.engine;
 
 import java.io.BufferedOutputStream;
 import java.io.DataOutputStream;
index 909a6b77577f573e50f6f3175454023b60c4d363..20611ae3c406f7e51c3cf77b314fc71b9a52c003 100644 (file)
@@ -42,7 +42,7 @@ public class DictFileParser {
   static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
   
   static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");
-  static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
+  public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
 
   static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$");
 
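
NON_CHAR becomes public so the Wiktionary parser can reuse it via IndexBuilder.addEntryWithString. For contrast, NON_CHAR_DASH keeps dashes and apostrophes inside tokens while NON_CHAR splits on them; a quick self-contained illustration:

import java.util.Arrays;
import java.util.regex.Pattern;

class SplitSketch {
  public static void main(String[] args) {
    final Pattern NON_CHAR      = Pattern.compile("[^\\p{L}0-9]+");    // as in DictFileParser
    final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");  // as in DictFileParser
    final String s = "it's a freeze-dried word";
    System.out.println(Arrays.asList(NON_CHAR.split(s)));       // [it, s, a, freeze, dried, word]
    System.out.println(Arrays.asList(NON_CHAR_DASH.split(s)));  // [it's, a, freeze-dried, word]
  }
}
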
index cfdf0f14ff6e0802ba6c783263bf8d36b0d32d81..95e910d9656a342eac1bc8b3799e238e91014f8f 100644 (file)
@@ -12,7 +12,11 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.EntryTypeName;
 import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.IndexedEntry;
+import com.hughes.android.dictionary.engine.PairEntry;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
 
 public class EnWiktionaryXmlParser {
   
@@ -29,19 +33,18 @@ public class EnWiktionaryXmlParser {
       "Particle|Interjection|Pronominal adverb" +
       "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
 
-  final DictionaryBuilder dictBuilder;
-  
-  final IndexBuilder[] indexBuilders;
+  final IndexBuilder enIndexBuilder;
+  final IndexBuilder otherIndexBuilder;
   final Pattern langPattern;
   final Pattern langCodePattern;
-  final int enIndexBuilder;
+  final boolean swap;
 
-  public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern langPattern, final Pattern langCodePattern, final int enIndexBuilder) {
-    this.dictBuilder = dictBuilder;
-    this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
+  public EnWiktionaryXmlParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
+    this.enIndexBuilder = enIndexBuilder;
+    this.otherIndexBuilder = otherIndexBuilder;
     this.langPattern = langPattern;
     this.langCodePattern = langCodePattern;
-    this.enIndexBuilder = enIndexBuilder;
+    this.swap = swap;
   }
 
   
@@ -92,7 +95,7 @@ public class EnWiktionaryXmlParser {
     if (heading.replaceAll("=", "").equals("English")) {
       doEnglishWord(title, text);
     } else {
-      //doForeignWord(title, text);
+      doForeignWord(title, text);
     }
         
   }  // endPage()
@@ -156,8 +159,8 @@ public class EnWiktionaryXmlParser {
         while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
           if (wikiFunction.name.equals("trans-top")) {
             sense = null;
-            if (wikiFunction.args.size() >= 2) {
-              sense = wikiFunction.args.get(1);
+            if (wikiFunction.args.size() >= 1) {
+              sense = wikiFunction.args.get(0);
               //System.out.println("Sense: " + sense);
             }
           } else if (wikiFunction.name.equals("trans-bottom")) {
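
The sense for a translation table now comes from args.get(0) rather than args.get(1): WikiFunction keeps the template name separately, so positional arguments are 0-based and the first argument of {{trans-top}} is the gloss itself. A hedged illustration (the wikitext is invented; WikiFunction is the project class exercised in WikiTokenizerTest further down):

// Hypothetical translation-table header, for illustration only:
//   {{trans-top|not imprisoned}}
// WikiFunction.getFunction(line) would give:
//   name        == "trans-top"
//   args.get(0) == "not imprisoned"   // the gloss; positional args exclude the name
// hence the guard args.size() >= 1 and the read from args.get(0).
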
@@ -182,75 +185,15 @@ public class EnWiktionaryXmlParser {
         if (colonIndex == -1) {
           continue;
         }
+        
         final String lang = line.substring(0, colonIndex);
         if (!this.langPattern.matcher(lang).find()) {
           continue;
         }
         
-        String rest = line.substring(colonIndex + 1);
-        final StringBuilder lineText = new StringBuilder();
+        String rest = line.substring(colonIndex + 1).trim();
+        doTranslationLine(line, title, sense, rest);
         
-        boolean ttbc = false;
-        WikiFunction wikiFunction;
-        while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
-          if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
-            if (wikiFunction.args.size() < 2) {
-              System.err.println("{{t}} with too few args: " + line + ", title=" + title);
-              continue;
-            }
-            final String langCode = wikiFunction.getArg(0);
-            if (this.langCodePattern.matcher(langCode).matches()) {
-              final String word = wikiFunction.getArg(1);
-              final String gender = wikiFunction.getArg(2);
-              final String transliteration = wikiFunction.getNamedArg("tr");
-            }
-          } else if (wikiFunction.name.equals("qualifier")) {
-            String qualifier = wikiFunction.getArg(0);
-          } else if (encodings.contains(wikiFunction.name)) {
-            rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0));
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
-            String gender = wikiFunction.name;
-            for (int i = 0; i < wikiFunction.args.size(); ++i) {
-              gender += "|" + wikiFunction.getArg(i);
-            }
-            rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}");
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("g")) {
-            rest = wikiFunction.replaceWith(rest, "{g}");
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("l")) {
-            // encodes text in various langs.
-            rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(1));
-            // TODO: transliteration
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("term")) {
-            // cross-reference to another dictionary
-            rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0));
-            // TODO: transliteration
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
-            // TODO: put this text aside to use it.
-            rest = wikiFunction.replaceWith(rest, "[" + wikiFunction.getArg(0) + "]");
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("ttbc")) {
-            ttbc = true;
-          } else if (wikiFunction.name.equals("trreq")) {
-          } else if (wikiFunction.name.equals("not used")) {
-            rest = wikiFunction.replaceWith(rest, "[not used]");
-            wikiFunction = null;
-          } else if (wikiFunction.name.equals("t-image")) {
-            // American sign language
-          } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
-            rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}");
-            wikiFunction = null;
-          } else {
-            System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
-          }
-          if (wikiFunction != null) {
-            rest = wikiFunction.replaceWith(rest, "");
-          }
-        }
       } else if (line.equals("")) {
       } else if (line.startsWith(":")) {
       } else if (line.startsWith("[[") && line.endsWith("]]")) {
@@ -265,6 +208,118 @@ public class EnWiktionaryXmlParser {
     
   }
   
+  private void doTranslationLine(final String line, final String title, final String sense, String rest) {
+
+    // Good chance we'll actually file this one...
+    final PairEntry pairEntry = new PairEntry();
+    final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+
+    final StringBuilder otherText = new StringBuilder();
+    
+    WikiFunction wikiFunction;
+    while ((wikiFunction = WikiFunction.getFunction(rest)) != null) {
+      if (wikiFunction.start > 0) {
+        String plainText = rest.substring(0, wikiFunction.start); 
+        otherText.append("").append(plainText);
+        otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
+      }
+      rest = rest.substring(wikiFunction.end);
+      
+      if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
+        if (wikiFunction.args.size() < 2) {
+          System.err.println("{{t}} with too few args: " + line + ", title=" + title);
+          continue;
+        }
+        final String langCode = wikiFunction.getArg(0);
+        if (this.langCodePattern.matcher(langCode).matches()) {
+          final String word = wikiFunction.getArg(1);
+          final String gender = wikiFunction.getArg(2);
+          final String transliteration = wikiFunction.getNamedArg("tr");
+          if (otherText.length() > 0) {
+            otherText.append("");
+          }
+          otherText.append(word);
+          otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+          if (gender != null) {
+            otherText.append(String.format(" {%s}", gender));
+          }
+          if (transliteration != null) {
+            otherText.append(String.format(" (tr. %s)", transliteration));
+            otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+          }
+        }
+      } else if (wikiFunction.name.equals("qualifier")) {
+        String qualifier = wikiFunction.getArg(0);
+        if (!wikiFunction.namedArgs.isEmpty() || wikiFunction.args.size() > 1) {
+          System.err.println("weird qualifier: " + line);
+        }
+        otherText.append("(").append(qualifier).append(")");
+      } else if (encodings.contains(wikiFunction.name)) {
+        otherText.append("").append(wikiFunction.getArg(0));
+        otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+      } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
+        otherText.append("{");
+        otherText.append(wikiFunction.name);
+        for (int i = 0; i < wikiFunction.args.size(); ++i) {
+          otherText.append("|").append(wikiFunction.getArg(i));
+        }
+        otherText.append("}");
+      } else if (wikiFunction.name.equals("g")) {
+        otherText.append("{g}");
+      } else if (wikiFunction.name.equals("l")) {
+        // encodes text in various langs.
+        // lang is arg 0.
+        otherText.append("").append(wikiFunction.getArg(1));
+        otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(1), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+        // TODO: transliteration
+      } else if (wikiFunction.name.equals("term")) {
+        // cross-reference to another dictionary
+        otherText.append("").append(wikiFunction.getArg(0));
+        otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+        // TODO: transliteration
+      } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
+        // TODO: put this text aside to use it.
+        otherText.append("[").append(wikiFunction.getArg(0)).append("]");
+        otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+      } else if (wikiFunction.name.equals("ttbc")) {
+      } else if (wikiFunction.name.equals("trreq")) {
+      } else if (wikiFunction.name.equals("not used")) {
+        otherText.append("(not used)");
+      } else if (wikiFunction.name.equals("t-image")) {
+        // American sign language
+      } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
+        otherText.append("{UNK. FUNC.: ").append(wikiFunction.name).append("}");
+      } else {
+        System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
+      }
+    }
+    String plainText = rest; 
+    otherText.append("").append(plainText);
+    otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
+    
+    StringBuilder englishText = new StringBuilder();
+    
+    englishText.append(title);
+    if (sense != null) {
+      englishText.append(" (").append(sense).append(")");
+      enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
+    }
+    if (pos != null) {
+      englishText.append(" (").append(pos.toLowerCase()).append(")");
+    }
+    enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+    
+    final Pair pair = new Pair(englishText.toString(), WikiParser.simpleParse(otherText.toString()), swap);
+    pairEntry.pairs.add(pair);
+    assert (pairsAdded.add(pair.toString()));
+    if (pair.toString().equals("libero {m} :: free (adjective)")) {
+      System.out.println();
+    }
+
+  }
+  
+  Set<String> pairsAdded = new LinkedHashSet<String>();
+  
   // -------------------------------------------------------------------------
   
   private void doForeignWord(String title, String text) {
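
The doTranslationLine method added in this hunk replaces the inline loop deleted above: it peels one WikiFunction at a time off the remainder of a translation bullet, sends plain text straight into otherText, expands {{t}}/{{t+}}/{{t-}}/{{tø}} into the foreign word plus optional gender and tr= transliteration, and finally pairs the accumulated text with the English title (and sense, when present). A hedged walk-through of one invented line, using the field and argument names from the diff:

// Hypothetical input: title = "free", sense = "not imprisoned",
//   rest = "{{t+|it|libero|m}}, {{t-|it|gratuito|m}}"
//
// 1st iteration: wikiFunction = {{t+|it|libero|m}}
//   langCode = getArg(0) = "it"      -> matches langCodePattern
//   word     = getArg(1) = "libero"  -> indexed as WIKTIONARY_TITLE_SINGLE
//   gender   = getArg(2) = "m"       -> otherText so far: "libero {m}"
// The plain text ", " between the two templates is appended and indexed as WIKTIONARY_OTHER_TEXT.
// 2nd iteration handles {{t-|it|gratuito|m}} the same way -> "libero {m}, gratuito {m}"
//
// englishText = "free (not imprisoned)" (plus a lower-cased part of speech when pos is set),
// and the two sides become a single PairEntry.Pair, oriented by the swap flag.
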
@@ -273,15 +328,33 @@ public class EnWiktionaryXmlParser {
     while ((line = wikiLineReader.readLine()) != null) {
       final WikiHeading wikiHeading = WikiHeading.getHeading(line);
       if (wikiHeading != null) {
-        
         if (wikiHeading.name.equals("Translations")) {
           System.err.println("Translations not in English section: " + title);
         } else if (wikiHeading.name.equals("Pronunciation")) {
           //doPronunciation(wikiLineReader);
         } else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
-          
+          doPartOfSpeech(title, wikiHeading, wikiLineReader);
+        }
+      }
+    }
+  }
+
+
+  private void doPartOfSpeech(String title, final WikiHeading posHeading, WikiLineReader wikiLineReader) {
+    String line;
+    System.out.println("***" + title);
+    System.out.println(posHeading.name);
+    while ((line = wikiLineReader.readLine()) != null) {
+      WikiHeading heading = WikiHeading.getHeading(line);
+      if (heading != null) {
+        if (heading.depth <= posHeading.depth) {
+          wikiLineReader.stuffLine(line);
+          return;
         }
       }
+      System.out.println(line);
+      
+      
     }
   }
 
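
doPartOfSpeech consumes lines until it reaches a heading at the same or shallower depth as the part-of-speech heading, then hands that line back to the reader with stuffLine() so the caller sees it again. A minimal self-contained sketch of that push-back pattern, with a toy reader rather than the project's WikiLineReader:

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;

class PushBackSketch {
  static class LineReader {                                  // toy stand-in for WikiLineReader
    private final Iterator<String> lines;
    private final Deque<String> stuffed = new ArrayDeque<String>();
    LineReader(final Iterable<String> lines) { this.lines = lines.iterator(); }
    String readLine() {
      if (!stuffed.isEmpty()) return stuffed.pop();
      return lines.hasNext() ? lines.next() : null;
    }
    void stuffLine(final String line) { stuffed.push(line); } // un-read one line
  }

  public static void main(String[] args) {
    final LineReader r = new LineReader(Arrays.asList(
        "===Noun===", "# a definition", "===Verb===", "# another definition"));
    System.out.println("pos heading: " + r.readLine());       // "===Noun==="
    String line;
    while ((line = r.readLine()) != null) {
      if (line.startsWith("===")) {                           // next section: push back and stop
        r.stuffLine(line);
        break;
      }
      System.out.println("body: " + line);                    // "# a definition"
    }
    System.out.println("caller sees: " + r.readLine());       // "===Verb===" again
  }
}
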
diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old
new file mode 100644 (file)
index 0000000..75f2121
--- /dev/null
@@ -0,0 +1,647 @@
+package com.hughes.android.dictionary.parser;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.parser.WikiWord.FormOf;
+import com.hughes.android.dictionary.parser.WikiWord.Translation;
+import com.hughes.util.ListUtil;
+import com.hughes.util.StringUtil;
+
+public class EnWiktionaryXmlParserOld extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+  
+  static final Pattern partOfSpeechHeader = Pattern.compile(
+      "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+      "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+      "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+      "Ligature|Idiom|Phrase|" +
+      // These are @deprecated:
+      "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+      "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
+      // These are extras I found:
+      "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
+      "Particle|Interjection|Pronominal adverb" +
+      "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
+
+  static final Pattern wikiMarkup =  Pattern.compile("\\[\\[|\\]\\]|''+");
+
+  final DictionaryBuilder dictBuilder;
+  
+  final IndexBuilder[] indexBuilders;
+  final Pattern[] langPatterns;
+  final int enIndexBuilder;
+
+  StringBuilder titleBuilder;
+  StringBuilder textBuilder;
+  StringBuilder currentBuilder = null;
+  
+  static void assertTrue(final boolean condition) {
+    assertTrue(condition, "");
+  }
+
+  static void assertTrue(final boolean condition, final String message) {
+    if (!condition) {
+      System.err.println("Assertion failed, message: " + message);
+      new RuntimeException().printStackTrace(System.err);
+    }
+  }
+
+  public EnWiktionaryXmlParserOld(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) {
+    assertTrue(langPatterns.length == 2);
+    this.dictBuilder = dictBuilder;
+    this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
+    this.langPatterns = langPatterns;
+    this.enIndexBuilder = enIndexBuilder;
+  }
+
+  @Override
+  public void startElement(String uri, String localName, String qName,
+      Attributes attributes) {
+    currentBuilder = null;
+    if ("page".equals(qName)) {
+      titleBuilder = new StringBuilder();
+      
+      // Start with "\n" to better match certain strings.
+      textBuilder = new StringBuilder("\n");
+    } else if ("title".equals(qName)) {
+      currentBuilder = titleBuilder;
+    } else if ("text".equals(qName)) {
+      currentBuilder = textBuilder;
+    }
+  }
+
+  @Override
+  public void characters(char[] ch, int start, int length) throws SAXException {
+    if (currentBuilder != null) {
+      currentBuilder.append(ch, start, length);
+    }
+  }
+
+  @Override
+  public void endElement(String uri, String localName, String qName)
+      throws SAXException {
+    currentBuilder = null;
+    if ("page".equals(qName)) {
+      endPage();
+    }
+  }
+  
+
+  public void parse(final File file) throws ParserConfigurationException,
+      SAXException, IOException {
+    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+    parser.parse(file, this);
+  }
+  
+  int pageCount = 0;
+  private void endPage() {
+    title = titleBuilder.toString();
+    ++pageCount;
+    if (pageCount % 1000 == 0) {
+      System.out.println("pageCount=" + pageCount);
+    }
+    if (title.startsWith("Wiktionary:") ||
+        title.startsWith("Template:") ||
+        title.startsWith("Appendix:") ||
+        title.startsWith("Category:") ||
+        title.startsWith("Index:") ||
+        title.startsWith("MediaWiki:") ||
+        title.startsWith("TransWiki:") ||
+        title.startsWith("Citations:") ||
+        title.startsWith("Concordance:") ||
+        title.startsWith("Help:")) {
+      return;
+    }
+    currentDepth = 0;
+    words.clear();
+    currentHeading = null;
+    insidePartOfSpeech = false;
+//    System.err.println("Working on page: " + title);
+    try {
+      WikiParser.parse(textBuilder.toString(), this);
+    } catch (Throwable e) {
+      System.err.println("Failure on page: " + title);
+      e.printStackTrace(System.err); 
+    }
+
+   for (final WikiWord word : words) {
+     word.wikiWordToQuickDic(dictBuilder, enIndexBuilder);
+   }  // WikiWord
+   
+  }  // endPage()
+
+
+  // ------------------------------------------------------------------------
+  // ------------------------------------------------------------------------
+  // ------------------------------------------------------------------------
+  // ------------------------------------------------------------------------
+
+  /**
+   * Two things can happen:
+   * 
+   * We can be in a ==German== section.  There we will see English definitions.
+   * Each POS should get its own QuickDic entry.  Pretty much everything goes
+   * in.
+   * 
+   * Or we can be in an ==English== section with English definitions
+   * and maybe see translations for languages we care about.
+   * 
+   * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
+   * into separate QuickDic entries, but that's tricky--how do we know when we
+   * found a subsection?  Just ignore anything containing pronunciation and
+   * etymology?
+   * 
+   * How do we decide when to seal the deal on an entry?
+   * 
+   * Would be nice if the parser told us about leaving sections....
+   * 
+   * 
+   */
+
+  String title;
+  String currentHeading;
+  int currentDepth;
+  final List<WikiWord> words = new ArrayList<WikiWord>();
+  WikiWord currentWord;
+  WikiWord.PartOfSpeech currentPartOfSpeech;
+  WikiWord.TranslationSense currentTranslationSense;
+  boolean insidePartOfSpeech;
+  
+  StringBuilder wikiBuilder = null;
+  
+  @Override
+  public void onWikiLink(String[] args) {
+    if (wikiBuilder == null) {
+      return;
+    }
+    wikiBuilder.append(args[args.length - 1]);
+  }
+  
+  // ttbc: translations to be checked.
+  static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
+      "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", 
+      "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", 
+      "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx"));
+  static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g"));
+  static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf", "pf.", "indeclinable"));
+  static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
+
+  @Override
+  public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
+    if (positionalArgs.isEmpty()) {
+      // This happens very rarely with special templates.
+      return;
+    }
+    final String name = positionalArgs.get(0);
+    
+    namedArgs.remove("lang");
+    namedArgs.remove("nocat");
+    namedArgs.remove("nocap");
+    namedArgs.remove("sc");
+
+    // Pronunciation
+    if (currentWord != null) {
+      if (name.equals("a")) {
+        // accent tag
+        currentWord.currentPronunciation = new StringBuilder();
+        currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
+        return;
+      }
+      
+      if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) {
+        namedArgs.remove("lang");
+        for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
+          final String pron = namedArgs.remove("" + i);
+          if (pron != null) {
+            positionalArgs.add(pron);
+          } else {
+            if (i > 10) {
+              break;
+            }
+          }
+        }
+        if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) {
+          System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString());
+        }
+        if (currentWord.currentPronunciation == null) {
+          currentWord.currentPronunciation = new StringBuilder();
+          currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
+        }
+        if (currentWord.currentPronunciation.length() > 0) {
+          currentWord.currentPronunciation.append("; ");
+        }
+        for (int i = 1; i < positionalArgs.size(); ++i) {
+          if (i > 1) {
+            currentWord.currentPronunciation.append(",");
+          }
+          final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
+          currentWord.currentPronunciation.append(pron).append("");
+        }
+        currentWord.currentPronunciation.append(" (").append(name).append(")");
+        return;
+      }
+      
+      if (name.equals("qualifier")) {
+        //assertTrue(positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString());
+        if (wikiBuilder == null) {
+          return;
+        }
+        wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
+        return;
+      }
+      
+      if (name.equals("...")) {
+        // Skipping any elided text for brevity.
+        wikiBuilder.append("...");
+        return;
+      }
+      
+      if (passThroughTemplates.contains(name)) {
+        assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs);
+        wikiBuilder.append(name);
+        return;
+      }
+      
+      if (ignoreTemplates.contains(name)) {
+        return;
+      }
+      
+      if ("Pronunciation".equals(currentHeading)) {
+        System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs);
+        return;
+      }
+    }  // Pronunciation
+    
+    // Part of speech
+    if (insidePartOfSpeech) {
+      
+      // form of
+      if (name.equals("form of")) {
+        namedArgs.remove("sc");
+        if (positionalArgs.size() < 3 || positionalArgs.size() > 4) {
+          System.err.println("Invalid form of.");
+        }
+        final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3);
+        final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1));
+        currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token));
+        return;
+      }
+      
+      // The fallback plan: append the template!
+      if (wikiBuilder != null) {
+        wikiBuilder.append("{");
+        boolean first = true;
+        for (final String arg : positionalArgs) {
+          if (!first) {
+            wikiBuilder.append(", ");
+          }
+          first = false;
+          wikiBuilder.append(arg);
+        }
+        // This one isn't so useful.
+        for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
+          if (!first) {
+            wikiBuilder.append(", ");
+          }
+          first = false;
+          wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue());
+        }
+        wikiBuilder.append("}");
+      }
+      
+      //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs);
+      return;
+    }  // Part of speech
+
+    
+    // Translations
+    if (name.equals("trans-top")) {
+      assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs + title);
+      
+      if (currentPartOfSpeech == null) {
+        assertTrue(currentWord != null && !currentWord.partsOfSpeech.isEmpty(),  title); 
+        System.err.println("Assuming last part of speech for non-nested translation section: " + title);
+        currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
+      }
+      
+      currentTranslationSense = new WikiWord.TranslationSense();
+      currentPartOfSpeech.translationSenses.add(currentTranslationSense);
+      if (positionalArgs.size() > 1) {
+        currentTranslationSense.sense = positionalArgs.get(1);
+      }
+      return;
+    }  // Translations
+
+    if (wikiBuilder == null) {
+      return;
+    }    
+    if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
+      assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs.toString());
+      wikiBuilder.append("{");
+      for (int i = 1; i < positionalArgs.size(); ++i) {
+        wikiBuilder.append(i > 1 ? "," : "");
+        wikiBuilder.append(positionalArgs.get(i));
+      }
+      wikiBuilder.append(name).append("}");
+      
+    } else  if (name.equals("p")) {
+      assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty());
+      wikiBuilder.append("pl.");
+
+    } else  if (name.equals("s")) {
+      assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"), title);
+      wikiBuilder.append("sg.");
+      
+    } else  if (grammarTemplates.contains(name)) {
+      assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
+      wikiBuilder.append(name).append(".");
+
+    } else  if (name.equals("l")) {
+      // This template is designed to generate a link to a specific language-section on the target page.
+      wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
+      
+    } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
+      if (positionalArgs.size() > 2) {
+        wikiBuilder.append(positionalArgs.get(2));
+      }
+      for (int i = 3; i < positionalArgs.size(); ++i) {
+        wikiBuilder.append(i == 3 ? " {" : ",");
+        wikiBuilder.append(positionalArgs.get(i));
+        wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : "");
+      }
+      final String transliteration = namedArgs.remove("tr");
+      if (transliteration != null) {
+        wikiBuilder.append(" (").append(transliteration).append(")");
+      }
+      
+    } else  if (name.equals("trreq")) {
+      wikiBuilder.append("{{trreq}}");
+      
+    } else if (name.equals("qualifier")) {
+      //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
+      wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
+      
+    } else if (useRemainingArgTemplates.contains(name)) {
+      for (int i = 1; i < positionalArgs.size(); ++i) {
+        if (i != 1) {
+          wikiBuilder.append(", ");
+        }
+        wikiBuilder.append(positionalArgs.get(i));
+      }
+    } else if (ignoreTemplates.contains(name)) {
+      // Do nothing.
+      
+    } else if (name.equals("initialism")) {
+      assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
+      wikiBuilder.append("Initialism");
+    } else if (name.equals("abbreviation")) {
+      assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
+      wikiBuilder.append("Abbreviation");
+    } else if (name.equals("acronym")) {
+      assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
+      wikiBuilder.append("Acronym");
+    } else {
+      if (currentTranslationSense != null) {
+        System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs);
+      }
+    }
+  }
+
+  @Override
+  public void onText(String text) {
+    if (wikiBuilder != null) {
+      wikiBuilder.append(text);
+      return;
+    }
+  }
+
+  @Override
+  public void onHeadingStart(int depth) {
+    wikiBuilder = new StringBuilder();
+    currentDepth = depth;
+    if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
+      currentPartOfSpeech = null;
+      insidePartOfSpeech = false;
+    }
+    if (currentWord != null && depth <= currentWord.depth) {
+      currentWord = null;
+    }
+    
+    currentHeading = null;
+  }
+  
+  @Override
+  public void onHeadingEnd(int depth) {
+    final String name = wikiBuilder.toString().trim();
+    wikiBuilder = null;
+    currentTranslationSense = null;
+    currentHeading = name;
+    
+    final boolean lang0 = langPatterns[0].matcher(name).matches();
+    final boolean lang1 = langPatterns[1].matcher(name).matches();
+    if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) {
+      currentWord = new WikiWord(title, depth);
+      if (lang0 && lang1) {
+        System.err.println("Word is indexed in both index1 and index2: " + title);
+      }
+      currentWord.language = name;
+      currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1);
+      words.add(currentWord);
+      return;
+    }
+    
+    if (currentWord == null) {
+      return;
+    }
+    
+    if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
+      currentPartOfSpeech = null;
+    }
+    
+    insidePartOfSpeech = false;
+    if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) {
+      currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
+      currentWord.partsOfSpeech.add(currentPartOfSpeech);
+      insidePartOfSpeech = true;
+      return;
+    }
+    
+    if (name.equals("Translations")) {
+      if (currentWord == null || 
+          !currentWord.language.equals("English") || 
+          currentPartOfSpeech == null) {
+        System.err.println("Unexpected Translations section: " + title);
+        return;
+      }
+      currentTranslationSense = new WikiWord.TranslationSense();
+    }
+    
+  }
+
+  @Override
+  public void onListItemStart(String header, int[] section) {
+    wikiBuilder = new StringBuilder();
+    if (currentWord != null) {
+      currentWord.currentPronunciation = null;
+    }
+  }
+  
+
+  @Override
+  public void onListItemEnd(String header, int[] section) {
+    String item = wikiBuilder.toString().trim();
+    if (item.length() == 0) {
+      return;
+    }
+    item = WikiParser.simpleParse(item);
+    wikiBuilder = null;
+        
+    // Part of speech
+    if (insidePartOfSpeech) {
+      assert currentPartOfSpeech != null : title + item;
+      if (header.equals("#") || 
+          header.equals("##") || 
+          header.equals("###") || 
+          header.equals("####") || 
+          header.equals(":#") || 
+          header.equals("::") ||
+          header.equals(":::*")) {
+        // Definition.
+        // :: should append, probably.
+        currentPartOfSpeech.newMeaning().meaning = item;
+        
+      // Source
+      } else if (header.equals("#*") ||
+                 header.equals("##*") ||
+                 header.equals("###*")) {
+        currentPartOfSpeech.lastMeaning().newExample().source = item;
+        
+      // Example
+      } else if (header.equals("#:") || 
+                 header.equals("#*:") || 
+                 header.equals("#:*") || 
+                 header.equals("##:") || 
+                 header.equals("##*:") || 
+                 header.equals("#:*:") || 
+                 header.equals("#:*#") ||
+                 header.equals("#*:") ||
+                 header.equals("*:") || 
+                 header.equals("#:::") ||
+                 header.equals("#**") ||
+                 header.equals("#*:::") ||
+                 header.equals("#:#") ||
+                 header.equals(":::") ||
+                 header.equals("##:*") ||
+                 header.equals("###*:")) {
+        StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item);
+        
+      // Example in English
+      } else if (header.equals("#::") || 
+                 header.equals("#*::") || 
+                 header.equals("#:**") ||
+                 header.equals("#*#") ||
+                 header.equals("##*::")) {
+        StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item);
+        
+      // Skip
+      } else if (header.equals("*") ||
+                 header.equals("**") ||
+                 header.equals("***") || 
+                 header.equals("*#") ||
+                 header.equals(":") ||
+                 header.equals("::*") ||
+                 header.equals("#**") ||
+                 header.equals(":*") ||
+                 header.equals("#*:*") ||
+                 header.equals("#*:**") || 
+                 header.equals("#*:#") || 
+                 header.equals("#*:*:") || 
+                 header.equals("#*:*") || 
+                 header.equals(";")) {
+        // might have: * {{seeCites}}
+        // * [[w:Arabic numerals|Arabic numerals]]: 2
+        //assert item.trim().length() == 0;
+        System.err.println("Skipping meaning: " + header + " " + item);
+      } else {
+        if (title.equals("Yellowknife")) {
+          return;
+        }
+        System.err.println("Busted heading: " + title + "  "+ header + " " + item);
+      }
+      return;
+    }
+    // Part of speech
+    
+    // Translation
+    if (currentTranslationSense != null) {
+      if (item.indexOf("{{[trreq]{}}}") != -1) {
+        return;
+      }
+
+      if (currentPartOfSpeech.translationSenses.isEmpty()) {
+        currentPartOfSpeech.translationSenses.add(currentTranslationSense);
+      }
+
+      final int colonPos = item.indexOf(':');
+      if (colonPos == -1) {
+        System.err.println("Invalid translation: title=" + title +  ",  item=" + item);
+        return;
+      }
+      final String lang = item.substring(0, colonPos);
+      final String trans = item.substring(colonPos + 1).trim();
+      for (int i = 0; i < 2; ++i) {
+        if (langPatterns[i].matcher(lang).find()) {
+          currentTranslationSense.translations.get(i).add(new Translation(lang, trans));
+        }
+      }
+    } // Translation
+  }
+
+  @Override
+  public void onNewLine() {
+  }
+
+  @Override
+  public void onNewParagraph() {
+  }
+
+  // ----------------------------------------------------------------------
+  
+  @Override
+  public void onComment(String text) {
+  }
+
+  @Override
+  public void onFormatBold(boolean boldOn) {
+  }
+
+  @Override
+  public void onFormatItalic(boolean italicOn) {
+  }
+
+  @Override
+  public void onUnterminated(String start, String rest) {
+    System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest);
+  }
+  @Override
+  public void onInvalidHeaderEnd(String rest) {
+    throw new RuntimeException(rest);
+  }
+
+}
index b8ca6f9cf3500bbebc1a83a5c35488c0f638eb20..1b6aeee129dd6e97c22ff7e0c05f097a69932ce7 100644 (file)
@@ -3,10 +3,12 @@ package com.hughes.android.dictionary.parser;
 public class WikiHeading {
   public final int depth;
   public final String name;
+  public final String prefix;
   
-  public WikiHeading(int depth, String name) {
+  public WikiHeading(int depth, String name, String prefix) {
     this.depth = depth;
     this.name = name;
+    this.prefix = prefix;
   }
 
   public static WikiHeading getHeading(String line) {
@@ -22,7 +24,7 @@ public class WikiHeading {
       System.err.println("Invalid heading: " + line);
       return null;
     }
-    return new WikiHeading(i, line.substring(i, line.length() - i).trim());
+    return new WikiHeading(i, line.substring(i, line.length() - i).trim(), prefix);
   }
   
 }
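
getHeading counts the leading run of '=' characters for the depth, requires a matching trailing run, and with this change also carries the raw prefix along. A self-contained sketch of the depth/name extraction, which is not the project's exact validation but agrees with the cases in WikiTokenizerTest.testWikiHeading below:

class HeadingSketch {
  // Returns "depth:name" for a well-formed heading, null otherwise.
  static String parse(final String line) {
    int i = 0;
    while (i < line.length() && line.charAt(i) == '=') ++i;
    if (i == 0 || line.length() < 2 * i || !line.endsWith(line.substring(0, i))) {
      return null;                                 // no '=' prefix, or unbalanced suffix
    }
    final String name = line.substring(i, line.length() - i).trim();
    if (name.isEmpty() || name.indexOf('=') != -1) {
      return null;                                 // empty name or stray '=': malformed
    }
    return i + ":" + name;
  }

  public static void main(String[] args) {
    System.out.println(parse("==English=="));      // 2:English
    System.out.println(parse("===Noun==="));       // 3:Noun
    System.out.println(parse("plain text"));       // null
  }
}
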
diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java
new file mode 100644 (file)
index 0000000..d028acb
--- /dev/null
@@ -0,0 +1,206 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public final class WikiTokenizer {
+
+  //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
+  private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|<!--|''|$)", Pattern.MULTILINE);
+  private static final String listChars = "*#:;";
+  
+    
+    final String wikiText;
+    final Matcher matcher;
+
+    boolean justReturnedNewline = true;
+    int end = 0;
+    int start = -1;
+
+    public String header;
+    public int headerDepth;
+    
+    final List<String> tokenStack = new ArrayList<String>();
+    
+  public WikiTokenizer(final String wikiText) {
+    this.wikiText = wikiText;
+    this.matcher = wikiTokenEvent.matcher(wikiText);
+  }
+    
+  private void clear() {
+    header = null;
+    headerDepth = 0;
+    tokenStack.clear();
+  }
+
+
+  public WikiTokenizer nextToken() {
+    this.clear();
+    
+    start = end;
+    
+    final int len = wikiText.length();
+    if (start >= len) {
+      return null;
+    }
+    
+    // Eat a newline if we're looking at one:
+    final boolean atNewline = wikiText.charAt(end) == '\n';
+    if (atNewline) {
+      justReturnedNewline = true;
+      ++end;
+      return this;
+    }
+    
+    if (justReturnedNewline) {
+      final char firstChar = wikiText.charAt(end);
+      if (firstChar == '=') {
+        final int headerStart = end;
+        while (++end < len && wikiText.charAt(end) == '=') {}
+        final int headerTitleStart = end;
+        while (++end < len && wikiText.charAt(end) != '=' && wikiText.charAt(end) != '\n') {}
+        final int headerTitleEnd = end;
+        while (++end < len && wikiText.charAt(end) == '=') {}
+        final int headerEnd = end;
+        
+        return this;
+      }
+      if (listChars.indexOf(firstChar) != -1) {
+        while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
+        end = escapedFind(start, "\n");
+        return this;
+      }
+    }
+    justReturnedNewline = false;
+
+    if (wikiText.startsWith("'''", start)) {
+      end = start + 3;
+      return this;
+    }
+    
+    if (wikiText.startsWith("''", start)) {
+      end = start + 2;
+      return this;
+    }
+
+    if (wikiText.startsWith("[[", start)) {
+      end = escapedFind(start + 2, "]]");
+      return this;
+    }
+
+    if (wikiText.startsWith("{{", start)) {
+      end = escapedFind(start + 2, "}}");
+      return this;
+    }
+
+    if (wikiText.startsWith("<pre>", start)) {
+      end = safeIndexOf(wikiText, start, "</pre>", "\n");
+      return this;
+    }
+
+    if (wikiText.startsWith("<math>", start)) {
+      end = safeIndexOf(wikiText, start, "</math>", "\n");
+      return this;
+    }
+
+    if (wikiText.startsWith("<!--", start)) {
+      end = safeIndexOf(wikiText, start, "-->", "\n");
+      return this;
+    }
+
+    if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
+      System.err.println("Close without open!");
+      end += 2;
+      return this;
+    }
+
+    
+    if (this.matcher.find(start)) {
+      end = this.matcher.start(1);
+      if (end == start) {
+        System.err.println(this.matcher.group());
+        assert false;
+      }
+      return this;
+    }
+    
+    end = wikiText.length();
+    return this;
+    
+  }
+  
+  public String token() {
+    return wikiText.substring(start, end);
+  }
+  
+  private int escapedFind(final int start, final String toFind) {
+    assert tokenStack.isEmpty();
+    
+    int end = start;
+    while (end < wikiText.length()) {
+      if (matcher.find(end)) {
+        final String matchText = matcher.group();
+        final int matchStart = matcher.start();
+        
+        if (matchText.length() == 0) {
+          assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
+          if (tokenStack.isEmpty() && toFind.equals("\n")) {
+            return matchStart;
+          }
+          ++end;
+        } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
+          // The normal return....
+          return matcher.end();
+        } else if (matchText.equals("[[") || matchText.equals("{{")) {
+          tokenStack.add(matchText);
+        } else if (matchText.equals("]]") || matchText.equals("}}")) {
+          if (tokenStack.size() > 0) {
+            final String removed = tokenStack.remove(tokenStack.size() - 1);
+            if (removed.equals("{{") && !matcher.group().equals("}}")) {
+              System.err.println("Unmatched {{ error: " + wikiText.substring(start));
+              return safeIndexOf(wikiText, start, "\n", "\n");
+            } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
+              System.err.println("Unmatched [[ error: " + wikiText.substring(start));
+              return safeIndexOf(wikiText, start, "\n", "\n");
+            }
+          } else {
+            System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n"));
+            // If we were looking for a newline
+            return safeIndexOf(wikiText, start, "\n", "\n");
+          }
+        } else if (matchText.equals("<!--")) {
+          end = wikiText.indexOf("-->");
+          if (end == -1) {
+            System.err.println("Unmatched <!-- error: " + wikiText.substring(start));
+          }
+        } else {
+          assert false : "Match text='" + matchText + "'";
+          throw new IllegalStateException();
+        }
+      } else {
+        // Hmmm, we didn't find the closing symbol we were looking for...
+        System.err.println("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+        return safeIndexOf(wikiText, start, "\n", "\n");
+      }
+      
+      // Inside the while loop.
+      end = Math.max(end, matcher.end());
+    }
+    return end;
+  }
+
+  static int safeIndexOf(final String s, final int start, final String target, final String backup) {
+    int close = s.indexOf(target, start);
+    if (close != -1) {
+      return close + target.length();
+    }
+    close = s.indexOf(backup, start);
+    if (close != -1) {
+      return close + backup.length();
+    }
+    return s.length();
+  }
+
+}
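
WikiTokenizer is a cursor-style tokenizer: each call to nextToken() advances the start/end window over the raw wikitext and token() returns the current slice (WikiTokenizerTest below shows the loop). When a construct is unterminated, safeIndexOf falls back to a backup delimiter, usually the newline, instead of swallowing the rest of the page. A self-contained sketch mirroring that helper:

class SafeIndexOfSketch {
  // Same idea as WikiTokenizer.safeIndexOf: find target after start, else the backup,
  // else the end of the string; the returned index points just past the match.
  static int safeIndexOf(final String s, final int start, final String target, final String backup) {
    int close = s.indexOf(target, start);
    if (close != -1) return close + target.length();
    close = s.indexOf(backup, start);
    if (close != -1) return close + backup.length();
    return s.length();
  }

  public static void main(String[] args) {
    final String wikiText = "<!-- unterminated comment\nnext line";
    // No "-->" before the newline, so the comment token is cut at the newline:
    System.out.println(wikiText.substring(0, safeIndexOf(wikiText, 0, "-->", "\n")));
  }
}
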
diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java
new file mode 100644 (file)
index 0000000..fd34ab9
--- /dev/null
@@ -0,0 +1,138 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+public class WikiTokenizerTest extends TestCase {
+  
+  public void testSimple() {
+    final String wikiText =
+      "Hi" + "\n" +
+      "Hello thad you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
+      "hi <!--" + "\n" +
+      "multi-line" + "\n" +
+      "# comment -->" + "\n" +
+      "" + "\n" +
+      "asdf\n" +
+      "{{template_not_in_list}}" + "\n" +
+      "# {{template_in_list}}" + "\n" +
+      "[[wikitext]]:[[wikitext]]" + "\n" +  // don't want this to trigger a list
+      ": but this is a list!" + "\n" +
+      "*:* and so is this :::" + "\n" +
+      "here's [[some blah|some]] wikitext." + "\n" +
+      "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
+      "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
+      "== Header 2 ==" + "\n" +
+      "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
+      "{{mismatched]]" + "\n" +
+      "[[mismatched}}" + "\n" +
+      "{extraterminated}}" + "\n" +
+      "[extraterminated]]" + "\n" +
+      "=== {{header-template}} ===" + "\n";
+    
+    final String[] expectedTokens = new String[] {
+        "Hi",
+        "\n",
+        "Hello thad you're ",
+        "<!-- not -->",
+        " ",
+        "'''",
+        "pretty",
+        "'''",
+        " cool ",
+        "'''",
+        "''",
+        "over",
+        "'''",
+        "''",
+        " there.",
+        "\n",
+        "hi ",
+        "<!--\nmulti-line\n# comment -->",
+        "\n",
+        "\n",
+        "asdf",
+        "\n",
+        "{{template_not_in_list}}",
+        "\n",
+        "# {{template_in_list}}",
+        "\n",
+        "[[wikitext]]",
+        ":",
+        "[[wikitext]]",
+        "\n",
+        ": but this is a list!",
+        "\n",
+        "*:* and so is this :::",
+        "\n",
+        "here's ",
+        "[[some blah|some]]",
+        " wikitext.",
+        "\n",
+        "here's a ",
+        "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}",
+        " and some more text.",
+        "\n",
+        "== Header 2 ==",
+        "\n",
+        "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
+        "\n",
+        "{{mismatched]]\n",
+        "[[mismatched}}\n",
+        "{extraterminated",
+        "}}",
+        "\n",
+        "[extraterminated",
+        "]]",
+        "\n",
+        "=== {{header-template}} ===",
+        "\n",
+        };
+    
+    final List<String> actualTokens = new ArrayList<String>();
+    
+    final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
+    WikiTokenizer token;
+    int i = 0;
+    while ((token = wikiTokenizer.nextToken()) != null) {
+      actualTokens.add(token.token());
+      System.out.println("\"" + token.token().replace("\n", "\\n") + "\",");
+      assertEquals(expectedTokens[i++], token.token());
+    }
+    assertEquals(Arrays.asList(expectedTokens), actualTokens);
+  }
+  
+  public void testWikiHeading() {
+    assertNull(WikiHeading.getHeading(""));
+    assertNull(WikiHeading.getHeading("="));
+    assertNull(WikiHeading.getHeading("=="));
+    assertNull(WikiHeading.getHeading("=a"));
+    assertNull(WikiHeading.getHeading("=a=="));
+    assertNull(WikiHeading.getHeading("===a=="));
+    assertNull(WikiHeading.getHeading("===a===="));
+    assertNull(WikiHeading.getHeading("a="));
+    assertEquals("a", WikiHeading.getHeading("=a=").name);
+    assertEquals(1, WikiHeading.getHeading("=a=").depth);
+    assertEquals("aa", WikiHeading.getHeading("==aa==").name);
+    assertEquals(2, WikiHeading.getHeading("==aa==").depth);
+  }
+
+  
+  public void testWikiFunction() {
+    assertNull(WikiFunction.getFunction(""));
+    assertNull(WikiFunction.getFunction("[[asdf]]"));
+    assertNull(WikiFunction.getFunction("asd [[asdf]]asdf "));
+    assertEquals("a", WikiFunction.getFunction("{{a}}").name);
+    assertEquals("a", WikiFunction.getFunction("{{a|b}}").name);
+    assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name);
+    assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name);
+    assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name);
+    assertEquals(Arrays.asList("b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args);
+    assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b"));
+    assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer"));
+  }
+
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiWord.java.old b/src/com/hughes/android/dictionary/parser/WikiWord.java.old
new file mode 100644 (file)
index 0000000..96f3321
--- /dev/null
@@ -0,0 +1,339 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.IndexedEntry;
+import com.hughes.android.dictionary.engine.EntryTypeName;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.PairEntry;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
+import com.hughes.util.ListUtil;
+
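+/**
+ * Intermediate representation of one Wiktionary word in one language: its
+ * title, per-accent pronunciations, and parts of speech, which
+ * wikiWordToQuickDic() turns into QuickDic pair entries.
+ */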
+public class WikiWord {
+  final int depth;
+  
+  final String title;
+  String language;
+
+  int index;
+  
+  final Map<String, StringBuilder> accentToPronunciation = new LinkedHashMap<String, StringBuilder>();
+  StringBuilder currentPronunciation = null;
+
+  final List<PartOfSpeech> partsOfSpeech = new ArrayList<WikiWord.PartOfSpeech>();
+  
+  public WikiWord(final String title, int depth) {
+    this.title = title.intern();
+    this.depth = depth;
+  }
+
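+  // One part-of-speech section: its meanings, translation senses, and form-of links.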
+  static class PartOfSpeech {
+    final int depth;
+    final String name;
+
+    final List<Meaning> meanings = new ArrayList<WikiWord.Meaning>();
+    
+    final List<TranslationSense> translationSenses = new ArrayList<WikiWord.TranslationSense>();
+    
+    final List<FormOf> formOfs = new ArrayList<WikiWord.FormOf>();
+    
+    public PartOfSpeech(final int depth, String name) {
+      this.depth = depth;
+      this.name = name.intern();
+    }
+
+    public Meaning newMeaning() {
+      final Meaning meaning = new Meaning();
+      meanings.add(meaning);
+      return meaning;
+    }
+
+    public Meaning lastMeaning() {
+      return meanings.isEmpty() ? newMeaning() : ListUtil.getLast(meanings);
+    }
+  }
+  
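+  // A sense line from a Translations section: the gloss plus a translation list
+  // for each of the two index sides.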
+  static class TranslationSense {
+    String sense;
+    List<List<Translation>> translations = new ArrayList<List<Translation>>();
+    {
+      translations.add(new ArrayList<Translation>());
+      translations.add(new ArrayList<Translation>());
+    }
+  }
+  
+  static class Translation {
+    String language;
+    String text;
+    
+    public Translation(final String language, final String text) {
+      this.language = language;
+      this.text = text;
+    }
+
+    @Override
+    public String toString() {
+      return language + ": " + text;
+    }
+  }
+  
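+  // A reference from this word to the entry it is a grammatical form of.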
+  static class FormOf {
+    final String grammarForm;
+    final String target;
+    
+    public FormOf(final String grammarForm, final String token) {
+      this.grammarForm = grammarForm;
+      this.target = token;
+    }
+  }
+  
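+  // A single definition line plus any example sentences attached to it.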
+  static class Meaning {
+    String meaning;
+    final List<Example> examples = new ArrayList<WikiWord.Example>();
+    
+    public Example newExample() {
+      final Example example = new Example();
+      this.examples.add(example);
+      return example;
+    }
+
+    public Example lastExample() {
+      return examples.isEmpty() ? newExample() : ListUtil.getLast(examples);
+    }
+  }
+  
+  static class Example {
+    String source;
+    final StringBuilder example = new StringBuilder();
+    final StringBuilder exampleInEnglish = new StringBuilder();
+  }
+  
+  // -------------------------------------------------------------------------
+  
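+  /**
+   * Emits this word into the dictionary builder: one block of pair entries per
+   * part of speech, plus a pronunciation entry filed under the title tokens
+   * when the word belongs to one of the indices.
+   */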
+  void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) {
+    //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation);
+     if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) {
+       System.err.println("Word with no POS: " + title);
+     }
+     for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) {
+       partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
+     }  // PartOfSpeech
+
+     // Pronunciation.
+     if (index != -1) {
+       final PairEntry pronEntry = new PairEntry();
+       for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
+         String accent = accentToPron.getKey();
+         if (accent.length() > 0) {
+           accent = accent + ": ";
+         }
+         pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0));
+       }
+       if (pronEntry.pairs.size() > 0) {
+         final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pronEntry);
+         dictBuilder.dictionary.pairEntries.add(pronEntry);
+         final Set<String> tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
+         dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION);
+       }
+     }
+  }
+
+
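+  // Matches a template prefix ("{name,") so template names can be stripped from
+  // meanings before they are tokenized for the English index.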
+  static final Pattern templateName = Pattern.compile("\\{[^,]*,");
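+  // Emits one pair entry per form-of link and one per meaning (with its
+  // examples), filing each under the relevant title, meaning, and example tokens.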
+  private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder,
+      final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
+    //System.out.println("  pos: " + partOfSpeech.name);
+         
+     for (final WikiWord.Meaning meaning : partOfSpeech.meanings) {
+       //System.out.println("    meaning: " + meaning.meaning);
+       for (final WikiWord.Example example : meaning.examples) {
+         if (example.example.length() > 0) {
+           //System.out.println("      example: " + example.example);
+         }
+         if (example.exampleInEnglish.length() > 0) {
+           //System.out.println("      exampleInEnglish: " + example.exampleInEnglish);
+         }
+       }
+     }
+     
+     if (index != -1) {
+       final boolean formOfSwap = index != 0;
+       for (final FormOf formOf : partOfSpeech.formOfs) {
+         final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap);
+         final PairEntry pairEntry = new PairEntry();
+         pairEntry.pairs.add(pair);
+         final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+         dictBuilder.dictionary.pairEntries.add(pairEntry);
+  
+         // File under the tokens of the form-of target.
+         final Set<String> tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR);
+         dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF);
+       }
+     }
+
+     
+     if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) {
+       final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")";
+       final boolean swap = enIndexBuilder == 1;
+     
+       // Meanings.
+       for (final Meaning meaning : partOfSpeech.meanings) {
+         final PairEntry pairEntry = new PairEntry();
+         final List<Pair> pairs = pairEntry.pairs;
+
+         final List<Set<String>> exampleTokens = new ArrayList<Set<String>>();
+         exampleTokens.add(new LinkedHashSet<String>());
+         exampleTokens.add(new LinkedHashSet<String>());
+         
+         if (meaning.meaning != null && meaning.meaning.length() > 0) {
+           final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap);
+           pairs.add(meaningPair);
+         } else {
+           System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name);
+         }
+           
+         // Examples
+         for (final Example example : meaning.examples) {
+           final int dashIndex = example.example.indexOf("—");
+           if (example.exampleInEnglish.length() == 0 && dashIndex != -1) {
+             System.out.println("Splitting example: title=" + title + ", "+ example.example);
+             example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim());
+             example.example.delete(dashIndex, example.example.length());
+           }
+           
+           if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) {
+             final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap);
+             pairs.add(pair);
+             
+             for (int i = 0; i < 2; ++i) {
+               exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR));
+             }
+           }
+         }
+
+         // Create EntryData with the PairEntry.
+         final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+         dictBuilder.dictionary.pairEntries.add(pairEntry);
+
+         // File under title token.
+         final Set<String> titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
+         dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD);
+       
+         // File under the meaning tokens (English):
+         if (meaning.meaning != null) {
+           // If the meaning contains any templates, strip out the template name
+           // so we don't index it.
+           final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll("");
+           final Set<String> meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR);
+           dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD);
+         }
+         
+         // File under other tokens that we saw.
+         for (int i = 0; i < 2; ++i) {
+           dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS);
+         }
+       
+         
+       }  // Meanings.
+       
+     }
+     
+     translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
+  }
+
+
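+  // For each translation sense, builds one pair entry with the English gloss on
+  // one side and the listed translations (language-prefixed when a side has more
+  // than one) on the other, indexed under title and translation tokens.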
+  private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder,
+      final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
+    if (!partOfSpeech.translationSenses.isEmpty()) {
+       if (!language.equals("English")) {
+         System.err.println("Translation sections not in English.");
+       }
+       
+       final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)";
+       
+       for (final TranslationSense translationSense : partOfSpeech.translationSenses) {
+         //System.out.println("    sense: " + translationSense.sense);
+         if (translationSense.sense == null) {
+           //System.err.println("    null sense: " + title);
+         }
+         String englishSense = String.format(englishBase, translationSense.sense != null ? (": " + translationSense.sense) : "");
+         
+         final StringBuilder[] sideBuilders = new StringBuilder[2];
+         final List<Map<EntryTypeName, List<String>>> sideTokens = new ArrayList<Map<EntryTypeName,List<String>>>();
+         for (int i = 0; i < 2; ++i) {
+           sideBuilders[i] = new StringBuilder();
+           sideTokens.add(new LinkedHashMap<EntryTypeName, List<String>>());
+         }
+         
+         if (enIndexBuilder != -1) {
+           sideBuilders[enIndexBuilder].append(englishSense);
+           addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD);
+         }
+         
+         // Get the entries from the translation section.
+         for (int i = 0; i < 2; ++i) {
+           //System.out.println("      lang: " + i);
+           for (final Translation translation : translationSense.translations.get(i)) {
+             //System.out.println("        translation: " + translation);
+             sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : "");
+             if (translationSense.translations.get(i).size() > 1) {
+               sideBuilders[i].append(translation.language).append(": ");
+             }
+             sideBuilders[i].append(translation.text);
+             
+             // TODO: Don't index {m}, {f}
+             // TODO: Don't even show: (1), (1-2), etc.
+             addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD);
+           }
+         }
+
+         // Construct the Translations-based QuickDic entry for this TranslationSense.
+         if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) {
+           final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString());
+           final PairEntry pairEntry = new PairEntry();
+           pairEntry.pairs.add(pair);
+           final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+           dictBuilder.dictionary.pairEntries.add(pairEntry);
+           
+           // Add the EntryData to the indices under the correct tokens.
+           for (int i = 0; i < 2; ++i) {
+             final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i);
+             for (final Map.Entry<EntryTypeName, List<String>> entry : sideTokens.get(i).entrySet()) {
+               for (final String token : entry.getValue()) {
+                 final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, entry.getKey());
+                 entries.add(entryData);
+               }
+             }
+
+           }             
+           
+         }
+       }  // Senses
+     }  // Translations
+  }
+
+  
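+  /**
+   * Tokenizes text and records the tokens under entryTypeName, promoting
+   * WIKTIONARY_TITLE_ONE_WORD to WIKTIONARY_TITLE_MULTI_WORD when the text
+   * yields more than one token.
+   */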
+  static void addTokens(final String text, final Map<EntryTypeName, List<String>> map,
+      EntryTypeName entryTypeName) {
+    final Set<String> tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR);
+    if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) {
+      entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD;
+    }
+    List<String> tokenList = map.get(entryTypeName);
+    if (tokenList == null) {
+      tokenList = new ArrayList<String>();
+      map.put(entryTypeName, tokenList);
+    }
+    tokenList.addAll(tokens);
+  }
+
+
+
+}