gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
go
author Thad Hughes <thad.hughes@gmail.com>
Fri, 5 Nov 2010 19:27:18 +0000 (12:27 -0700)
committer Thad Hughes <thad.hughes@gmail.com>
Tue, 13 Dec 2011 18:39:03 +0000 (10:39 -0800)
14 files changed:
src/com/hughes/android/dictionary/WiktionaryXmlParser.java [deleted file]
src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
src/com/hughes/android/dictionary/engine/DictionaryBuilder_DE.java [new file with mode: 0644]
src/com/hughes/android/dictionary/engine/DictionaryTest.java
src/com/hughes/android/dictionary/engine/EntryData.java
src/com/hughes/android/dictionary/engine/IndexBuilder.java
src/com/hughes/android/dictionary/engine/NormalizeComparator.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/DictFileParser.java [moved from src/com/hughes/android/dictionary/engine/DictFileParser.java with 94% similarity]
src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiCallback.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiParser.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiParserTest.java [new file with mode: 0644]
src/com/hughes/android/dictionary/parser/WikiWord.java [new file with mode: 0644]

diff --git a/src/com/hughes/android/dictionary/WiktionaryXmlParser.java b/src/com/hughes/android/dictionary/WiktionaryXmlParser.java
deleted file mode 100644 (file)
index 31d8c92..0000000
+++ /dev/null
@@ -1,231 +0,0 @@
-package com.hughes.android.dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.regex.Pattern;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-import com.hughes.android.dictionary.engine.Dictionary;
-import com.hughes.util.MapUtil;
-import com.hughes.util.StringUtil;
-
-public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler {
-
-  final Dictionary dict;
-
-  StringBuilder titleBuilder;
-  StringBuilder textBuilder;
-  StringBuilder currentBuilder = null;
-
-  public WiktionaryXmlParser(final Dictionary dict) {
-    this.dict = dict;
-  }
-
-  @Override
-  public void startElement(String uri, String localName, String qName,
-      Attributes attributes) {
-    currentBuilder = null;
-    if ("page".equals(qName)) {
-      titleBuilder = new StringBuilder();
-      textBuilder = new StringBuilder();
-    } else if ("title".equals(qName)) {
-      currentBuilder = titleBuilder;
-    } else if ("text".equals(qName)) {
-      currentBuilder = textBuilder;
-    }
-  }
-
-  @Override
-  public void characters(char[] ch, int start, int length) throws SAXException {
-    if (currentBuilder != null) {
-      currentBuilder.append(ch, start, length);
-    }
-  }
-
-  @Override
-  public void endElement(String uri, String localName, String qName)
-      throws SAXException {
-    currentBuilder = null;
-    if ("page".equals(qName)) {
-      endPage();
-    }
-  }
-
-  private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL);
-
-  // MULTILINE for ^
-  private static final Pattern SECTION_HEADER = Pattern
-      .compile("=== *\\{\\{Wortart\\|");
-
-  private static final Pattern WORTART_DELIM = Pattern.compile("===",
-      Pattern.LITERAL);
-  private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}");
-
-  private static final Pattern WIKI_QUOTE = Pattern.compile("''",
-      Pattern.LITERAL);
-  private static final Pattern WIKI_DOUBLE_BRACE = Pattern
-      .compile("\\{\\{([^}]+)\\}\\}");
-  private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
-      .compile("\\[\\[([^\\]]+)\\]\\]");
-  private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE);
-
-  enum Field {
-    Wortart("Wortart", null),
-
-    Aussprache("Aussprache", null),
-
-    Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")),
-
-    Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")),
-
-    Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")),
-
-    Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")),
-
-    Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")),
-
-    Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")),
-
-    Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")),
-
-    Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")),
-
-    CharakteristischeWortkombinationen("Charakteristische Wortkombinationen",
-        Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")),
-
-    AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern
-        .compile("\\{\\{Abgeleitete Begriffe\\}\\}")),
-
-    Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")),
-    
-    Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")),
-    
-    ;
-
-    final String name;
-    final Pattern listPattern;
-
-    Field(final String name, final Pattern listPattern) {
-      this.name = name;
-      this.listPattern = listPattern;
-    }
-  }
-
-  private static final Pattern WORTART = Pattern
-      .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}");
-  private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:",
-      Pattern.LITERAL);
-
-  private final Map<String, AtomicInteger> errorCounts = new TreeMap<String, AtomicInteger>();
-
-  private void endPage() {
-
-    StringBuilder text = textBuilder;
-    text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\""));
-    text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1"));
-
-    // Remove comments.
-    StringUtil.removeAll(text, Pattern.compile("<!--", Pattern.LITERAL),
-        Pattern.compile("-->", Pattern.LITERAL));
-
-    String sectionString;
-    while ((sectionString = StringUtil.remove(text, SECTION_HEADER,
-        SECTION_HEADER, false)) != null) {
-      final StringBuilder section = new StringBuilder(sectionString);
-
-      String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM,
-          true);
-      if (wortart.contains("\n") || !wortart.contains("eutsch")) {
-        MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart,
-            AtomicInteger.class).incrementAndGet();
-        continue;
-      }
-
-      final LinkedHashMap<Field, List<String>> fieldToValue = new LinkedHashMap<Field, List<String>>();
-
-      wortart = wortart.replaceAll("===", "");
-      wortart = WORTART.matcher(wortart).replaceAll("$1");
-      wortart = GENDER.matcher(wortart).replaceAll("{$1}");
-      wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1");
-      wortart = wortart.replaceAll("Wortart\\|", "");
-      wortart = wortart.trim();
-      fieldToValue.put(Field.Wortart, Collections.singletonList(wortart));
-
-      String aussprache = StringUtil
-          .remove(section, AUSSPRACHE, NEWLINE, false);
-      if (aussprache != null) {
-        aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
-        aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
-        aussprache = aussprache.replaceAll("Lautschrift\\|ˈ?", "");
-        aussprache = aussprache.trim();
-        fieldToValue.put(Field.Aussprache, Collections
-            .singletonList(aussprache));
-      }
-
-      for (final Field field : Field.values()) {
-        if (field.listPattern != null) {
-          fieldToValue.put(field, extractList(section, field.listPattern));
-        }
-      }
-
-      System.out.println(titleBuilder);
-      for (final Field field : Field.values()) {
-        if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) {
-          fieldToValue.remove(field);
-        } else {
-          if (field.name != null) {
-//            System.out.println(field.name);
-//            for (final String line : fieldToValue.get(field)) {
-//              System.out.println("  " + line);
-//            }
-          }
-        }
-      }
-//      System.out.println("WHAT'S LEFT:");
-//      System.out.println(section);
-//      System.out.println("------------------------------------------------");
-
-    }
-
-  }
-
-  private List<String> extractList(final StringBuilder section,
-      final Pattern start) {
-    final List<String> result = new ArrayList<String>();
-    final String linesString = StringUtil.remove(section, start,
-        WIKI_NEW_SECTION, false);
-    if (linesString != null) {
-      String[] lines = linesString.split("\n");
-      for (int i = 1; i < lines.length; ++i) {
-        String bedeutung = lines[i];
-        bedeutung = bedeutung.replaceFirst("^:+", "");
-        bedeutung = bedeutung.trim();
-        if (bedeutung.length() > 0) {
-          result.add(bedeutung);
-        }
-      }
-    }
-    return result;
-  }
-
-  void parse(final File file) throws ParserConfigurationException,
-      SAXException, IOException {
-    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
-    parser.parse(file, this);
-    System.out.println(errorCounts);
-  }
-
-}
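diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java
index 6bb1115e29e648ec0acea9a1b47028574bf997ca..7ea0d91647a0c0bfa41e70eee792d19aba25445e 100644 (file)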
@@ -8,7 +8,14 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Pattern;
 
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+import com.hughes.android.dictionary.parser.DictFileParser;
+import com.hughes.android.dictionary.parser.EnWiktionaryXmlParser;
 import com.hughes.util.Args;
 import com.hughes.util.FileUtil;
 
@@ -37,14 +44,13 @@ import com.hughes.util.FileUtil;
 
 public class DictionaryBuilder {
   
-  final Dictionary dictionary;
-  
-  final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
+  public final Dictionary dictionary;
+  public final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
   
-  public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) {
+  public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2) {
     dictionary = new Dictionary(dictInfo);
-    indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, false));
-    indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, true));
+    indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, false));
+    indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, true));
   }
   
   void build() {
@@ -54,7 +60,7 @@ public class DictionaryBuilder {
     }
   }
   
-  public static void main(final String[] args) throws IOException {
+  public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
     final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
     
     final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
@@ -63,6 +69,15 @@ public class DictionaryBuilder {
       fatalError("--lang1= and --lang2= must both be specified.");
     }
     
+    String normalizerRules1 = keyValueArgs.remove("normalizerRules1");
+    String normalizerRules2 = keyValueArgs.remove("normalizerRules2");
+    if (normalizerRules1 == null) {
+      normalizerRules1 = lang1.getDefaultNormalizerRules();
+    }
+    if (normalizerRules2 == null) {
+      normalizerRules2 = lang2.getDefaultNormalizerRules();
+    }
+    
     final String dictOutFilename = keyValueArgs.remove("dictOut");
     if (dictOutFilename == null) {
       fatalError("--dictOut= must be specified.");
@@ -80,10 +95,12 @@ public class DictionaryBuilder {
     
     System.out.println("lang1=" + lang1);
     System.out.println("lang2=" + lang2);
+    System.out.println("normalizerRules1=" + normalizerRules1);
+    System.out.println("normalizerRules2=" + normalizerRules2);
     System.out.println("dictInfo=" + dictInfo);
     System.out.println("dictOut=" + dictOutFilename);    
     
-    final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2);
+    final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2);
     
     for (int i = 0; i < 100; ++i) {
       final String prefix = "input" + i;
@@ -105,9 +122,15 @@ public class DictionaryBuilder {
           new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
         } else if ("chemnitz".equals(inputFormat)) {
           new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
-        } else if ("wiktionary".equals(inputFormat)) {
-          throw new RuntimeException();
-//          new WiktionaryXmlParser(dict).parse(file);
+        } else if ("enwiktionary".equals(inputFormat)) {
+          final Pattern[] translationPatterns = new Pattern[2];
+          translationPatterns[0] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern1"));
+          translationPatterns[1] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern2"));
+          final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1;
+          if (enIndex < 0 || enIndex >= 2) {
+            fatalError("Must be 1 or 2: " + prefix + "EnIndex");
+          }
+          new EnWiktionaryXmlParser(dictionaryBuilder, translationPatterns, enIndex).parse(file);
         } else {
           fatalError("Invalid or missing input format: " + inputFormat);
         }
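diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
index e68bf5e29aceccb23427e2c7260ff3affacd46fe..7c8d2324ec15e664fb27ffd4953327fa105c57fc 100644 (file)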
@@ -11,8 +11,8 @@ import junit.framework.TestCase;
 
 public class DictionaryBuilderTest extends TestCase {
   
-  public void testGermanCombined() throws IOException {
-    final File result = new File("testdata/de_en.dict");
+  public void testGermanCombined() throws Exception {
+    final File result = new File("testdata/de-en.quickdic");
     System.out.println("Writing to: " + result);
     DictionaryBuilder.main(new String[] {
         "--dictOut=" + result.getAbsolutePath(),
@@ -20,30 +20,37 @@ public class DictionaryBuilderTest extends TestCase {
         "--lang2=EN",
         "--dictInfo=@testdata/de-en_dictInfo.txt",
 
-        "--input1=testdata/de-en_chemnitz_100",
-        "--input1Name=dictcc",
-        "--input1Charset=UTF8",
-        "--input1Format=chemnitz",
+//        "--input1=testdata/de-en_chemnitz_100",
+//        "--input1Name=dictcc",
+//        "--input1Charset=UTF8",
+//        "--input1Format=chemnitz",
+//
+//        "--input2=testdata/de-en_dictcc_100",
+//        "--input2Name=dictcc",
+//        "--input2Charset=UTF8",
+//        "--input2Format=dictcc",
+
+        "--input3=testdata/enwiktionary_small.xml",
+        "--input3Name=enwiktionary",
+        "--input3Format=enwiktionary",
+        "--input3TranslationPattern1=[Gg]erman",
+        "--input3TranslationPattern2=[Ee]glish",
+        "--input3EnIndex=2",
 
-        "--input2=testdata/de-en_dictcc_100",
-        "--input2Name=dictcc",
-        "--input2Charset=UTF8",
-        "--input2Format=dictcc",
-        
         "--print=testdata/de-en.test",
     });
     
     // Check it once:
-    assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test"); 
+    assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test"); 
     
     
     // Check it again.
     final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r"));
-    final PrintStream out = new PrintStream(new File("testdata/de_en.test"));
+    final PrintStream out = new PrintStream(new File("testdata/de-en.test"));
     dict.print(out);
     out.close();
     
-    assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+    assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test");
   }
   
 
diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder_DE.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder_DE.java
new file mode 100644 (file)
index 0000000..878a24f
--- /dev/null
@@ -0,0 +1,40 @@
+package com.hughes.android.dictionary.engine;
+
+import junit.framework.TestCase;
+
+public class DictionaryBuilder_DE extends TestCase {
+  
+  public static void main(final String[] args) throws Exception {
+    
+    DictionaryBuilder.main(new String[] {
+        "--dictOut=dictOutputs/de-en_chemnitz.quickdic",
+        "--lang1=DE",
+        "--lang2=EN",
+        "--dictInfo=@dictInputs/de-en_chemnitz.info",
+
+        "--input1=dictInputs/de-en_chemnitz.txt",
+        "--input1Name=dictcc",
+        "--input1Charset=UTF8",
+        "--input1Format=chemnitz",
+    });
+
+    DictionaryBuilder.main(new String[] {
+        "--dictOut=dictOutputs/de-en_all.quickdic",
+        "--lang1=DE",
+        "--lang2=EN",
+        "--dictInfo=@dictInputs/de-en_all.info",
+
+        "--input1=dictInputs/de-en_chemnitz.txt",
+        "--input1Name=dictcc",
+        "--input1Charset=UTF8",
+        "--input1Format=chemnitz",
+
+        "--input2=dictInputs/de-en_dictcc.txt",
+        "--input2Name=dictcc",
+        "--input2Charset=UTF8",
+        "--input2Format=dictcc",
+    });
+
+  }
+  
+}
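diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java
index 4b45348ae8e6ae342f3338c675548dbf2450148c..388e71d5a069ec066b4ea0c9e5b50c8321e76dc2 100644 (file)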
@@ -10,13 +10,14 @@ import java.util.concurrent.atomic.AtomicBoolean;
 
 import junit.framework.TestCase;
 
-import com.hughes.android.dictionary.engine.Index.SearchResult;
+import com.hughes.android.dictionary.engine.Index.IndexEntry;
+import com.ibm.icu.text.Transliterator;
 
 
 public class DictionaryTest extends TestCase {
     
   public void testGermanMetadata() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r");
     final Dictionary dict = new Dictionary(raf);
     final Index deIndex = dict.indices.get(0);
     
@@ -33,55 +34,51 @@ public class DictionaryTest extends TestCase {
     
     for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) {
       System.out.println("testing: " + indexEntry.token);
-      final Index.SearchResult searchResult = deIndex.findLongestSubstring(indexEntry.token, new AtomicBoolean(
+      final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean(
           false));
-      assertEquals(indexEntry.token.toLowerCase(), searchResult.insertionPoint.token.toLowerCase());
-      assertEquals(indexEntry.token.toLowerCase(), searchResult.longestPrefix.token.toLowerCase());
+      assertEquals(indexEntry.token.toLowerCase(), searchResult.token.toLowerCase());
     }
 
     // TODO: maybe if user types capitalization, use it.
-    assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aaac", new AtomicBoolean(false)));
-    assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("AAAC", new AtomicBoolean(false)));
-    assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("AAAc", new AtomicBoolean(false)));
-    assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aAac", new AtomicBoolean(false)));
+    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false)));
+    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false)));
+    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false)));
+    assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false)));
 
     // Before the beginning.
-    assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("", new AtomicBoolean(false)));
-    assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("__", new AtomicBoolean(false)));
+    assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false)));
+    assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false)));
     
     // After the end.
-    assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findLongestSubstring("ZZZZZ", new AtomicBoolean(false)));
-
-    assertSearchResult("ab", "aaac", deIndex.findLongestSubstring("aaaca", new AtomicBoolean(false)));
-    assertSearchResult("machen", "machen", deIndex.findLongestSubstring("m", new AtomicBoolean(false)));
+    assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false)));
 
-    assertFalse(deIndex.findLongestSubstring("macdddd", new AtomicBoolean(false)).success);
+    assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false)));
+    assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false)));
+    assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false)));
 
 
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberprüfe", new AtomicBoolean(false)));
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpruefe", new AtomicBoolean(false)));
+    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false)));
+    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false)));
 
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpBLEH", new AtomicBoolean(false)));
-    assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("überprBLEH", new AtomicBoolean(false)));
+    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false)));
+    assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false)));
 
-    assertSearchResult("überprüfen", "überprüfe", deIndex.findLongestSubstring("überprüfeBLEH", new AtomicBoolean(false)));
+    assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new AtomicBoolean(false)));
 
     // Check that search in lowercase works.
-    assertSearchResult("Alibi", "Alibi", deIndex.findLongestSubstring("alib", new AtomicBoolean(false)));
-    assertTrue(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).success);
-    System.out.println(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).toString());
+    assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false)));
+    System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString());
     
     raf.close();
   }
   
   private void assertSearchResult(final String insertionPoint, final String longestPrefix,
-      final SearchResult actual) {
-    assertEquals(insertionPoint, actual.insertionPoint.token);
-    assertEquals(longestPrefix, actual.longestPrefix.token);
+      final IndexEntry actual) {
+    assertEquals(insertionPoint, actual.token);
   }
 
   public void testGermanTokenRows() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r");
     final Dictionary dict = new Dictionary(raf);
     final Index deIndex = dict.indices.get(0);
     
@@ -112,7 +109,8 @@ public class DictionaryTest extends TestCase {
   }
   
   public void testGermanSort() {
-    assertEquals("aüÄÄ", Language.de.textNorm("aueAeAE", false));
+    final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
+    assertEquals("aüääss", normalizer.transform("aueAeAEß"));
     final List<String> words = Arrays.asList(
         "er-ben",
         "erben",
@@ -129,32 +127,34 @@ public class DictionaryTest extends TestCase {
         "Großformats",
         "Großpoo",
         "Großpoos",
+        "Hörvermögen",
         "Hörweite",
         "hos",
         "Höschen",
         "Hostel",
         "hulle",
         "Hulle",
-        "hülle",
         "huelle",
-        "Hülle",
         "Huelle",
+        "hülle",
+        "Hülle",
+        "Huellen",
+        "Hüllen",
         "Hum"
         );
-    assertEquals(0, Language.de.sortComparator.compare("hülle", "huelle"));
-    assertEquals(0, Language.de.sortComparator.compare("huelle", "hülle"));
+    final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator());
+    assertEquals(1, comparator.compare("hülle", "huelle"));
+    assertEquals(-1, comparator.compare("huelle", "hülle"));
     
-    assertEquals(-1, Language.de.sortComparator.compare("hülle", "Hülle"));
-    assertEquals(0, Language.de.findComparator.compare("hülle", "Hülle"));
-    assertEquals(-1, Language.de.findComparator.compare("hulle", "Hülle"));
+    assertEquals(-1, comparator.compare("hülle", "Hülle"));
+    
+    assertEquals("hülle", normalizer.transform("Hülle"));
+    assertEquals("hulle", normalizer.transform("Hulle"));
 
     
-    for (final String s : words) {
-      System.out.println(s + "\t" + Language.de.textNorm(s, false));
-    }
     final List<String> sorted = new ArrayList<String>(words);
 //    Collections.shuffle(shuffled, new Random(0));
-    Collections.sort(sorted, Language.de.sortComparator);
+    Collections.sort(sorted, comparator);
     System.out.println(sorted.toString());
     for (int i = 0; i < words.size(); ++i) {
       System.out.println(words.get(i) + "\t" + sorted.get(i));
@@ -162,8 +162,8 @@ public class DictionaryTest extends TestCase {
     }
   }
 
-  @SuppressWarnings("unchecked")
   public void testEnglishSort() {
+    final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD);
 
     final List<String> words = Arrays.asList(
         "pre-print", 
@@ -172,16 +172,17 @@ public class DictionaryTest extends TestCase {
         "preprocess");
     
     final List<String> sorted = new ArrayList<String>(words);
-    Collections.sort(sorted, Language.en.getSortCollator());
+    final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator());
+    Collections.sort(sorted, comparator);
     for (int i = 0; i < words.size(); ++i) {
       if (i > 0) {
-        assertTrue(Language.en.getSortCollator().compare(words.get(i-1), words.get(i)) < 0);
+        assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0);
       }
       System.out.println(words.get(i) + "\t" + sorted.get(i));
       assertEquals(words.get(i), sorted.get(i));
     }
     
-    assertTrue(Language.en.getSortCollator().compare("pre-print", "preppy") < 0);
+    assertTrue(comparator.compare("pre-print", "preppy") < 0);
 
   }
   
@@ -192,17 +193,24 @@ public class DictionaryTest extends TestCase {
   }
 
   public void testTextNorm() {
-    assertEquals("hoschen", "Höschen".toLowerCase(Language.de.locale));
+    //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD);
+    final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD);
+    assertEquals("hoschen", transliterator.transliterate("Höschen"));
+    assertEquals("hoschen", transliterator.transliterate("Hoeschen"));
+    assertEquals("grosspoo", transliterator.transliterate("Großpoo"));
+
+    assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
+    assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
+    assertEquals("biologiceskom", transliterator.transliterate("биологическом"));
   }
 
   public void testChemnitz() throws IOException {
-    final RandomAccessFile raf = new RandomAccessFile("testdata/de-en_chemnitz.dict", "r");
+    final RandomAccessFile raf = new RandomAccessFile("dictOutputs/de-en_chemnitz.quickdic", "r");
     final Dictionary dict = new Dictionary(raf);
     final Index deIndex = dict.indices.get(0);
     
-    //assertSearchResult("Höschen", "Hos", deIndex.findLongestSubstring("Hos", new AtomicBoolean(false)));
-    //assertSearchResult("Höschen", "hos", deIndex.findLongestSubstring("hos", new AtomicBoolean(false)));
+    assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false)));
+    assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false)));
 
     raf.close();
   }
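diff --git a/src/com/hughes/android/dictionary/engine/EntryData.java b/src/com/hughes/android/dictionary/engine/EntryData.java
index 7f6b9b5fa2227e07589c5f2c181aaa9c2be5586b..19521f22a4ee76f19daa9b7ce4aa7f11637291c7 100644 (file)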
@@ -5,8 +5,8 @@ package com.hughes.android.dictionary.engine;
 
 import com.hughes.util.IndexedObject;
 
-class EntryData extends IndexedObject {
-  EntryData(final int index, final Entry entry) {
+public class EntryData extends IndexedObject {
+  public EntryData(final int index, final Entry entry) {
     super(index);
     this.entry = entry;
   }
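diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java
index 0e25e3388b6dbbed2070dd7f43325188a92e75f7..0d6a3d938f87d8293756a0b1b9966e1459da85c9 100644 (file)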
@@ -17,15 +17,14 @@ import com.hughes.android.dictionary.engine.Index.IndexEntry;
 public class IndexBuilder {
   
   final DictionaryBuilder dictionaryBuilder;
-  final Index index;
+  public final Index index;
 
   final SortedMap<String, TokenData> tokenToData;
 
-  @SuppressWarnings("unchecked")
-  IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final boolean swapPairEntries) {
+  IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) {
     this.dictionaryBuilder = dictionaryBuilder;
-    index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, swapPairEntries);
-    tokenToData = new TreeMap<String, TokenData>(language.getSortCollator());
+    index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries);
+    tokenToData = new TreeMap<String, TokenData>(new NormalizeComparator(index.normalizer, language.collator));
   }
   
   public void build() {
diff --git a/src/com/hughes/android/dictionary/engine/NormalizeComparator.java b/src/com/hughes/android/dictionary/engine/NormalizeComparator.java
new file mode 100644 (file)
index 0000000..d25c5a4
--- /dev/null
@@ -0,0 +1,29 @@
+package com.hughes.android.dictionary.engine;
+
+import java.util.Comparator;
+
+import com.ibm.icu.text.Transliterator;
+
+/**
+ * Compares strings by their normalized form first and, when the normalized
+ * forms compare as equal, by the original strings.
+ */
+public class NormalizeComparator implements Comparator<String> {
+
+  final Transliterator normalizer;
+  final Comparator<Object> comparator;
+
+  /**
+   * @param normalizer transform applied to both operands before the primary comparison
+   * @param comparator ordering used for the normalized and for the raw comparison
+   */
+  public NormalizeComparator(final Transliterator normalizer,
+      final Comparator<Object> comparator) {
+    this.normalizer = normalizer;
+    this.comparator = comparator;
+  }
+
+  @Override
+  public int compare(final String s1, final String s2) {
+    final int byNormalizedForm =
+        comparator.compare(normalizer.transform(s1), normalizer.transform(s2));
+    // A tie on the normalized form is broken by comparing the raw inputs.
+    return byNormalizedForm != 0 ? byNormalizedForm : comparator.compare(s1, s2);
+  }
+
+}
similarity index 94%
rename from src/com/hughes/android/dictionary/engine/DictFileParser.java
rename to src/com/hughes/android/dictionary/parser/DictFileParser.java
index ebdbaefb727fd9c338042c582aa3145ced4d27af..1e01ae2bf9b2e4be43b44c400137a5649af8061c 100644 (file)
@@ -1,4 +1,4 @@
-package com.hughes.android.dictionary.engine;
+package com.hughes.android.dictionary.parser;
 
 import java.io.BufferedReader;
 import java.io.File;
@@ -11,6 +11,12 @@ import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.EntryData;
+import com.hughes.android.dictionary.engine.EntryTypeName;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.Language;
+import com.hughes.android.dictionary.engine.PairEntry;
 import com.hughes.android.dictionary.engine.PairEntry.Pair;
 
 public class DictFileParser {
@@ -18,11 +24,11 @@ public class DictFileParser {
   static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
 
   // Dictcc
-  static final Pattern TAB = Pattern.compile("\\t");
+  public static final Pattern TAB = Pattern.compile("\\t");
 
   // Chemnitz
-  static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
-  static final Pattern PIPE = Pattern.compile("\\|");
+  public static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
+  public static final Pattern PIPE = Pattern.compile("\\|");
   
   static final Pattern SPACES = Pattern.compile("\\s+");
   static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
new file mode 100644 (file)
index 0000000..677b5ee
--- /dev/null
@@ -0,0 +1,272 @@
+package com.hughes.android.dictionary.parser;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+
+/**
+ * SAX handler that reads an English Wiktionary XML dump.
+ *
+ * <p>The title and text of every {@code <page>} element are buffered.  When the
+ * page ends, its text is run through {@link WikiParser} and the resulting
+ * events, delivered through {@link WikiCallback}, are collected into
+ * {@link WikiWord} objects.
+ */
+public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+
+  final DictionaryBuilder dict;
+
+  final IndexBuilder[] indexBuilders;
+  final Pattern[] langPatterns;
+
+  // Buffers for the <title> and <text> of the page being read.  currentBuilder
+  // is whichever of the two should receive character data, or null.
+  StringBuilder titleBuilder;
+  StringBuilder textBuilder;
+  StringBuilder currentBuilder = null;
+
+  /**
+   * @param builder dictionary being built; its index builders are copied into an array here
+   * @param langPatterns exactly two patterns, one for each language of interest
+   * @param enIndexBuilder not used by this constructor
+   */
+  public EnWiktionaryXmlParser(final DictionaryBuilder builder, final Pattern[] langPatterns, final int enIndexBuilder) {
+    assert langPatterns.length == 2;
+    this.dict = builder;
+    this.indexBuilders = dict.indexBuilders.toArray(new IndexBuilder[0]);
+    this.langPatterns = langPatterns;
+  }
+
+  /** Starts fresh buffers on {@code page} and selects the buffer for {@code title} / {@code text}. */
+  @Override
+  public void startElement(String uri, String localName, String qName,
+      Attributes attributes) {
+    currentBuilder = null;
+    if ("page".equals(qName)) {
+      titleBuilder = new StringBuilder();
+
+      // Start with "\n" to better match certain strings.
+      textBuilder = new StringBuilder("\n");
+    } else if ("title".equals(qName)) {
+      currentBuilder = titleBuilder;
+    } else if ("text".equals(qName)) {
+      currentBuilder = textBuilder;
+    }
+  }
+
+  /** Appends character data to the currently selected buffer, if any. */
+  @Override
+  public void characters(char[] ch, int start, int length) throws SAXException {
+    if (currentBuilder != null) {
+      currentBuilder.append(ch, start, length);
+    }
+  }
+
+  /** Deselects the buffer and, at the end of a {@code page}, parses what was collected. */
+  @Override
+  public void endElement(String uri, String localName, String qName)
+      throws SAXException {
+    currentBuilder = null;
+    if ("page".equals(qName)) {
+      endPage();
+    }
+  }
+
+
+  /** Parses the given XML file with this object as the SAX handler. */
+  public void parse(final File file) throws ParserConfigurationException,
+      SAXException, IOException {
+    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+    parser.parse(file, this);
+  }
+
+  /** Resets the per-page state and hands the buffered page text to the wiki parser. */
+  private void endPage() {
+    title = titleBuilder.toString();
+    currentDepth = 0;
+    words.clear();
+    // Section state must not leak in from the previous page: list items that
+    // appear before this page's first heading would otherwise be attributed
+    // to the last section of the page before it.
+    currentWord = null;
+    currentPartOfSpeech = null;
+    currentTranslationSection = null;
+    WikiParser.parse(textBuilder.toString(), this);
+  }
+
+  /**
+   * Two things can happen:
+   * 
+   * We can be in a ==German== section.  There we will see English definitions.
+   * Each POS should get its own QuickDic entry.  Pretty much everything goes
+   * in.
+   * 
+   * Or we can be in an ==English== section with English definitions
+   * and maybe see translations for languages we care about.
+   * 
+   * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
+   * into separate QuickDic entries, but that's tricky--how do we know when we
+   * found a subsection?  Just ignore anything containing pronunciation and
+   * etymology?
+   * 
+   * How do we decide when to seal the deal on an entry?
+   * 
+   * Would be nice if the parser told us about leaving sections....
+   * 
+   * 
+   */
+
+  String title;
+  int currentDepth;
+  final List<WikiWord> words = new ArrayList<WikiWord>();
+  WikiWord currentWord;
+  WikiWord.PartOfSpeech currentPartOfSpeech;
+  WikiWord.TranslationSection currentTranslationSection;
+
+  // Non-null only while the text of a heading or a list item is being collected.
+  StringBuilder wikiBuilder = null;
+
+  // ------------------------------------------------------------------------
+
+  /** Adds the last argument of the link to the text being collected. */
+  @Override
+  public void onWikiLink(String[] args) {
+    if (wikiBuilder != null) {
+      wikiBuilder.append(args[args.length - 1]);
+    }
+  }
+
+  /** Templates are currently ignored; the branches are placeholders. */
+  @Override
+  public void onTemplate(String[][] args) {
+    final String name = args[0][1];
+    // Compare by value: "==" on Strings only tests reference identity.
+    if ("".equals(name)) {
+
+    } else {
+      //System.out.println("Unhandled template: " + name);
+    }
+  }
+
+  /** Adds plain text to the heading or list item being collected, if any. */
+  @Override
+  public void onText(String text) {
+    if (wikiBuilder != null) {
+      wikiBuilder.append(text);
+      return;
+    }
+  }
+
+  /**
+   * Begins collecting the heading text and leaves any part of speech or word
+   * whose own heading is at the same depth as the new heading or deeper.
+   */
+  @Override
+  public void onHeadingStart(int depth) {
+    wikiBuilder = new StringBuilder();
+    currentDepth = depth;
+    if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
+      currentPartOfSpeech = null;
+    }
+    if (currentWord != null && depth <= currentWord.depth) {
+      currentWord = null;
+    }
+  }
+
+  final Pattern partOfSpeechHeader = Pattern.compile(
+      "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+      "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+      "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+      "Ligature|Idiom|Phrase|" +
+      // These are @deprecated:
+      "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+      "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
+
+  /**
+   * Classifies the finished heading: a language heading starts a new word, a
+   * part-of-speech heading starts a new part of speech inside the current
+   * word, and a "Translations" heading opens a translation section.
+   */
+  @Override
+  public void onHeadingEnd(int depth) {
+    final String name = wikiBuilder.toString().trim();
+    wikiBuilder = null;
+
+    // Every heading closes the previous translation section.  Doing this up
+    // front also covers the language / part-of-speech paths that return early;
+    // otherwise list items of the next section would be recorded as
+    // translations of the previous one.
+    currentTranslationSection = null;
+
+    final boolean lang1 = langPatterns[0].matcher(name).matches();
+    final boolean lang2 = langPatterns[1].matcher(name).matches();
+    if (name.equalsIgnoreCase("English") || lang1 || lang2) {
+      currentWord = new WikiWord(depth);
+      currentWord.language = name;
+      currentWord.isLang1 = lang1;
+      currentWord.isLang2 = lang2;
+      words.add(currentWord);
+      return;
+    }
+
+    if (currentWord == null) {
+      return;
+    }
+
+    if (partOfSpeechHeader.matcher(name).matches()) {
+      currentPartOfSpeech = new WikiWord.PartOfSpeech(depth);
+      currentWord.partsOfSpeech.add(currentPartOfSpeech);
+      return;
+    }
+
+    if (name.equals("Translations")) {
+      // currentWord is known to be non-null here (checked above).
+      if (!currentWord.language.equals("English") ||
+          currentPartOfSpeech == null) {
+        System.out.println("Unexpected Translations section: " + title);
+        return;
+      }
+      currentTranslationSection = new WikiWord.TranslationSection();
+      currentPartOfSpeech.translationSections.add(currentTranslationSection);
+    }
+  }
+
+  /** Begins collecting the text of a list item. */
+  @Override
+  public void onListItemStart(String header, int[] section) {
+    wikiBuilder = new StringBuilder();
+  }
+
+
+  /**
+   * Inside a translation section, splits the item at its first ':' into a
+   * language label and a translation, and stores the translation under every
+   * language pattern that is found in the label.
+   */
+  @Override
+  public void onListItemEnd(String header, int[] section) {
+    final String item = wikiBuilder.toString();
+    wikiBuilder = null;
+
+    if (currentTranslationSection != null) {
+      final int colonPos = item.indexOf(':');
+      if (colonPos == -1) {
+        System.out.println("Invalid translation: " + item);
+        return;
+      }
+      final String lang = item.substring(0, colonPos);
+      final String trans = item.substring(colonPos + 1);
+      for (int i = 0; i < 2; ++i) {
+        if (langPatterns[i].matcher(lang).find()) {
+          currentTranslationSection.translations.get(i).add(trans);
+        }
+      }
+    }
+  }
+
+  @Override
+  public void onNewLine() {
+  }
+
+  @Override
+  public void onNewParagraph() {
+  }
+
+  // ----------------------------------------------------------------------
+
+  /**
+   * Opens a new translation section in the current part of speech; the second
+   * template argument, when present, becomes its sense.
+   */
+  public void onTransTrop(final String[][] args) {
+    if (currentPartOfSpeech == null) {
+      // There is nothing to attach the section to; report it the same way an
+      // unexpected "Translations" heading is reported instead of failing with
+      // a NullPointerException.
+      System.out.println("Unexpected Translations section: " + title);
+      return;
+    }
+    currentTranslationSection = new WikiWord.TranslationSection();
+    currentPartOfSpeech.translationSections.add(currentTranslationSection);
+
+    if (args.length > 1) {
+      currentTranslationSection.sense = args[1][1];
+    }
+  }
+
+
+  // ----------------------------------------------------------------------
+
+  @Override
+  public void onComment(String text) {
+  }
+
+  @Override
+  public void onFormatBold(boolean boldOn) {
+  }
+
+  @Override
+  public void onFormatItalic(boolean italicOn) {
+  }
+
+  /** Treats unterminated markup as fatal. */
+  @Override
+  public void onUnterminated(String start, String rest) {
+    throw new RuntimeException(rest);
+  }
+  /** Treats a mismatched heading terminator as fatal. */
+  @Override
+  public void onInvalidHeaderEnd(String rest) {
+    throw new RuntimeException(rest);
+  }
+
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiCallback.java b/src/com/hughes/android/dictionary/parser/WikiCallback.java
new file mode 100644 (file)
index 0000000..44865cc
--- /dev/null
@@ -0,0 +1,33 @@
+package com.hughes.android.dictionary.parser;
+
+
+/**
+ * Receiver for the events that {@link WikiParser} emits while scanning wiki
+ * text.
+ */
+public interface WikiCallback {
+
+  /** Text found between {@code <!--} and {@code -->}. */
+  void onComment(String text);
+
+  /** Bold was toggled; {@code boldOn} is the state after the toggle. */
+  void onFormatBold(boolean boldOn);
+
+  /** Italic was toggled; {@code italicOn} is the state after the toggle. */
+  void onFormatItalic(boolean italicOn);
+
+  /** Content of a {@code [[...]]} link, split on {@code |}. */
+  void onWikiLink(String[] args);
+
+  /**
+   * Content of a <code>{{...}}</code> template, split on {@code |}.  Each entry
+   * is a two-element array: the text before the first {@code =} (or
+   * {@code null} when there is none) and the remaining text.
+   */
+  void onTemplate(String[][] args);
+
+  /** Plain text.  Will never contain a newline unless it's in a {@code <pre>}. */
+  void onText(String text);
+
+  /** Only at start of line.  {@code depth} is the number of {@code =} signs. */
+  void onHeadingStart(int depth);
+
+  /** The heading opened by {@link #onHeadingStart} was closed at the same depth. */
+  void onHeadingEnd(int depth);
+
+  /** A single newline was consumed. */
+  void onNewLine();
+
+  /** Two consecutive newlines were consumed. */
+  void onNewParagraph();
+
+  /**
+   * A list item begins.  {@code header} is the run of list-marker characters
+   * at the start of the line; WikiParser currently passes {@code null} for
+   * {@code section}.
+   */
+  void onListItemStart(String header, int[] section);
+
+  /** The list item opened with the given {@code header} ended at a newline. */
+  void onListItemEnd(String header, int[] section);
+
+  // Errors
+
+  /** The opening token {@code start} has no matching terminator in {@code rest}. */
+  void onUnterminated(String start, String rest);
+
+  /** A heading was closed with a different number of {@code =} than it was opened with. */
+  void onInvalidHeaderEnd(String rest);
+
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiParser.java b/src/com/hughes/android/dictionary/parser/WikiParser.java
new file mode 100644 (file)
index 0000000..84dc770
--- /dev/null
@@ -0,0 +1,128 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A small event-driven scanner for wiki markup.
+ *
+ * <p>{@link #parse} walks the text from left to right, locates the next piece
+ * of markup it recognises and reports text and markup through a
+ * {@link WikiCallback}.
+ */
+public class WikiParser {
+
+  // MULTILINE makes ^ and $ match at line boundaries.  UNIX_LINES restricts
+  // those boundaries to '\n', which is the only line terminator parse() knows
+  // how to consume; without it "$" would also match in front of '\r' or
+  // U+2028, which no branch below handles.
+  private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|^[*#;:]+|^(==+)\\s*|(==+)\\s*$|<!--|<pre>", Pattern.MULTILINE | Pattern.UNIX_LINES);
+  private static final Pattern listStart = Pattern.compile("^[*#;:]");
+  private static final Pattern pipeSplit = Pattern.compile("\\s*\\|\\s*");
+  private static final Pattern whitespace = Pattern.compile("\\s+");
+
+  /**
+   * Scans {@code wikiText} and reports its contents to {@code callback}.
+   *
+   * <p>Unterminated templates, links, comments and {@code <pre>} blocks, and
+   * headings whose terminator has the wrong length, are reported through the
+   * callback's error methods, after which parsing stops.
+   *
+   * @param wikiText the text to scan
+   * @param callback receiver of the events
+   * @throws RuntimeException if a newline or the end of the text is reached
+   *     while a heading is still open
+   */
+  static void parse(final String wikiText, final WikiCallback callback) {
+
+    boolean boldOn = false;
+    boolean italicOn = false;
+    int insideHeaderDepth = -1;
+    String lastListItem = null;
+
+    String rest = wikiText;
+    while (rest.length() > 0) {
+      final Matcher matcher = markup.matcher(rest);
+      // "$" matches at the end of the input, so find() is expected to succeed;
+      // treating a failure as "no more markup" keeps the loop from spinning.
+      final int nextMarkupPos = matcher.find() ? matcher.start() : rest.length();
+      if (nextMarkupPos != 0) {
+        // Collapse runs of whitespace.  replaceAll returns the new string, so
+        // its result has to be the value that is passed on.
+        final String text = whitespace.matcher(rest.substring(0, nextMarkupPos)).replaceAll(" ");
+        callback.onText(text);
+      }
+      rest = rest.substring(nextMarkupPos);
+
+      if (rest.length() == 0) {
+        // The text did not end with a newline: "$" matched at the very end.
+        // Finish what a final newline would have finished.
+        if (insideHeaderDepth != -1) {
+          throw new RuntimeException("Unterminated heading at end of input");
+        }
+        if (lastListItem != null) {
+          callback.onListItemEnd(lastListItem, null);
+        }
+        return;
+      }
+
+      if (rest.startsWith("\n")) {
+        if (insideHeaderDepth != -1) {
+          throw new RuntimeException("barf");
+        }
+        if (lastListItem != null) {
+          callback.onListItemEnd(lastListItem, null);
+        }
+        // lookingAt(): only the start of the next line has to be a list
+        // marker.  matches() would require the entire remaining text to be a
+        // single marker character.
+        if (!listStart.matcher(rest.substring(1)).lookingAt()) {
+          lastListItem = null;
+        }
+        if (rest.startsWith("\n\n")) {
+          // TODO(thadh): eat all the newlines.
+          callback.onNewParagraph();
+          rest = rest.substring(2);
+        } else {
+          callback.onNewLine();
+          rest = rest.substring(1);
+        }
+      } else if (rest.startsWith("'''")) {
+        boldOn = !boldOn;
+        callback.onFormatBold(boldOn);
+        rest = rest.substring(3);
+      } else if (rest.startsWith("''")) {
+        italicOn = !italicOn;
+        callback.onFormatItalic(italicOn);
+        rest = rest.substring(2);
+      } else if (rest.startsWith("{{")) {
+        final int end = rest.indexOf("}}");
+        if (end == -1) {
+          callback.onUnterminated("{{", rest);
+          return;
+        }
+        callback.onTemplate(parseTemplateArgs(rest.substring(2, end).trim()));
+        rest = rest.substring(end + 2);
+      } else if (rest.startsWith("[[")) {
+        final int end = rest.indexOf("]]");
+        if (end == -1) {
+          callback.onUnterminated("[[", rest);
+          return;
+        }
+        final String wikiLink = rest.substring(2, end);
+        final String[] args = pipeSplit.split(wikiLink);
+        callback.onWikiLink(args);
+        rest = rest.substring(end + 2);
+      } else if (rest.startsWith("=")) {
+        // Group 1 is a run of '=' at the start of a line, group 2 one at the end.
+        final String match = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
+        if (insideHeaderDepth == -1) {
+          insideHeaderDepth = match.length();
+          callback.onHeadingStart(insideHeaderDepth);
+        } else {
+          if (match.length() != insideHeaderDepth) {
+            callback.onInvalidHeaderEnd(rest);
+            return;
+          }
+          callback.onHeadingEnd(insideHeaderDepth);
+          insideHeaderDepth = -1;
+        }
+        // Only the '=' run is consumed; whitespace after it is left in the text.
+        rest = rest.substring(match.length());
+      } else if (rest.startsWith("*") || rest.startsWith("#") || rest.startsWith(";") || rest.startsWith(":")) {
+        lastListItem = matcher.group();
+        callback.onListItemStart(lastListItem, null);
+        rest = rest.substring(lastListItem.length());
+      } else if (rest.startsWith("<!--")) {
+        final int end = rest.indexOf("-->");
+        if (end == -1) {
+          callback.onUnterminated("<!--", rest);
+          return;
+        }
+        callback.onComment(rest.substring(4, end));
+        rest = rest.substring(end + 3);
+      } else if (rest.startsWith("<pre>")) {
+        final int end = rest.indexOf("</pre>");
+        if (end == -1) {
+          callback.onUnterminated("<pre>", rest);
+          return;
+        }
+        // Preformatted text is passed through untouched, newlines included.
+        callback.onText(rest.substring(5, end));
+        rest = rest.substring(end + 6);
+      } else {
+        throw new RuntimeException("barf!");
+      }
+    }
+  }
+
+  /**
+   * Splits a template body on '|' and each piece at its first '=' into a
+   * {name, value} pair; the name is null for a piece without '='.
+   */
+  private static String[][] parseTemplateArgs(final String template) {
+    final String[] pieces = pipeSplit.split(template);
+    final String[][] templateArgs = new String[pieces.length][];
+    for (int i = 0; i < pieces.length; ++i) {
+      final int equalPos = pieces[i].indexOf('=');
+      if (equalPos == -1) {
+        templateArgs[i] = new String[] { null, pieces[i] };
+      } else {
+        templateArgs[i] = new String[] { pieces[i].substring(0, equalPos), pieces[i].substring(equalPos + 1) };
+      }
+    }
+    return templateArgs;
+  }
+
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiParserTest.java b/src/com/hughes/android/dictionary/parser/WikiParserTest.java
new file mode 100644 (file)
index 0000000..a8d4be8
--- /dev/null
@@ -0,0 +1,132 @@
+package com.hughes.android.dictionary.parser;
+
+import junit.framework.TestCase;
+
+public class WikiParserTest extends TestCase {
+  
+  /**
+   * Parses a sample containing formatting, comments, list items, a wiki link,
+   * a template and headings, and compares what PrintWikiCallback rendered
+   * with the expected string.
+   */
+  public void testSimple() {
+    // Input: one wiki line per source line, each terminated by "\n".
+    final String text =
+      "Hi" + "\n" +
+      "Hello ''thad'' you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
+      "hi <!--" + "\n" +
+      "multi-line" + "\n" +
+      "# comment -->" + "\n" +
+      "" + "\n" +
+      "# li" + "\n" +
+      "# li2" + "\n" +
+      "## li2.2" + "\n" +
+      "Hi again." + "\n" +
+      "here's [[some blah|some]] wikitext." + "\n" +
+      "here's a {{template|blah=2|blah2=3|" + "\n" +
+      "blah3=3}} and some more text." + "\n" +
+      "== Header 2 ==" + "\n" +
+      "=== {{header-template}} ===" + "\n";
+    
+    // Expected rendering.  The comparison is exact, so every space and
+    // newline in these literals is significant.
+    final String expected = "Hi Hello <i>thad</i> you're \n" +
+        "comment: not \n" +
+        " <b>pretty</b> cool <b><i>over</b></i> there. hi \n" +
+        "comment:\n" +
+        "multi-line\n" +
+        "# comment \n" +
+        "\n" +
+        "\n" +
+        "# li\n" +
+        " # li2\n" +
+        " ## li2.2\n" +
+        " Hi again. here's [[some]] wikitext. here's a \n" +
+        "template:template\n" +
+        " and some more text. \n" +
+        "HEADER   Header 2 \n" +
+        " \n" +
+        "HEADER    \n" +
+        "template:header-template\n" +
+        " \n" +
+        " ";
+    final PrintWikiCallback callback = new PrintWikiCallback();
+    WikiParser.parse(text, callback);
+    assertEquals(expected, callback.builder.toString());
+    
+  }
+  
+  
+  /**
+   * Callback that renders every parser event as text into {@link #builder},
+   * so that a whole parse can be compared against one expected string.
+   */
+  static final class PrintWikiCallback implements WikiCallback {
+    final StringBuilder builder = new StringBuilder();
+
+    @Override
+    public void onComment(String text) {
+      builder.append("\ncomment:").append(text).append("\n");
+    }
+
+    @Override
+    public void onFormatBold(boolean boldOn) {
+      builder.append(boldOn ? "<b>" : "</b>");
+    }
+
+    @Override
+    public void onFormatItalic(boolean italicOn) {
+      builder.append(italicOn ? "<i>" : "</i>");
+    }
+
+    @Override
+    public void onWikiLink(String[] args) {
+      builder.append("[[").append(args[args.length - 1]).append("]]");
+    }
+
+    @Override
+    public void onTemplate(String[][] args) {
+      // Each argument is a {name, value} pair and the template's own name is
+      // the value of the first pair.  Its name slot is null because the
+      // template name contains no '=', so index 0 would print "null".
+      builder.append("\ntemplate:").append(args[0][1]).append("\n");
+    }
+
+    @Override
+    public void onText(String text) {
+      builder.append(text);
+    }
+
+    @Override
+    public void onHeadingStart(int depth) {
+      // One space per heading level after the HEADER tag.
+      builder.append("\nHEADER");
+      for (int i = 0; i < depth; ++i) {
+        builder.append(" ");
+      }
+    }
+
+    @Override
+    public void onHeadingEnd(int depth) {
+      builder.append("\n");
+    }
+
+    @Override
+    public void onNewLine() {
+      builder.append(" ");
+    }
+
+    @Override
+    public void onNewParagraph() {
+      builder.append("\n\n");
+    }
+
+    @Override
+    public void onListItemStart(String header, int[] section) {
+      builder.append(header);
+    }
+
+    @Override
+    public void onListItemEnd(String header, int[] section) {
+      builder.append("\n");
+    }
+
+    @Override
+    public void onUnterminated(String start, String rest) {
+      throw new RuntimeException("bad");
+    }
+
+    @Override
+    public void onInvalidHeaderEnd(String rest) {
+      throw new RuntimeException("bad");
+    }
+
+  }
+  
+
+
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiWord.java b/src/com/hughes/android/dictionary/parser/WikiWord.java
new file mode 100644 (file)
index 0000000..49806d2
--- /dev/null
@@ -0,0 +1,58 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Mutable holder for the data collected under one heading of a wiki page,
+ * together with the nested holders for its parts of speech, translation
+ * sections, meanings and examples.
+ */
+public class WikiWord {
+  /** Depth given to the constructor. */
+  final int depth;
+
+  String language;
+  String pronunciation;
+
+  boolean isLang1;
+  boolean isLang2;
+
+  final List<PartOfSpeech> partsOfSpeech = new ArrayList<PartOfSpeech>();
+
+  final Map<String, List<String>> otherSections = new LinkedHashMap<String, List<String>>();
+
+  public WikiWord(int depth) {
+    this.depth = depth;
+  }
+
+  /** Data collected for one part of speech. */
+  static class PartOfSpeech {
+    /** Depth given to the constructor. */
+    final int depth;
+
+    final List<Meaning> meaning = new ArrayList<Meaning>();
+
+    final List<TranslationSection> translationSections = new ArrayList<TranslationSection>();
+
+    final Map<String, String> otherSections = new LinkedHashMap<String, String>();
+
+    public PartOfSpeech(final int depth) {
+      this.depth = depth;
+    }
+  }
+
+  /** One translation section; {@code translations} always starts with two empty lists. */
+  static class TranslationSection {
+    String sense;
+    List<List<String>> translations = new ArrayList<List<String>>();
+
+    TranslationSection() {
+      for (int i = 0; i < 2; ++i) {
+        translations.add(new ArrayList<String>());
+      }
+    }
+  }
+
+  /** A meaning with an optional example. */
+  static class Meaning {
+    String meaning;
+    Example example;
+  }
+
+  /** An example and its English rendering. */
+  static class Example {
+    String example;
+    String exampleInEnglish;
+  }
+
+}