From f602bd837028813c3e2d7cc1456a3b7b6a8bb53e Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Thu, 9 Dec 2010 15:37:44 -0800 Subject: [PATCH] go --- .../engine/DictionaryBuilderTest.java | 61 ++++-- .../parser/EnWiktionaryXmlParser.java | 193 +++++++++++++++--- .../dictionary/parser/WikiCallback.java | 5 +- .../android/dictionary/parser/WikiParser.java | 59 ++++-- .../dictionary/parser/WikiParserTest.java | 31 ++- .../android/dictionary/parser/WikiWord.java | 8 +- 6 files changed, 274 insertions(+), 83 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 7c8d232..903f327 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -11,32 +11,59 @@ import junit.framework.TestCase; public class DictionaryBuilderTest extends TestCase { - public void testGermanCombined() throws Exception { - final File result = new File("testdata/de-en.quickdic"); + public void testWiktionaryCombined() throws Exception { + final File result = new File("testdata/wiktionary.quickdic"); System.out.println("Writing to: " + result); DictionaryBuilder.main(new String[] { "--dictOut=" + result.getAbsolutePath(), "--lang1=DE", "--lang2=EN", - "--dictInfo=@testdata/de-en_dictInfo.txt", - -// "--input1=testdata/de-en_chemnitz_100", -// "--input1Name=dictcc", -// "--input1Charset=UTF8", -// "--input1Format=chemnitz", -// -// "--input2=testdata/de-en_dictcc_100", -// "--input2Name=dictcc", -// "--input2Charset=UTF8", -// "--input2Format=dictcc", + "--dictInfo=SomeWikiData", "--input3=testdata/enwiktionary_small.xml", "--input3Name=enwiktionary", "--input3Format=enwiktionary", - "--input3TranslationPattern1=[Gg]erman", - "--input3TranslationPattern2=[Ee]glish", + "--input3TranslationPattern1=German|Italian|Spanish|French|Japanese|Arabic|Mandarin", + "--input3TranslationPattern2=English", 
"--input3EnIndex=2", + "--print=testdata/wiktionary.test", + }); + + // Check it once: + assertFilesEqual("testdata/wiktionary.golden", "testdata/wiktionary.test"); + + + // Check it again. + final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r")); + final PrintStream out = new PrintStream(new File("testdata/wiktionary.test")); + dict.print(out); + out.close(); + + assertFilesEqual("testdata/wiktionary.golden", "testdata/wiktionary.test"); + } + + + public void testGermanCombined() throws Exception { + if (1==1) throw new RuntimeException(); + final File result = new File("testdata/de-en.quickdic"); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=DE", + "--lang2=EN", + "--dictInfo=@testdata/de-en_dictInfo.txt", + + "--input1=testdata/de-en_chemnitz_100", + "--input1Name=dictcc", + "--input1Charset=UTF8", + "--input1Format=chemnitz", + + "--input2=testdata/de-en_dictcc_100", + "--input2Name=dictcc", + "--input2Charset=UTF8", + "--input2Format=dictcc", + "--print=testdata/de-en.test", }); @@ -52,7 +79,8 @@ public class DictionaryBuilderTest extends TestCase { assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test"); } - + + void assertFilesEqual(final String expected, final String actual) throws IOException { final String expectedString = FileUtil.readToString(new File(expected)); @@ -60,4 +88,5 @@ public class DictionaryBuilderTest extends TestCase { assertEquals(expectedString, actualString); } + } diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 677b5ee..51d63c8 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -3,7 +3,11 @@ package com.hughes.android.dictionary.parser; import java.io.File; import java.io.IOException; 
import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; @@ -15,8 +19,21 @@ import org.xml.sax.SAXException; import com.hughes.android.dictionary.engine.DictionaryBuilder; import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.parser.WikiWord.TranslationSection; public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback { + + static final Pattern partOfSpeechHeader = Pattern.compile( + "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + + "Ligature|Idiom|Phrase|" + + // These are @deprecated: + "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb"); + + static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+"); + final DictionaryBuilder dict; @@ -77,9 +94,31 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im title = titleBuilder.toString(); currentDepth = 0; words.clear(); + currentHeading = null; WikiParser.parse(textBuilder.toString(), this); + + for (final WikiWord word : words) { + System.out.println("\n" + title + ", " + word.language + ", pron=" + word.accentToPronunciation); + if (word.partsOfSpeech.isEmpty() && title.indexOf(":") == -1) { + System.err.println("Word with no POS: " + title); + } + for (final WikiWord.PartOfSpeech partOfSpeech : word.partsOfSpeech) { + System.out.println(" pos: " + partOfSpeech.name); + + for (final TranslationSection translationSection : partOfSpeech.translationSections) { + System.out.println(" sense: " + translationSection.sense); + + } + } + } } + + // 
------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + /** * Two things can happen: * @@ -103,6 +142,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im */ String title; + String currentHeading; int currentDepth; final List words = new ArrayList(); WikiWord currentWord; @@ -111,22 +151,111 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im StringBuilder wikiBuilder = null; - // ------------------------------------------------------------------------ - @Override public void onWikiLink(String[] args) { - if (wikiBuilder != null) { - wikiBuilder.append(args[args.length - 1]); + if (wikiBuilder == null) { + return; } + wikiBuilder.append(args[args.length - 1]); } + + // ttbc: translations to be checked. 
+ static final Set useRemainingArgTemplates = new LinkedHashSet(Arrays.asList( + "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", + "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", + "zh-tsp", "zh-zh-p")); + static final Set ignoreTemplates = new LinkedHashSet(Arrays.asList("")); + static final Set grammarTemplates = new LinkedHashSet(Arrays.asList("impf", "pf")); @Override - public void onTemplate(String[][] args) { - final String name = args[0][1]; + public void onTemplate(final List positionalArgs, final Map namedArgs) { + final String name = positionalArgs.get(0); + + // Pronunciation + if (name.equals("a")) { + // accent tag + currentWord.currentPronunciation = new StringBuilder(); + currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation); + return; + } + if (name.equals("IPA") || name.equals("SAMPA") || name.equals("enPR") || name.equals("rhymes")) { + namedArgs.remove("lang"); + assert positionalArgs.size() >= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); + if (currentWord.currentPronunciation == null) { + currentWord.currentPronunciation = new StringBuilder(); + currentWord.accentToPronunciation.put("", currentWord.currentPronunciation); + } + currentWord.currentPronunciation.append(name).append(": "); + for (int i = 1; i < positionalArgs.size(); ++i) { + if (i > 1) { + currentWord.currentPronunciation.append(", "); + } + final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll(""); + currentWord.currentPronunciation.append(pron).append(""); + } + return; + } + if (name.equals("audio")) { + return; + } + if ("Pronunciation".equals(currentHeading)) { + System.err.println("Unhandled template: " + name); + } + + // Translations + if (name.equals("trans-top")) { + assert positionalArgs.size() == 2 && namedArgs.isEmpty(); + currentTranslationSection = new WikiWord.TranslationSection(); + 
currentPartOfSpeech.translationSections.add(currentTranslationSection); + if (positionalArgs.size() > 1) { + currentTranslationSection.sense = positionalArgs.get(1); + } + return; + } + + if (wikiBuilder == null) { + return; + } if (name == "") { - + } else if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) { + wikiBuilder.append("{").append(name).append("}"); + } else if (name.equals("p")) { + wikiBuilder.append("pl."); + } else if (name.equals("s")) { + wikiBuilder.append("sg."); + } else if (grammarTemplates.contains(name)) { + wikiBuilder.append(name).append("."); + } else if (name.equals("l")) { + wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2)); + } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) { + if (positionalArgs.size() >= 2) { + wikiBuilder.append(positionalArgs.get(1)); + } + if (positionalArgs.size() >= 3) { + wikiBuilder.append(" {").append(positionalArgs.get(1)).append("}"); + } + final String transliteration = namedArgs.remove("tr"); + if (transliteration != null) { + wikiBuilder.append(" (").append(transliteration).append(")"); + } + } else if (name.equals("trreq")) { + wikiBuilder.append("{{trreq}}"); + } else if (name.equals("qualifier")) { + wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); + } else if (useRemainingArgTemplates.contains(name)) { + for (int i = 1; i < positionalArgs.size(); ++i) { + if (i != 1) { + wikiBuilder.append(", "); + } + wikiBuilder.append(positionalArgs.get(i)); + } + } else if (ignoreTemplates.contains(name)) { + } else if (name.equals("initialism")) { + wikiBuilder.append("Initialism"); } else { - //System.out.println("Unhandled template: " + name); + if (currentTranslationSection != null) { + System.err.println("Unhandled template: " + name); + } } } @@ -150,19 +279,12 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im } } - final Pattern 
partOfSpeechHeader = Pattern.compile( - "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + - "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + - "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|" + - // These are @deprecated: - "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + - "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb"); - @Override public void onHeadingEnd(int depth) { final String name = wikiBuilder.toString().trim(); wikiBuilder = null; + currentTranslationSection = null; + currentHeading = name; final boolean lang1 = langPatterns[0].matcher(name).matches(); final boolean lang2 = langPatterns[1].matcher(name).matches(); @@ -180,7 +302,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im } if (partOfSpeechHeader.matcher(name).matches()) { - currentPartOfSpeech = new WikiWord.PartOfSpeech(depth); + currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name); currentWord.partsOfSpeech.add(currentPartOfSpeech); return; } @@ -194,14 +316,27 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im } currentTranslationSection = new WikiWord.TranslationSection(); currentPartOfSpeech.translationSections.add(currentTranslationSection); - } else { - currentTranslationSection = null; } + + if (name.equals("Translations")) { + if (currentWord == null || + !currentWord.language.equals("English") || + currentPartOfSpeech == null) { + System.out.println("Unexpected Translations section: " + title); + return; + } + currentTranslationSection = new WikiWord.TranslationSection(); + currentPartOfSpeech.translationSections.add(currentTranslationSection); + } + } @Override public void onListItemStart(String header, int[] section) { wikiBuilder = new StringBuilder(); + if (currentWord != null) { + currentWord.currentPronunciation = null; + } } @@ -210,10 +345,14 @@ public class EnWiktionaryXmlParser extends 
org.xml.sax.helpers.DefaultHandler im final String item = wikiBuilder.toString(); wikiBuilder = null; + if (item.indexOf("{{trreq}}") != -1) { + return; + } + if (currentTranslationSection != null) { final int colonPos = item.indexOf(':'); if (colonPos == -1) { - System.out.println("Invalid translation: " + item); + System.err.println("Invalid translation: " + item); return; } final String lang = item.substring(0, colonPos); @@ -236,18 +375,6 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im // ---------------------------------------------------------------------- - public void onTransTrop(final String[][] args) { - currentTranslationSection = new WikiWord.TranslationSection(); - currentPartOfSpeech.translationSections.add(currentTranslationSection); - - if (args.length > 1) { - currentTranslationSection.sense = args[1][1]; - } - } - - - // ---------------------------------------------------------------------- - @Override public void onComment(String text) { } diff --git a/src/com/hughes/android/dictionary/parser/WikiCallback.java b/src/com/hughes/android/dictionary/parser/WikiCallback.java index 44865cc..ad00975 100644 --- a/src/com/hughes/android/dictionary/parser/WikiCallback.java +++ b/src/com/hughes/android/dictionary/parser/WikiCallback.java @@ -1,5 +1,8 @@ package com.hughes.android.dictionary.parser; +import java.util.List; +import java.util.Map; + public interface WikiCallback { @@ -10,7 +13,7 @@ public interface WikiCallback { void onWikiLink(final String[] args); - void onTemplate(final String[][] args); + void onTemplate(final List positionalArgs, final Map namedArgs); // Will never contain a newline unless it's in a
   void onText(final String text);
diff --git a/src/com/hughes/android/dictionary/parser/WikiParser.java b/src/com/hughes/android/dictionary/parser/WikiParser.java
index 84dc770..8dd4864 100644
--- a/src/com/hughes/android/dictionary/parser/WikiParser.java
+++ b/src/com/hughes/android/dictionary/parser/WikiParser.java
@@ -1,14 +1,20 @@
 package com.hughes.android.dictionary.parser;
 
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 public class WikiParser {
   
-  private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|^[*#;:]+|^(==+)\\s*|(==+)\\s*$|");
           if (end == -1) {
diff --git a/src/com/hughes/android/dictionary/parser/WikiParserTest.java b/src/com/hughes/android/dictionary/parser/WikiParserTest.java
index a8d4be8..922b0bc 100644
--- a/src/com/hughes/android/dictionary/parser/WikiParserTest.java
+++ b/src/com/hughes/android/dictionary/parser/WikiParserTest.java
@@ -1,5 +1,8 @@
 package com.hughes.android.dictionary.parser;
 
+import java.util.List;
+import java.util.Map;
+
 import junit.framework.TestCase;
 
 public class WikiParserTest extends TestCase {
@@ -12,14 +15,18 @@ public class WikiParserTest extends TestCase {
       "multi-line" + "\n" +
       "# comment -->" + "\n" +
       "" + "\n" +
+      "asdf\n" + 
       "# li" + "\n" +
       "# li2" + "\n" +
       "## li2.2" + "\n" +
       "Hi again." + "\n" +
+      "[[wikitext]]:[[wikitext]]" + "\n" +  // don't want this to trigger a list
       "here's [[some blah|some]] wikitext." + "\n" +
       "here's a {{template|blah=2|blah2=3|" + "\n" +
       "blah3=3}} and some more text." + "\n" +
       "== Header 2 ==" + "\n" +
+//      "==== Header 4 ====" + "\n" +
+//      "===== Header 5 =====" + "\n" +
       "=== {{header-template}} ===" + "\n";
     
     final String expected = "Hi Hello thad you're \n" +
@@ -30,16 +37,18 @@ public class WikiParserTest extends TestCase {
         "# comment \n" +
         "\n" +
         "\n" +
+        " asdf\n" +
         "# li\n" +
-        " # li2\n" +
-        " ## li2.2\n" +
-        " Hi again. here's [[some]] wikitext. here's a \n" +
-        "template:template\n" +
-        " and some more text. \n" +
+        "# li2\n" +
+        "## li2.2\n" +
+        "\n" +
+        " Hi again. [[wikitext]]:[[wikitext]] here's [[some]] wikitext. here's a \n" +
+        "template:[template]{blah=2, blah2=3, blah3=3}\n" +
+        " and some more text.\n" +
         "HEADER   Header 2 \n" +
-        " \n" +
+        "\n" +
         "HEADER    \n" +
-        "template:header-template\n" +
+        "template:[header-template]{}\n" +
         " \n" +
         " ";
     final PrintWikiCallback callback = new PrintWikiCallback();
@@ -73,8 +82,8 @@ public class WikiParserTest extends TestCase {
     }
 
     @Override
-    public void onTemplate(String[][] args) {
-      builder.append("\ntemplate:").append(args[0][0]).append("\n");
+    public void onTemplate(final List positionalArgs, final Map namedArgs) {  // Records one template callback in the transcript.
+      builder.append("\ntemplate:").append(positionalArgs).append(namedArgs).append("\n");  // Rendered via toString(), e.g. "[template]{blah=2, blah2=3, blah3=3}".
     }
 
     @Override
@@ -107,12 +116,12 @@ public class WikiParserTest extends TestCase {
 
     @Override
     public void onListItemStart(String header, int[] section) {
-      builder.append(header);
+      builder.append("\n").append(header);  // The newline now precedes each item header rather than following the item.
     }
 
     @Override
     public void onListItemEnd(String header, int[] section) {
-      builder.append("\n");
+      // Intentionally empty: the separating newline is written by onListItemStart.
     }
 
     @Override
diff --git a/src/com/hughes/android/dictionary/parser/WikiWord.java b/src/com/hughes/android/dictionary/parser/WikiWord.java
index 49806d2..0a1a32b 100644
--- a/src/com/hughes/android/dictionary/parser/WikiWord.java
+++ b/src/com/hughes/android/dictionary/parser/WikiWord.java
@@ -9,7 +9,9 @@ public class WikiWord {
   final int depth;
   
   String language;
-  String pronunciation;
+  
+  final Map accentToPronunciation = new LinkedHashMap();  // Accent label from {{a|...}} ("" when none was given) -> pronunciation text collected for it.
+  StringBuilder currentPronunciation = null;  // Buffer that IPA/SAMPA/enPR/rhymes templates append to; the parser resets it to null at each list item start.
 
   boolean isLang1;
   boolean isLang2;
@@ -24,6 +26,7 @@ public class WikiWord {
 
   static class PartOfSpeech {
     final int depth;
+    final String name;
 
     final List meaning = new ArrayList();
     
@@ -31,8 +34,9 @@ public class WikiWord {
         
     final Map otherSections = new LinkedHashMap();
 
-    public PartOfSpeech(final int depth) {
+    public PartOfSpeech(final int depth, String name) {  // depth: heading depth; name: the heading text that matched partOfSpeechHeader.
       this.depth = depth;
+      this.name = name;
     }
   }
   
-- 
2.43.0