]> gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java
go
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / EnWiktionaryXmlParser.java
index 677b5eef215750274b70886aa5618d60f794f708..51d63c8025a2744816ff4bdd1339e2f5c489d258 100644 (file)
@@ -3,7 +3,11 @@ package com.hughes.android.dictionary.parser;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.regex.Pattern;
 
 import javax.xml.parsers.ParserConfigurationException;
@@ -15,8 +19,21 @@ import org.xml.sax.SAXException;
 
 import com.hughes.android.dictionary.engine.DictionaryBuilder;
 import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.parser.WikiWord.TranslationSection;
 
 public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+  
+  static final Pattern partOfSpeechHeader = Pattern.compile(
+      "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+      "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+      "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+      "Ligature|Idiom|Phrase|" +
+      // These are @deprecated:
+      "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+      "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
+
+  static final Pattern wikiMarkup =  Pattern.compile("\\[\\[|\\]\\]|''+");
+
 
   final DictionaryBuilder dict;
   
@@ -77,9 +94,31 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
     title = titleBuilder.toString();
     currentDepth = 0;
     words.clear();
+    currentHeading = null;
     WikiParser.parse(textBuilder.toString(), this);
+
+   for (final WikiWord word : words) {
+     System.out.println("\n" + title + ", " + word.language + ", pron=" + word.accentToPronunciation);
+     if (word.partsOfSpeech.isEmpty() && title.indexOf(":") == -1) {
+       System.err.println("Word with no POS: " + title);
+     }
+     for (final WikiWord.PartOfSpeech partOfSpeech : word.partsOfSpeech) {
+       System.out.println("  pos: " + partOfSpeech.name);
+       
+       for (final TranslationSection translationSection : partOfSpeech.translationSections) {
+         System.out.println("    sense: " + translationSection.sense);
+         
+       }
+     }
+   }
   }
+
   
+  // ------------------------------------------------------------------------
+  // ------------------------------------------------------------------------
+  // ------------------------------------------------------------------------
+  // ------------------------------------------------------------------------
+
   /**
    * Two things can happen:
    * 
@@ -103,6 +142,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
    */
 
   String title;
+  String currentHeading;
   int currentDepth;
   final List<WikiWord> words = new ArrayList<WikiWord>();
   WikiWord currentWord;
@@ -111,22 +151,111 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
   
   StringBuilder wikiBuilder = null;
   
-  // ------------------------------------------------------------------------
-
   @Override
   public void onWikiLink(String[] args) {
-    if (wikiBuilder != null) {
-      wikiBuilder.append(args[args.length - 1]);
+    if (wikiBuilder == null) {  // No text buffer is open, so the link text has nowhere to go.
+      return;
     }
+    wikiBuilder.append(args[args.length - 1]);  // Only the last element of args is kept.
   }
+  
+  // ttbc: translations to be checked.
+  static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
+      "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", 
+      "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", 
+      "zh-tsp", "zh-zh-p"));
+  static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList(""));
+  static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf"));
 
   @Override
-  public void onTemplate(String[][] args) {
-    final String name = args[0][1];
+  public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
+    final String name = positionalArgs.get(0);
+
+    // Pronunciation
+    if (name.equals("a")) {
+      // accent tag
+      currentWord.currentPronunciation = new StringBuilder();
+      currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
+      return;
+    }
+    if (name.equals("IPA") || name.equals("SAMPA") || name.equals("enPR") || name.equals("rhymes")) {
+      namedArgs.remove("lang");
+      assert positionalArgs.size() >= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); 
+      if (currentWord.currentPronunciation == null) {
+        currentWord.currentPronunciation = new StringBuilder();
+        currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
+      }
+      currentWord.currentPronunciation.append(name).append(": ");
+      for (int i = 1; i < positionalArgs.size(); ++i) {
+        if (i > 1) {
+          currentWord.currentPronunciation.append(", ");
+        }
+        final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
+        currentWord.currentPronunciation.append(pron).append("");
+      }
+      return;
+    }
+    if (name.equals("audio")) {
+      return;
+    }
+    if ("Pronunciation".equals(currentHeading)) {
+      System.err.println("Unhandled template: " + name);
+    }
+
+    // Translations
+    if (name.equals("trans-top")) {
+      assert positionalArgs.size() == 2 && namedArgs.isEmpty();
+      currentTranslationSection = new WikiWord.TranslationSection();
+      currentPartOfSpeech.translationSections.add(currentTranslationSection);
+      if (positionalArgs.size() > 1) {
+        currentTranslationSection.sense = positionalArgs.get(1);
+      }
+      return;
+    }
+
+    if (wikiBuilder == null) {
+      return;
+    }    
     if (name == "") {
-      
+    } else  if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
+      wikiBuilder.append("{").append(name).append("}");
+    } else  if (name.equals("p")) {
+      wikiBuilder.append("pl.");
+    } else  if (name.equals("s")) {
+      wikiBuilder.append("sg.");
+    } else  if (grammarTemplates.contains(name)) {
+      wikiBuilder.append(name).append(".");
+    } else  if (name.equals("l")) {
+      wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
+    } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
+      if (positionalArgs.size() >= 2) {
+        wikiBuilder.append(positionalArgs.get(1));
+      }
+      if (positionalArgs.size() >= 3) {
+        wikiBuilder.append(" {").append(positionalArgs.get(1)).append("}");
+      }
+      final String transliteration = namedArgs.remove("tr");
+      if (transliteration != null) {
+        wikiBuilder.append(" (").append(transliteration).append(")");
+      }
+    } else  if (name.equals("trreq")) {
+      wikiBuilder.append("{{trreq}}");
+    } else if (name.equals("qualifier")) {
+      wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
+    } else if (useRemainingArgTemplates.contains(name)) {
+      for (int i = 1; i < positionalArgs.size(); ++i) {
+        if (i != 1) {
+          wikiBuilder.append(", ");
+        }
+        wikiBuilder.append(positionalArgs.get(i));
+      }
+    } else if (ignoreTemplates.contains(name)) {
+    } else if (name.equals("initialism")) {
+      wikiBuilder.append("Initialism");
     } else {
-      //System.out.println("Unhandled template: " + name);
+      if (currentTranslationSection != null) {
+        System.err.println("Unhandled template: " + name);
+      }
     }
   }
 
@@ -150,19 +279,12 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
     }
   }
   
-  final Pattern partOfSpeechHeader = Pattern.compile(
-      "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
-      "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
-      "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
-      "Ligature|Idiom|Phrase|" +
-      // These are @deprecated:
-      "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
-      "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
-
   @Override
   public void onHeadingEnd(int depth) {
     final String name = wikiBuilder.toString().trim();
     wikiBuilder = null;
+    currentTranslationSection = null;
+    currentHeading = name;
     
     final boolean lang1 = langPatterns[0].matcher(name).matches();
     final boolean lang2 = langPatterns[1].matcher(name).matches();
@@ -180,7 +302,7 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
     }
     
     if (partOfSpeechHeader.matcher(name).matches()) {
-      currentPartOfSpeech = new WikiWord.PartOfSpeech(depth);
+      currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
       currentWord.partsOfSpeech.add(currentPartOfSpeech);
       return;
     }
@@ -194,14 +316,27 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
       }
       currentTranslationSection = new WikiWord.TranslationSection();
       currentPartOfSpeech.translationSections.add(currentTranslationSection);
-    } else {
-      currentTranslationSection = null;
     }
+    
+    if (name.equals("Translations")) {
+      if (currentWord == null || 
+          !currentWord.language.equals("English") || 
+          currentPartOfSpeech == null) {
+        System.out.println("Unexpected Translations section: " + title);
+        return;
+      }
+      currentTranslationSection = new WikiWord.TranslationSection();
+      currentPartOfSpeech.translationSections.add(currentTranslationSection);
+    }
+
   }
 
   @Override
   public void onListItemStart(String header, int[] section) {
     wikiBuilder = new StringBuilder();
+    if (currentWord != null) {  // Clear the pronunciation buffer so this list item starts without one.
+      currentWord.currentPronunciation = null;
+    }
   }
   
 
@@ -210,10 +345,14 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
     final String item = wikiBuilder.toString();
     wikiBuilder = null;
     
+    if (item.indexOf("{{trreq}}") != -1) {
+      return;
+    }
+    
     if (currentTranslationSection != null) {
       final int colonPos = item.indexOf(':');
       if (colonPos == -1) {
-        System.out.println("Invalid translation: " + item);
+        System.err.println("Invalid translation: " + item);
         return;
       }
       final String lang = item.substring(0, colonPos);
@@ -236,18 +375,6 @@ public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler im
 
   // ----------------------------------------------------------------------
   
-  public void onTransTrop(final String[][] args) {
-    currentTranslationSection = new WikiWord.TranslationSection();
-    currentPartOfSpeech.translationSections.add(currentTranslationSection);
-    
-    if (args.length > 1) {
-      currentTranslationSection.sense = args[1][1];
-    }
-  }
-
-  
-  // ----------------------------------------------------------------------
-
   @Override
   public void onComment(String text) {
   }