import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import com.hughes.android.dictionary.engine.DictionaryBuilder;
import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.parser.WikiWord.TranslationSection;
public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+
// Alternation of the heading names treated as part-of-speech sections.
// onHeadingEnd applies it with matches(), so the trimmed heading text must
// equal one of these alternatives exactly (case-sensitive, whole string).
+ static final Pattern partOfSpeechHeader = Pattern.compile(
+ "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+ "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+ "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+ "Ligature|Idiom|Phrase|" +
+ // These are @deprecated:
+ "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+ "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
+
// Matches "[[", "]]", or a run of two or more apostrophes. onTemplate uses
// it to strip these sequences out of pronunciation arguments.
+ static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
+
final DictionaryBuilder dict;
title = titleBuilder.toString();
currentDepth = 0;
words.clear();
+ currentHeading = null;
WikiParser.parse(textBuilder.toString(), this);
+
+ for (final WikiWord word : words) {
+ System.out.println("\n" + title + ", " + word.language + ", pron=" + word.accentToPronunciation);
+ if (word.partsOfSpeech.isEmpty() && title.indexOf(":") == -1) {
+ System.err.println("Word with no POS: " + title);
+ }
+ for (final WikiWord.PartOfSpeech partOfSpeech : word.partsOfSpeech) {
+ System.out.println(" pos: " + partOfSpeech.name);
+
+ for (final TranslationSection translationSection : partOfSpeech.translationSections) {
+ System.out.println(" sense: " + translationSection.sense);
+
+ }
+ }
+ }
}
+
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+
/**
* Two things can happen:
*
*/
String title;
+ String currentHeading;
int currentDepth;
final List<WikiWord> words = new ArrayList<WikiWord>();
WikiWord currentWord;
StringBuilder wikiBuilder = null;
- // ------------------------------------------------------------------------
-
/**
 * Handles a wiki link by appending the last element of {@code args} to the
 * text currently being accumulated in {@code wikiBuilder}. Does nothing when
 * no text is being accumulated ({@code wikiBuilder} is null).
 *
 * NOTE(review): presumably the last element is the link's display text;
 * confirm against WikiParser.
 *
 * @param args the link's arguments; only the last element is used
 */
@Override
public void onWikiLink(String[] args) {
- if (wikiBuilder != null) {
- wikiBuilder.append(args[args.length - 1]);
+ if (wikiBuilder == null) {
+ return;
}
+ wikiBuilder.append(args[args.length - 1]);
}
+
// Template names for which onTemplate appends every positional argument
// after the name to the output text, separated by ", ".
+ // ttbc: translations to be checked.
+ static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
+ "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
+ "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
+ "zh-tsp", "zh-zh-p"));
// Template names for which onTemplate emits nothing. Currently the only
// entry is the empty string.
+ static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList(""));
// Template names that onTemplate writes out as the name followed by ".".
+ static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf"));
/**
 * Handles one template invocation.
 *
 * Processing happens in three stages:
 * 1. Pronunciation templates ("a", "IPA", "SAMPA", "enPR", "rhymes", "audio")
 *    update the current word's pronunciation builders and return.
 * 2. "trans-top" opens a new translation section on the current part of
 *    speech and returns.
 * 3. All remaining templates only produce output when text is being
 *    accumulated (wikiBuilder is non-null); they append a textual rendering
 *    of the template to wikiBuilder.
 *
 * @param positionalArgs the template's positional arguments; element 0 is
 *        the template name
 * @param namedArgs the template's named arguments; this method removes the
 *        "lang" entry for pronunciation templates and the "tr" entry for
 *        the t/t+/t-/tø templates, so the map is mutated
 */
@Override
- public void onTemplate(String[][] args) {
- final String name = args[0][1];
+ public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
+ final String name = positionalArgs.get(0);
+
+ // Pronunciation
+ if (name.equals("a")) {
+ // accent tag
// Start a fresh builder keyed by the accent; following pronunciation
// templates append to it until it is reset.
+ currentWord.currentPronunciation = new StringBuilder();
+ currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
+ return;
+ }
+ if (name.equals("IPA") || name.equals("SAMPA") || name.equals("enPR") || name.equals("rhymes")) {
+ namedArgs.remove("lang");
+ assert positionalArgs.size() >= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
// No preceding accent tag: file the pronunciation under the empty accent.
+ if (currentWord.currentPronunciation == null) {
+ currentWord.currentPronunciation = new StringBuilder();
+ currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
+ }
+ currentWord.currentPronunciation.append(name).append(": ");
+ for (int i = 1; i < positionalArgs.size(); ++i) {
+ if (i > 1) {
+ currentWord.currentPronunciation.append(", ");
+ }
// Read argument i (not always argument 1) so that each listed
// pronunciation is emitted once instead of repeating the first.
+ final String pron = wikiMarkup.matcher(positionalArgs.get(i)).replaceAll("");
+ currentWord.currentPronunciation.append(pron);
+ }
+ return;
+ }
+ if (name.equals("audio")) {
+ return;
+ }
// Anything else seen under a Pronunciation heading is reported, then
// still falls through to the generic handling below.
+ if ("Pronunciation".equals(currentHeading)) {
+ System.err.println("Unhandled template: " + name);
+ }
+
+ // Translations
+ if (name.equals("trans-top")) {
+ assert positionalArgs.size() == 2 && namedArgs.isEmpty();
+ currentTranslationSection = new WikiWord.TranslationSection();
+ currentPartOfSpeech.translationSections.add(currentTranslationSection);
+ if (positionalArgs.size() > 1) {
+ currentTranslationSection.sense = positionalArgs.get(1);
+ }
+ return;
+ }
+
// The remaining templates only render text; skip them when nothing is
// being accumulated.
+ if (wikiBuilder == null) {
+ return;
+ }
// Compare by value: == on Strings tests object identity, not content.
if (name.equals("")) {
-
+ } else if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
+ wikiBuilder.append("{").append(name).append("}");
+ } else if (name.equals("p")) {
+ wikiBuilder.append("pl.");
+ } else if (name.equals("s")) {
+ wikiBuilder.append("sg.");
+ } else if (grammarTemplates.contains(name)) {
+ wikiBuilder.append(name).append(".");
+ } else if (name.equals("l")) {
+ wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
+ } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
// NOTE(review): both appends below read positional argument 1, so the
// same value is written twice when there are three or more arguments.
// The second append presumably should use a later argument; confirm
// against the template's parameter order before changing.
+ if (positionalArgs.size() >= 2) {
+ wikiBuilder.append(positionalArgs.get(1));
+ }
+ if (positionalArgs.size() >= 3) {
+ wikiBuilder.append(" {").append(positionalArgs.get(1)).append("}");
+ }
+ final String transliteration = namedArgs.remove("tr");
+ if (transliteration != null) {
+ wikiBuilder.append(" (").append(transliteration).append(")");
+ }
+ } else if (name.equals("trreq")) {
// Emitted as a literal marker; the list-item handler looks for
// "{{trreq}}" in the accumulated item text.
+ wikiBuilder.append("{{trreq}}");
+ } else if (name.equals("qualifier")) {
+ wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
+ } else if (useRemainingArgTemplates.contains(name)) {
+ for (int i = 1; i < positionalArgs.size(); ++i) {
+ if (i != 1) {
+ wikiBuilder.append(", ");
+ }
+ wikiBuilder.append(positionalArgs.get(i));
+ }
+ } else if (ignoreTemplates.contains(name)) {
+ } else if (name.equals("initialism")) {
+ wikiBuilder.append("Initialism");
} else {
- //System.out.println("Unhandled template: " + name);
// Only report unknown templates while inside a translation section.
+ if (currentTranslationSection != null) {
+ System.err.println("Unhandled template: " + name);
+ }
}
}
}
}
- final Pattern partOfSpeechHeader = Pattern.compile(
- "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
- "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
- "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
- "Ligature|Idiom|Phrase|" +
- // These are @deprecated:
- "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
- "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
-
@Override
public void onHeadingEnd(int depth) {
final String name = wikiBuilder.toString().trim();
wikiBuilder = null;
+ currentTranslationSection = null;
+ currentHeading = name;
final boolean lang1 = langPatterns[0].matcher(name).matches();
final boolean lang2 = langPatterns[1].matcher(name).matches();
}
if (partOfSpeechHeader.matcher(name).matches()) {
- currentPartOfSpeech = new WikiWord.PartOfSpeech(depth);
+ currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
currentWord.partsOfSpeech.add(currentPartOfSpeech);
return;
}
}
currentTranslationSection = new WikiWord.TranslationSection();
currentPartOfSpeech.translationSections.add(currentTranslationSection);
- } else {
- currentTranslationSection = null;
}
+
+ if (name.equals("Translations")) {
+ if (currentWord == null ||
+ !currentWord.language.equals("English") ||
+ currentPartOfSpeech == null) {
+ System.out.println("Unexpected Translations section: " + title);
+ return;
+ }
+ currentTranslationSection = new WikiWord.TranslationSection();
+ currentPartOfSpeech.translationSections.add(currentTranslationSection);
+ }
+
}
/**
 * Begins a new list item: starts a fresh text buffer in {@code wikiBuilder}
 * and, if a word is in progress, clears its current pronunciation builder.
 * With the builder cleared, the next pronunciation template handled by
 * onTemplate that is not preceded by an accent template creates a new
 * builder filed under the empty accent key.
 *
 * @param header not used in this method
 * @param section not used in this method
 */
@Override
public void onListItemStart(String header, int[] section) {
wikiBuilder = new StringBuilder();
+ if (currentWord != null) {
+ currentWord.currentPronunciation = null;
+ }
}
final String item = wikiBuilder.toString();
wikiBuilder = null;
+ if (item.indexOf("{{trreq}}") != -1) {
+ return;
+ }
+
if (currentTranslationSection != null) {
final int colonPos = item.indexOf(':');
if (colonPos == -1) {
- System.out.println("Invalid translation: " + item);
+ System.err.println("Invalid translation: " + item);
return;
}
final String lang = item.substring(0, colonPos);
// ----------------------------------------------------------------------
- public void onTransTrop(final String[][] args) {
- currentTranslationSection = new WikiWord.TranslationSection();
- currentPartOfSpeech.translationSections.add(currentTranslationSection);
-
- if (args.length > 1) {
- currentTranslationSection.sense = args[1][1];
- }
- }
-
-
- // ----------------------------------------------------------------------
-
/**
 * Comment callback; the body is empty, so the comment text is discarded.
 *
 * @param text the comment text (unused)
 */
@Override
public void onComment(String text) {
}