1 package com.hughes.android.dictionary.parser;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.LinkedHashSet;
11 import java.util.regex.Pattern;
13 import javax.xml.parsers.ParserConfigurationException;
14 import javax.xml.parsers.SAXParser;
15 import javax.xml.parsers.SAXParserFactory;
17 import org.xml.sax.Attributes;
18 import org.xml.sax.SAXException;
20 import com.hughes.android.dictionary.engine.DictionaryBuilder;
21 import com.hughes.android.dictionary.engine.IndexBuilder;
22 import com.hughes.android.dictionary.parser.WikiWord.FormOf;
23 import com.hughes.android.dictionary.parser.WikiWord.Translation;
24 import com.hughes.util.ListUtil;
25 import com.hughes.util.StringUtil;
26 import com.sun.tools.internal.ws.wsdl.document.jaxws.Exception;
28 public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
30 static final Pattern partOfSpeechHeader = Pattern.compile(
31 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
32 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
33 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
34 "Ligature|Idiom|Phrase|" +
35 // These are @deprecated:
36 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
37 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
38 // These are extras I found:
39 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
40 "Particle|Interjection|Pronominal adverb" +
41 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
// Matches wiki link brackets ("[[" and "]]") and any run of two or more
// apostrophes. onTemplate uses it to strip that markup from pronunciations.
43 static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
// Passed to WikiWord.wikiWordToQuickDic when a finished page is flushed in endPage.
45 final DictionaryBuilder dictBuilder;
// Array copy of dictBuilder.indexBuilders, taken once in the constructor.
47 final IndexBuilder[] indexBuilders;
// The constructor checks that there are exactly two patterns; index 0 and 1 are
// matched against section headings and translation language labels.
48 final Pattern[] langPatterns;
// Passed through unchanged to WikiWord.wikiWordToQuickDic.
49 final int enIndexBuilder;
// Per-page accumulators; both are re-created when a <page> element starts.
51 StringBuilder titleBuilder;
52 StringBuilder textBuilder;
// Builder that characters() appends to; null means character data is discarded.
53 StringBuilder currentBuilder = null;
// Convenience overload: delegates to assertTrue(condition, message) with an
// empty message.
55 static void assertTrue(final boolean condition) {
56 assertTrue(condition, "");
// Soft assertion: reports the failure on stderr instead of throwing.
// NOTE(review): the line that tests `condition` is not visible in this view —
// confirm the report below is only emitted when the condition is false.
59 static void assertTrue(final boolean condition, final String message) {
61 System.err.println("Assertion failed, message: " + message);
// The exception is only created to print the current stack; it is not thrown
// in the visible lines.
62 new RuntimeException().printStackTrace(System.err);
// Creates a parser that feeds the given dictionary builder.
//   dictBuilder     - destination builder; its indexBuilders list is copied to an array.
//   langPatterns    - language-name patterns; exactly two are expected.
//   enIndexBuilder  - stored as-is and later handed to WikiWord.wikiWordToQuickDic.
66 public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) {
// Soft check only: assertTrue logs to stderr, so construction is not aborted
// by the visible code when the length is wrong.
67 assertTrue(langPatterns.length == 2);
68 this.dictBuilder = dictBuilder;
69 this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
70 this.langPatterns = langPatterns;
71 this.enIndexBuilder = enIndexBuilder;
// SAX callback for an opening tag. Selects which builder (if any) will receive
// the character data that follows, based on the element's qualified name.
75 public void startElement(String uri, String localName, String qName,
76 Attributes attributes) {
// Default: ignore character data unless this is a <title> or <text> element.
77 currentBuilder = null;
78 if ("page".equals(qName)) {
// A new page starts: reset the per-page accumulators.
79 titleBuilder = new StringBuilder();
81 // Start with "\n" to better match certain strings.
82 textBuilder = new StringBuilder("\n");
83 } else if ("title".equals(qName)) {
84 currentBuilder = titleBuilder;
85 } else if ("text".equals(qName)) {
86 currentBuilder = textBuilder;
// SAX callback for character data: appends the given slice of ch to the
// builder chosen by startElement, or drops it when no builder is selected.
91 public void characters(char[] ch, int start, int length) throws SAXException {
92 if (currentBuilder != null) {
93 currentBuilder.append(ch, start, length);
// SAX callback for a closing tag. Stops collecting character data.
// NOTE(review): the statement executed for a closing "page" element is not
// visible in this view — presumably it calls endPage(); confirm.
98 public void endElement(String uri, String localName, String qName)
100 currentBuilder = null;
101 if ("page".equals(qName)) {
// Parses the given XML file with a default-configured SAX parser, using this
// object as the handler. Parser configuration, SAX and I/O errors propagate
// to the caller.
107 public void parse(final File file) throws ParserConfigurationException,
108 SAXException, IOException {
109 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
110 parser.parse(file, this);
// Processes the page whose title and text have been accumulated: runs the wiki
// parser over the text with this object as callback, then converts every
// collected WikiWord into dictionary entries.
// NOTE(review): several short lines (e.g. the counter increment, the body of
// the namespace check, and the `try {`) are not visible in this view.
114 private void endPage() {
115 title = titleBuilder.toString();
// Progress output every 1000 pages.
117 if (pageCount % 1000 == 0) {
118 System.out.println("pageCount=" + pageCount);
// Titles in these namespaces are singled out; the action taken is not visible
// here — presumably the page is skipped. TODO confirm.
120 if (title.startsWith("Wiktionary:") ||
121 title.startsWith("Template:") ||
122 title.startsWith("Appendix:") ||
123 title.startsWith("Category:") ||
124 title.startsWith("Index:") ||
125 title.startsWith("MediaWiki:") ||
126 title.startsWith("TransWiki:") ||
127 title.startsWith("Citations:") ||
128 title.startsWith("Concordance:") ||
129 title.startsWith("Help:")) {
// Reset heading state before parsing this page's text.
134 currentHeading = null;
135 insidePartOfSpeech = false;
136 // System.err.println("Working on page: " + title);
138 WikiParser.parse(textBuilder.toString(), this);
// Any failure while parsing a page is logged with the page title; the visible
// code does not rethrow it.
139 } catch (Throwable e) {
140 System.err.println("Failure on page: " + title);
141 e.printStackTrace(System.err);
// Emit every word collected from the page.
// NOTE(review): no clearing of `words` is visible here — confirm it is reset
// between pages.
144 for (final WikiWord word : words) {
145 word.wikiWordToQuickDic(dictBuilder, enIndexBuilder);
151 // ------------------------------------------------------------------------
152 // ------------------------------------------------------------------------
153 // ------------------------------------------------------------------------
154 // ------------------------------------------------------------------------
157 * Two things can happen:
159 * We can be in a ==German== section. There we will see English definitions.
160 * Each POS should get its own QuickDic entry. Pretty much everything goes
163 * Or we can be in an ==English== section with English definitions
164 * and maybe see translations for languages we care about.
166 * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
167 * into separate QuickDic entries, but that's tricky--how do we know when we
168 * found a subsection? Just ignore anything containing pronunciation and
171 * How do we decide when to seal the deal on an entry?
173 * Would be nice if the parser told us about leaving sections....
// Text of the most recently completed heading; cleared when a heading starts.
179 String currentHeading;
// Words created while parsing; endPage converts each of them.
181 final List<WikiWord> words = new ArrayList<WikiWord>();
// Word for the language section being parsed, set in onHeadingEnd.
182 WikiWord currentWord;
// Part-of-speech subsection being parsed, or null when outside one.
183 WikiWord.PartOfSpeech currentPartOfSpeech;
// Translation sense that translation list items are added to, or null.
184 WikiWord.TranslationSense currentTranslationSense;
// True right after a heading matching partOfSpeechHeader has been seen.
185 boolean insidePartOfSpeech;
// Collects the text of the heading or list item in progress; the callbacks
// test it against null before appending in most places.
187 StringBuilder wikiBuilder = null;
// Callback for a wiki link: appends the last element of args to the text
// being collected.
// NOTE(review): the body of the null check is not visible in this view —
// presumably it returns early when no text is being collected; confirm.
190 public void onWikiLink(String[] args) {
191 if (wikiBuilder == null) {
194 wikiBuilder.append(args[args.length - 1]);
// Templates whose positional arguments after the name are appended verbatim,
// comma-separated (see the useRemainingArgTemplates branch in onTemplate).
197 // ttbc: translations to be checked.
198 static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
199 "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
200 "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
201 "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx"));
// Templates that onTemplate recognises by name and gives no visible output for.
202 static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g"));
// Templates rendered as their own name followed by "." in onTemplate.
203 static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf", "pf.", "indeclinable"));
// Templates rendered as their bare name in onTemplate.
// NOTE(review): the single entry looks like a placeholder — confirm intent.
204 static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
// Callback for one template invocation. positionalArgs.get(0) is used as the
// template name; both argument collections are modified in place.
// NOTE(review): this view omits short lines (closing braces, `return;`,
// short `if`s), so the nesting and fall-through between the sections below
// cannot be confirmed from here. Comments describe the visible lines only.
207 public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
208 if (positionalArgs.isEmpty()) {
209 // This happens very rarely with special templates.
212 final String name = positionalArgs.get(0);
// These named arguments are discarded before any template-specific handling.
214 namedArgs.remove("lang");
215 namedArgs.remove("nocat");
216 namedArgs.remove("nocap");
217 namedArgs.remove("sc");
// ---- Pronunciation handling (needs a current word) ----
220 if (currentWord != null) {
// "a": start a new pronunciation buffer, keyed by the second positional argument.
221 if (name.equals("a")) {
223 currentWord.currentPronunciation = new StringBuilder();
224 currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
228 if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) {
// "lang" was already removed above; this second removal has no further effect.
229 namedArgs.remove("lang");
// Named arguments keyed "0".."99" are moved onto the positional list.
// NOTE(review): a null check on `pron` is presumably on a line missing from
// this view — confirm null values are not appended.
230 for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
231 final String pron = namedArgs.remove("" + i);
233 positionalArgs.add(pron);
240 if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) {
241 System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString());
// No preceding "a" template: fall back to a buffer stored under the empty key.
243 if (currentWord.currentPronunciation == null) {
244 currentWord.currentPronunciation = new StringBuilder();
245 currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
// Separate this template's output from earlier pronunciations in the same buffer.
247 if (currentWord.currentPronunciation.length() > 0) {
248 currentWord.currentPronunciation.append("; ");
250 for (int i = 1; i < positionalArgs.size(); ++i) {
252 currentWord.currentPronunciation.append(",");
// NOTE(review): this reads positionalArgs.get(1) on every iteration although the
// loop variable is i, so the first pronunciation would be repeated for each
// argument. It looks like get(i) was intended — confirm and fix.
254 final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
// The trailing append("") adds nothing.
255 currentWord.currentPronunciation.append(pron).append("");
// Tag the pronunciation with the notation (template name) it was written in.
257 currentWord.currentPronunciation.append(" (").append(name).append(")");
// "qualifier": rendered as " (text)" using the second positional argument.
261 if (name.equals("qualifier")) {
262 //assertTrue(positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString());
263 if (wikiBuilder == null) {
266 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
270 if (name.equals("...")) {
271 // Skipping any elided text for brevity.
// NOTE(review): no null check on wikiBuilder is visible in this branch, unlike
// the qualifier branch above — confirm it cannot be null here.
272 wikiBuilder.append("...");
276 if (passThroughTemplates.contains(name)) {
277 assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs);
278 wikiBuilder.append(name);
282 if (ignoreTemplates.contains(name)) {
// Anything that reaches this point under a Pronunciation heading is reported.
286 if ("Pronunciation".equals(currentHeading)) {
287 System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs);
// ---- Templates inside a part-of-speech section ----
293 if (insidePartOfSpeech) {
// "form of": records a FormOf built from the grammar form (argument 1) and the
// referenced token (argument 2 when there are three arguments, otherwise 3).
296 if (name.equals("form of")) {
297 namedArgs.remove("sc");
298 if (positionalArgs.size() < 3 || positionalArgs.size() > 4) {
299 System.err.println("Invalid form of.");
301 final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3);
302 final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1));
303 currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token));
307 // The fallback plan: append the template!
// Rendered as "{" + arguments + "}"; the visible ", " appends suggest the
// arguments are comma-separated via the `first` flag.
308 if (wikiBuilder != null) {
309 wikiBuilder.append("{");
310 boolean first = true;
311 for (final String arg : positionalArgs) {
313 wikiBuilder.append(", ");
316 wikiBuilder.append(arg);
318 // This one isn't so useful.
319 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
321 wikiBuilder.append(", ");
324 wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue());
326 wikiBuilder.append("}");
329 //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs);
// ---- Translation tables ----
// "trans-top" opens a new translation sense on the current part of speech.
335 if (name.equals("trans-top")) {
336 assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs + title);
// A translation table outside any part-of-speech section is attached to the
// last part of speech recorded for the current word.
338 if (currentPartOfSpeech == null) {
339 assertTrue(currentWord != null && !currentWord.partsOfSpeech.isEmpty(), title);
340 System.err.println("Assuming last part of speech for non-nested translation section: " + title);
341 currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
344 currentTranslationSense = new WikiWord.TranslationSense();
345 currentPartOfSpeech.translationSenses.add(currentTranslationSense);
// The optional second positional argument is stored as the sense text.
346 if (positionalArgs.size() > 1) {
347 currentTranslationSense.sense = positionalArgs.get(1);
// ---- Inline templates rendered into the text being collected ----
352 if (wikiBuilder == null) {
// Gender markers: rendered as "{" + any extra arguments (comma-separated) + name + "}".
355 if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
356 assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs.toString());
357 wikiBuilder.append("{");
358 for (int i = 1; i < positionalArgs.size(); ++i) {
359 wikiBuilder.append(i > 1 ? "," : "");
360 wikiBuilder.append(positionalArgs.get(i));
362 wikiBuilder.append(name).append("}");
364 } else if (name.equals("p")) {
365 assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty());
366 wikiBuilder.append("pl.");
368 } else if (name.equals("s")) {
// The page titled "dobra" is exempted from the argument check.
369 assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"), title);
370 wikiBuilder.append("sg.");
372 } else if (grammarTemplates.contains(name)) {
// This and the checks below use the Java `assert` statement (active only when
// assertions are enabled) rather than the logging assertTrue helper.
373 assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
374 wikiBuilder.append(name).append(".");
376 } else if (name.equals("l")) {
377 // This template is designed to generate a link to a specific language-section on the target page.
// Uses the fourth positional entry when present, otherwise the third.
// NOTE(review): get(2) will throw if fewer than three entries are supplied.
378 wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
// Translation templates: third positional entry is the text, any further
// entries are listed in " {a,b}", and a "tr" named argument is added in parentheses.
380 } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
381 if (positionalArgs.size() > 2) {
382 wikiBuilder.append(positionalArgs.get(2));
384 for (int i = 3; i < positionalArgs.size(); ++i) {
385 wikiBuilder.append(i == 3 ? " {" : ",");
386 wikiBuilder.append(positionalArgs.get(i));
387 wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : "");
389 final String transliteration = namedArgs.remove("tr");
390 if (transliteration != null) {
391 wikiBuilder.append(" (").append(transliteration).append(")");
394 } else if (name.equals("trreq")) {
395 wikiBuilder.append("{{trreq}}");
// NOTE(review): "qualifier" is already handled earlier in this method; if that
// earlier branch returns (its return is not visible here), this branch is
// unreachable — confirm.
397 } else if (name.equals("qualifier")) {
398 //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
399 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
401 } else if (useRemainingArgTemplates.contains(name)) {
402 for (int i = 1; i < positionalArgs.size(); ++i) {
404 wikiBuilder.append(", ");
406 wikiBuilder.append(positionalArgs.get(i));
408 } else if (ignoreTemplates.contains(name)) {
411 } else if (name.equals("initialism")) {
412 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
413 wikiBuilder.append("Initialism");
414 } else if (name.equals("abbreviation")) {
415 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
416 wikiBuilder.append("Abbreviation");
417 } else if (name.equals("acronym")) {
418 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
419 wikiBuilder.append("Acronym");
// Unrecognised templates are only reported while a translation sense is open.
421 if (currentTranslationSense != null) {
422 System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs);
// Callback for plain text: appended to the heading or list item being
// collected, and dropped when nothing is being collected.
428 public void onText(String text) {
429 if (wikiBuilder != null) {
430 wikiBuilder.append(text);
// Callback at the start of a heading of the given depth: begins collecting the
// heading text and leaves any section the new heading closes.
436 public void onHeadingStart(int depth) {
437 wikiBuilder = new StringBuilder();
438 currentDepth = depth;
// A heading at the same or a shallower depth ends the current part of speech.
439 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
440 currentPartOfSpeech = null;
441 insidePartOfSpeech = false;
// NOTE(review): the body of this check is not visible in this view —
// presumably it clears currentWord; confirm.
443 if (currentWord != null && depth <= currentWord.depth) {
447 currentHeading = null;
// Callback at the end of a heading: interprets the collected heading text as a
// language section, a part-of-speech section or a Translations section.
// NOTE(review): short lines (closing braces, early returns) are missing from
// this view, so the exact fall-through between the sections is unconfirmed.
451 public void onHeadingEnd(int depth) {
452 final String name = wikiBuilder.toString().trim();
454 currentTranslationSense = null;
455 currentHeading = name;
// Language headings: English, Translingual, or a full match of either pattern.
457 final boolean lang0 = langPatterns[0].matcher(name).matches();
458 final boolean lang1 = langPatterns[1].matcher(name).matches();
459 if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) {
460 currentWord = new WikiWord(title, depth);
461 if (lang0 && lang1) {
462 System.err.println("Word is indexed in both index1 and index2: " + title);
464 currentWord.language = name;
// Index 0 wins when both patterns match; -1 means neither matched.
465 currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1);
466 words.add(currentWord);
470 if (currentWord == null) {
474 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
475 currentPartOfSpeech = null;
// Open a new part of speech only when none is active and the heading matches
// partOfSpeechHeader in full.
478 insidePartOfSpeech = false;
479 if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) {
480 currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
481 currentWord.partsOfSpeech.add(currentPartOfSpeech);
482 insidePartOfSpeech = true;
// A Translations heading is reported as unexpected unless there is a current
// word whose language is exactly "English" and a current part of speech.
486 if (name.equals("Translations")) {
487 if (currentWord == null ||
488 !currentWord.language.equals("English") ||
489 currentPartOfSpeech == null) {
490 System.err.println("Unexpected Translations section: " + title);
// This sense is not added to the part of speech here; onListItemEnd adds it
// when the part of speech has no translation senses yet.
493 currentTranslationSense = new WikiWord.TranslationSense();
// Callback at the start of a list item: begins collecting the item's text and
// clears the current word's active pronunciation buffer.
499 public void onListItemStart(String header, int[] section) {
500 wikiBuilder = new StringBuilder();
501 if (currentWord != null) {
502 currentWord.currentPronunciation = null;
// Callback at the end of a list item. Inside a part-of-speech section the list
// marker (`header`) decides whether the item is a meaning, an example source,
// an example, an English example, or is skipped. Inside a translation section
// the item is split at the first ':' into a language label and a translation.
// NOTE(review): short lines (closing braces, early returns, `} else {`) are
// missing from this view, so the exact control flow is unconfirmed.
508 public void onListItemEnd(String header, int[] section) {
509 String item = wikiBuilder.toString().trim();
// NOTE(review): oldItem is not read anywhere in the visible lines.
510 final String oldItem = item;
511 if (item.length() == 0) {
514 item = WikiParser.simpleParse(item);
518 if (insidePartOfSpeech) {
519 assert currentPartOfSpeech != null : title + item;
// Markers that start a new meaning.
520 if (header.equals("#") ||
521 header.equals("##") ||
522 header.equals("###") ||
523 header.equals("####") ||
524 header.equals(":#") ||
525 header.equals("::") ||
526 header.equals(":::*")) {
528 // :: should append, probably.
529 currentPartOfSpeech.newMeaning().meaning = item;
// Markers whose item becomes the source of a new example.
532 } else if (header.equals("#*") ||
533 header.equals("##*") ||
534 header.equals("###*")) {
535 currentPartOfSpeech.lastMeaning().newExample().source = item;
// Markers whose item is appended as the text of a new example.
// NOTE(review): "#*:" is listed twice in this condition.
538 } else if (header.equals("#:") ||
539 header.equals("#*:") ||
540 header.equals("#:*") ||
541 header.equals("##:") ||
542 header.equals("##*:") ||
543 header.equals("#:*:") ||
544 header.equals("#:*#") ||
545 header.equals("#*:") ||
546 header.equals("*:") ||
547 header.equals("#:::") ||
548 header.equals("#**") ||
549 header.equals("#*:::") ||
550 header.equals("#:#") ||
551 header.equals(":::") ||
552 header.equals("##:*") ||
553 header.equals("###*:")) {
554 StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item);
556 // Example in English
// Appended to the most recent example rather than a new one.
557 } else if (header.equals("#::") ||
558 header.equals("#*::") ||
559 header.equals("#:**") ||
560 header.equals("#*#") ||
561 header.equals("##*::")) {
562 StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item);
// Markers whose items are logged and skipped.
// NOTE(review): "#**" already appears in the example branch above, so its
// occurrence here can never match; "#*:*" is listed twice in this condition.
565 } else if (header.equals("*") ||
566 header.equals("**") ||
567 header.equals("***") ||
568 header.equals("*#") ||
569 header.equals(":") ||
570 header.equals("::*") ||
571 header.equals("#**") ||
572 header.equals(":*") ||
573 header.equals("#*:*") ||
574 header.equals("#*:**") ||
575 header.equals("#*:#") ||
576 header.equals("#*:*:") ||
577 header.equals("#*:*") ||
578 header.equals(";")) {
579 // might have: * {{seeCites}}
580 // * [[w:Arabic numerals|Arabic numerals]]: 2
581 //assert item.trim().length() == 0;
582 System.err.println("Skipping meaning: " + header + " " + item);
// NOTE(review): special case for the page titled "Yellowknife"; its body is
// not visible in this view.
584 if (title.equals("Yellowknife")) {
587 System.err.println("Busted heading: " + title + " "+ header + " " + item);
// ---- Translation list items ----
594 if (currentTranslationSense != null) {
// NOTE(review): this literal differs from the "{{trreq}}" text that onTemplate
// appends — confirm it matches what WikiParser.simpleParse actually produces.
595 if (item.indexOf("{{[trreq]{}}}") != -1) {
// Registers the sense created by a Translations heading if none is registered yet.
599 if (currentPartOfSpeech.translationSenses.isEmpty()) {
600 currentPartOfSpeech.translationSenses.add(currentTranslationSense);
// Split "Language: translation" at the first colon.
603 final int colonPos = item.indexOf(':');
604 if (colonPos == -1) {
605 System.err.println("Invalid translation: title=" + title + ", item=" + item);
608 final String lang = item.substring(0, colonPos);
609 final String trans = item.substring(colonPos + 1).trim();
// Uses find() (substring match) here, whereas onHeadingEnd uses matches()
// (full match) against the same patterns.
610 for (int i = 0; i < 2; ++i) {
611 if (langPatterns[i].matcher(lang).find()) {
612 currentTranslationSense.translations.get(i).add(new Translation(lang, trans));
// Presumably a WikiCallback hook for a line break; no handling is visible in
// this view. TODO confirm the body is empty.
619 public void onNewLine() {
// Presumably a WikiCallback hook for a paragraph break; no handling is visible
// in this view. TODO confirm the body is empty.
623 public void onNewParagraph() {
626 // ----------------------------------------------------------------------
// Presumably a WikiCallback hook for a wiki comment; no use of `text` is
// visible in this view. TODO confirm the body is empty.
629 public void onComment(String text) {
// Presumably a WikiCallback hook for a bold toggle; no handling is visible in
// this view. TODO confirm the body is empty.
633 public void onFormatBold(boolean boldOn) {
// Presumably a WikiCallback hook for an italic toggle; no handling is visible
// in this view. TODO confirm the body is empty.
637 public void onFormatItalic(boolean italicOn) {
// Reports an unterminated construct on stderr together with the current page
// title; the visible code takes no other action.
641 public void onUnterminated(String start, String rest) {
642 System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest);
// Fails hard on an invalid header end by throwing a RuntimeException whose
// message is `rest`. endPage catches Throwable around the parse call, so the
// failure is logged per page there.
645 public void onInvalidHeaderEnd(String rest) {
646 throw new RuntimeException(rest);