1 package com.hughes.android.dictionary.parser;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.LinkedHashSet;
11 import java.util.regex.Pattern;
13 import javax.xml.parsers.ParserConfigurationException;
14 import javax.xml.parsers.SAXParser;
15 import javax.xml.parsers.SAXParserFactory;
17 import org.xml.sax.Attributes;
18 import org.xml.sax.SAXException;
20 import com.hughes.android.dictionary.engine.DictionaryBuilder;
21 import com.hughes.android.dictionary.engine.IndexBuilder;
22 import com.hughes.android.dictionary.parser.WikiWord.FormOf;
23 import com.hughes.android.dictionary.parser.WikiWord.Translation;
24 import com.hughes.util.ListUtil;
25 import com.hughes.util.StringUtil;
27 public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
29 static final Pattern partOfSpeechHeader = Pattern.compile(
30 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
31 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
32 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
33 "Ligature|Idiom|Phrase|" +
34 // These are @deprecated:
35 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
36 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
37 // These are extras I found:
38 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
39 "Particle|Interjection|Pronominal adverb" +
40 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
// Matches the wiki markup tokens "[[", "]]" and any run of two or more
// apostrophes; onTemplate uses it to strip these from pronunciation text.
42 static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
// Destination builder; handed to WikiWord.wikiWordToQuickDic for each parsed word.
44 final DictionaryBuilder dictBuilder;
// Array copy of dictBuilder.indexBuilders, taken in the constructor.
46 final IndexBuilder[] indexBuilders;
// Exactly two patterns (asserted in the constructor); matched against
// section headings and against translation language labels.
47 final Pattern[] langPatterns;
// Passed through unchanged to WikiWord.wikiWordToQuickDic.
48 final int enIndexBuilder;
// Character buffers for the current page's <title> and <text> elements.
50 StringBuilder titleBuilder;
51 StringBuilder textBuilder;
// Buffer that characters() currently appends to; null means "discard".
52 StringBuilder currentBuilder = null;
/**
 * Creates a parser that feeds the given dictionary builder.
 *
 * @param dictBuilder builder whose indexBuilders collection is copied into an array here
 * @param langPatterns language patterns; asserted to contain exactly two entries
 * @param enIndexBuilder stored as-is and later passed to WikiWord.wikiWordToQuickDic
 */
54 public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) {
// Only checked when assertions are enabled (-ea).
55 assert langPatterns.length == 2;
56 this.dictBuilder = dictBuilder;
57 this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
58 this.langPatterns = langPatterns;
59 this.enIndexBuilder = enIndexBuilder;
/**
 * SAX callback for an opening tag. Chooses which buffer, if any, receives
 * the character data that follows.
 */
63 public void startElement(String uri, String localName, String qName,
64 Attributes attributes) {
// Default: character data of any other element is not captured.
65 currentBuilder = null;
66 if ("page".equals(qName)) {
// A new page starts: allocate fresh title and text buffers.
67 titleBuilder = new StringBuilder();
69 // Start with "\n" to better match certain strings.
70 textBuilder = new StringBuilder("\n");
71 } else if ("title".equals(qName)) {
72 currentBuilder = titleBuilder;
73 } else if ("text".equals(qName)) {
74 currentBuilder = textBuilder;
/**
 * SAX callback for character data. Appends the given slice to the buffer
 * selected by startElement, or drops it when no buffer is selected.
 */
79 public void characters(char[] ch, int start, int length) throws SAXException {
80 if (currentBuilder != null) {
81 currentBuilder.append(ch, start, length);
/**
 * SAX callback for a closing tag. Stops capturing character data and
 * gives a closing "page" tag special handling.
 */
86 public void endElement(String uri, String localName, String qName)
88 currentBuilder = null;
// NOTE(review): the body of this branch is not visible here; presumably it
// calls endPage() to process the buffered page. Confirm.
89 if ("page".equals(qName)) {
/**
 * Parses the given XML file with a newly created SAX parser, using this
 * object as the content handler.
 *
 * @param file the XML file to read
 * @throws ParserConfigurationException if the SAX parser cannot be created
 * @throws SAXException if the parser reports an error
 * @throws IOException if the file cannot be read
 */
95 public void parse(final File file) throws ParserConfigurationException,
96 SAXException, IOException {
// NOTE(review): the factory is used with default settings; if the input
// could be untrusted, consider disabling DTDs/external entities.
97 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
98 parser.parse(file, this);
/**
 * Processes the page whose title and text have been buffered: runs the
 * wiki parser over the text with this object as the callback, then
 * converts every collected WikiWord into dictionary entries.
 */
102 private void endPage() {
103 title = titleBuilder.toString();
// Progress output once every 1000 pages.
105 if (pageCount % 1000 == 0) {
106 System.out.println("pageCount=" + pageCount);
// Titles in these namespaces get separate handling from ordinary entries.
// NOTE(review): the branch body is not visible here; presumably it returns
// early to skip the page. Confirm.
108 if (title.startsWith("Wiktionary:") ||
109 title.startsWith("Template:") ||
110 title.startsWith("Appendix:") ||
111 title.startsWith("Category:") ||
112 title.startsWith("Index:") ||
113 title.startsWith("MediaWiki:") ||
114 title.startsWith("TransWiki:") ||
115 title.startsWith("Citations:") ||
116 title.startsWith("Concordance:") ||
117 title.startsWith("Help:")) {
// Reset per-page heading state before parsing.
122 currentHeading = null;
123 insidePartOfSpeech = false;
124 // System.err.println("Working on page: " + title);
126 WikiParser.parse(textBuilder.toString(), this);
// Best effort: any failure on a page is logged with its title and stack trace.
127 } catch (Throwable e) {
128 System.err.println("Failure on page: " + title);
129 e.printStackTrace(System.err);
// Emit whatever words were collected for this page.
132 for (final WikiWord word : words) {
133 word.wikiWordToQuickDic(dictBuilder, enIndexBuilder);
139 // ------------------------------------------------------------------------
140 // ------------------------------------------------------------------------
141 // ------------------------------------------------------------------------
142 // ------------------------------------------------------------------------
145 * Two things can happen:
147 * We can be in a ==German== section. There we will see English definitions.
148 * Each POS should get its own QuickDic entry. Pretty much everything goes
151 * Or we can be in an ==English== section with English definitions
152 * and maybe see translations for languages we care about.
154 * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
155 * into separate QuickDic entries, but that's tricky--how do we know when we
156 * found a subsection? Just ignore anything containing pronunciation and
159 * How do we decide when to seal the deal on an entry?
161 * Would be nice if the parser told us about leaving sections....
// Text of the most recently completed heading; null while a heading is open.
167 String currentHeading;
// Words collected for the current page; endPage() converts them.
169 final List<WikiWord> words = new ArrayList<WikiWord>();
// Word for the language section currently being parsed, if any.
170 WikiWord currentWord;
// Part-of-speech subsection currently being parsed, if any.
171 WikiWord.PartOfSpeech currentPartOfSpeech;
// Translation sense that list items are currently added to, if any.
172 WikiWord.TranslationSense currentTranslationSense;
// True right after a part-of-speech heading, until the next heading.
173 boolean insidePartOfSpeech;
// Accumulates the text of the heading or list item being read; null otherwise.
175 StringBuilder wikiBuilder = null;
/**
 * Wiki-parser callback for a link. Appends the last element of args to the
 * text being accumulated.
 */
178 public void onWikiLink(String[] args) {
// NOTE(review): the body of this guard is not visible here; presumably it
// returns when no text is being accumulated. Confirm.
179 if (wikiBuilder == null) {
182 wikiBuilder.append(args[args.length - 1]);
// Template names for which onTemplate emits every argument after the name,
// joined with ", ".
185 // ttbc: translations to be checked.
186 static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
187 "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
188 "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
189 "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx"));
// Template names that onTemplate checks for in its ignore branches.
190 static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g"));
// Grammar-marker templates; onTemplate emits the name followed by ".".
191 static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf", "pf.", "indeclinable"));
// Templates whose name is emitted verbatim. NOTE(review): the single entry
// looks like a placeholder that never matches a real template. Confirm.
192 static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
/**
 * Wiki-parser callback for a template invocation.
 *
 * Dispatches on the template name (positionalArgs.get(0)): pronunciation
 * templates are recorded on the current word, "form of" and "trans-top"
 * update the current part of speech, and the remaining known templates are
 * rendered as plain text into wikiBuilder.
 *
 * Fix: the pronunciation loop now reads positionalArgs.get(i); it previously
 * read index 1 on every iteration, repeating the first pronunciation.
 *
 * @param positionalArgs template name followed by its positional arguments;
 *        may be mutated (numbered named args are appended for IPA-like templates)
 * @param namedArgs named template arguments; entries are removed as they are consumed
 */
195 public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
196 if (positionalArgs.isEmpty()) {
197 // This happens very rarely with special templates.
200 final String name = positionalArgs.get(0);
// These named arguments carry nothing this parser uses; drop them so the
// namedArgs.isEmpty() checks below are meaningful.
202 namedArgs.remove("lang");
203 namedArgs.remove("nocat");
204 namedArgs.remove("nocap");
205 namedArgs.remove("sc");
// Pronunciation handling, only while a word is being built.
208 if (currentWord != null) {
// {{a|accent}}: start a new pronunciation buffer keyed by the accent.
209 if (name.equals("a")) {
211 currentWord.currentPronunciation = new StringBuilder();
212 currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
216 if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) {
217 namedArgs.remove("lang");
// Numbered named args ("1=", "2=", ...) are treated as positional ones.
218 for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
219 final String pron = namedArgs.remove("" + i);
221 positionalArgs.add(pron);
228 if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) {
229 System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString());
// No preceding accent template: file the pronunciation under the empty accent.
231 if (currentWord.currentPronunciation == null) {
232 currentWord.currentPronunciation = new StringBuilder();
233 currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
235 if (currentWord.currentPronunciation.length() > 0) {
236 currentWord.currentPronunciation.append("; ");
238 for (int i = 1; i < positionalArgs.size(); ++i) {
240 currentWord.currentPronunciation.append(",");
// Use the i-th argument (was hard-coded to index 1) with wiki markup stripped.
242 final String pron = wikiMarkup.matcher(positionalArgs.get(i)).replaceAll("");
243 currentWord.currentPronunciation.append(pron).append("");
// Tag the pronunciation(s) with the notation name, e.g. " (IPA)".
245 currentWord.currentPronunciation.append(" (").append(name).append(")");
249 if (name.equals("qualifier")) {
250 //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
251 if (wikiBuilder == null) {
254 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
258 if (name.equals("...")) {
259 // Skipping any elided text for brevity.
260 wikiBuilder.append("...");
264 if (passThroughTemplates.contains(name)) {
265 assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
266 wikiBuilder.append(name);
270 if (ignoreTemplates.contains(name)) {
274 if ("Pronunciation".equals(currentHeading)) {
275 System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs);
// Templates seen directly under a part-of-speech heading.
281 if (insidePartOfSpeech) {
284 if (name.equals("form of")) {
285 namedArgs.remove("sc");
286 if (positionalArgs.size() < 3 || positionalArgs.size() > 4) {
287 System.err.println("Invalid form of.");
// With three args the token is the last one; with four it is the fourth.
289 final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3);
290 final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1));
291 currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token));
295 // The fallback plan: append the template!
296 if (wikiBuilder != null) {
297 wikiBuilder.append("{");
298 boolean first = true;
299 for (final String arg : positionalArgs) {
301 wikiBuilder.append(", ");
304 wikiBuilder.append(arg);
306 // This one isn't so useful.
307 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
309 wikiBuilder.append(", ");
312 wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue());
314 wikiBuilder.append("}");
317 //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs);
// {{trans-top|sense}} opens a new translation sense on the current part of speech.
323 if (name.equals("trans-top")) {
324 assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs + title;
326 if (currentPartOfSpeech == null) {
327 assert currentWord != null && !currentWord.partsOfSpeech.isEmpty() : title;
328 System.err.println("Assuming last part of speech for non-nested translation section: " + title);
329 currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
332 currentTranslationSense = new WikiWord.TranslationSense();
333 currentPartOfSpeech.translationSenses.add(currentTranslationSense);
334 if (positionalArgs.size() > 1) {
335 currentTranslationSense.sense = positionalArgs.get(1);
340 if (wikiBuilder == null) {
// Gender templates render as "{args...name}".
343 if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
344 assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
345 wikiBuilder.append("{");
346 for (int i = 1; i < positionalArgs.size(); ++i) {
347 wikiBuilder.append(i > 1 ? "," : "");
348 wikiBuilder.append(positionalArgs.get(i));
350 wikiBuilder.append(name).append("}");
352 } else if (name.equals("p")) {
353 assert positionalArgs.size() == 1 && namedArgs.isEmpty();
354 wikiBuilder.append("pl.");
356 } else if (name.equals("s")) {
357 assert positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra");
358 wikiBuilder.append("sg.");
360 } else if (grammarTemplates.contains(name)) {
361 assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
362 wikiBuilder.append(name).append(".");
364 } else if (name.equals("l")) {
365 // This template is designed to generate a link to a specific language-section on the target page.
// NOTE(review): get(2) throws if the template has fewer than three positional args.
366 wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
// Translation templates: emit the word, then extra args in braces, then
// the transliteration ("tr") in parentheses when present.
368 } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
369 if (positionalArgs.size() > 2) {
370 wikiBuilder.append(positionalArgs.get(2));
372 for (int i = 3; i < positionalArgs.size(); ++i) {
373 wikiBuilder.append(i == 3 ? " {" : ",");
374 wikiBuilder.append(positionalArgs.get(i));
375 wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : "");
377 final String transliteration = namedArgs.remove("tr");
378 if (transliteration != null) {
379 wikiBuilder.append(" (").append(transliteration).append(")");
382 } else if (name.equals("trreq")) {
383 wikiBuilder.append("{{trreq}}");
385 } else if (name.equals("qualifier")) {
386 //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
387 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
389 } else if (useRemainingArgTemplates.contains(name)) {
390 for (int i = 1; i < positionalArgs.size(); ++i) {
392 wikiBuilder.append(", ");
394 wikiBuilder.append(positionalArgs.get(i));
396 } else if (ignoreTemplates.contains(name)) {
399 } else if (name.equals("initialism")) {
400 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
401 wikiBuilder.append("Initialism");
402 } else if (name.equals("abbreviation")) {
403 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
404 wikiBuilder.append("Abbreviation");
405 } else if (name.equals("acronym")) {
406 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
407 wikiBuilder.append("Acronym");
// Unknown templates are only reported while inside a translation sense.
409 if (currentTranslationSense != null) {
410 System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs);
/**
 * Wiki-parser callback for plain text. Appends it to the text being
 * accumulated; text arriving while nothing is being accumulated is dropped.
 */
416 public void onText(String text) {
417 if (wikiBuilder != null) {
418 wikiBuilder.append(text);
/**
 * Wiki-parser callback at the start of a heading. Begins accumulating the
 * heading text and closes any section that this heading's depth ends.
 *
 * @param depth nesting depth of the heading being opened
 */
424 public void onHeadingStart(int depth) {
425 wikiBuilder = new StringBuilder();
426 currentDepth = depth;
// A heading at the same or a shallower depth ends the current part of speech.
427 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
428 currentPartOfSpeech = null;
429 insidePartOfSpeech = false;
// NOTE(review): the body of this branch is not visible here; presumably it
// clears currentWord when the heading ends the word's section. Confirm.
431 if (currentWord != null && depth <= currentWord.depth) {
// No heading is current until onHeadingEnd supplies the new name.
435 currentHeading = null;
/**
 * Wiki-parser callback at the end of a heading. Classifies the accumulated
 * heading text as a language section, a part-of-speech section or a
 * "Translations" section and updates the parse state accordingly.
 *
 * @param depth nesting depth of the heading being closed
 */
439 public void onHeadingEnd(int depth) {
440 final String name = wikiBuilder.toString().trim();
442 currentTranslationSense = null;
443 currentHeading = name;
445 final boolean lang0 = langPatterns[0].matcher(name).matches();
446 final boolean lang1 = langPatterns[1].matcher(name).matches();
// Language headings (English, Translingual, or either configured language)
// start a new word for this page.
447 if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) {
448 currentWord = new WikiWord(title, depth);
449 if (lang0 && lang1) {
450 System.err.println("Word is indexed in both index1 and index2: " + title);
452 currentWord.language = name;
// Index 0 or 1 for the matching language pattern (pattern 0 wins), else -1.
453 currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1);
454 words.add(currentWord);
458 if (currentWord == null) {
462 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
463 currentPartOfSpeech = null;
466 insidePartOfSpeech = false;
// A recognized part-of-speech heading opens a new part of speech on the word.
467 if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) {
468 currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
469 currentWord.partsOfSpeech.add(currentPartOfSpeech);
470 insidePartOfSpeech = true;
474 if (name.equals("Translations")) {
// NOTE(review): this comparison is case-sensitive, while the language check
// above uses equalsIgnoreCase("English"). Confirm that is intended.
475 if (currentWord == null ||
476 !currentWord.language.equals("English") ||
477 currentPartOfSpeech == null) {
478 System.err.println("Unexpected Translations section: " + title);
481 currentTranslationSense = new WikiWord.TranslationSense();
/**
 * Wiki-parser callback at the start of a list item. Begins accumulating the
 * item text and clears the current word's active pronunciation buffer.
 */
487 public void onListItemStart(String header, int[] section) {
488 wikiBuilder = new StringBuilder();
489 if (currentWord != null) {
490 currentWord.currentPronunciation = null;
/**
 * Wiki-parser callback at the end of a list item. Inside a part-of-speech
 * section the list marker decides whether the item is a meaning, an example
 * source, an example, or an English rendering of an example. Inside a
 * translation sense, the item is split at the first ':' into a language
 * label and a translation.
 *
 * @param header the list marker of the item (e.g. "#", "#:", "*")
 * @param section not read by the visible code
 */
496 public void onListItemEnd(String header, int[] section) {
497 String item = wikiBuilder.toString().trim();
498 final String oldItem = item;
499 if (item.length() == 0) {
502 item = WikiParser.simpleParse(item);
506 if (insidePartOfSpeech) {
507 assert currentPartOfSpeech != null : title + item;
// Markers that introduce a new meaning.
508 if (header.equals("#") ||
509 header.equals("##") ||
510 header.equals("###") ||
511 header.equals("####") ||
512 header.equals(":#") ||
513 header.equals("::") ||
514 header.equals(":::*")) {
516 // :: should append, probably.
517 currentPartOfSpeech.newMeaning().meaning = item;
// Markers that give the source of an example for the last meaning.
520 } else if (header.equals("#*") ||
521 header.equals("##*") ||
522 header.equals("###*")) {
523 currentPartOfSpeech.lastMeaning().newExample().source = item;
// Markers that carry example text. NOTE(review): "#*:" appears twice in this
// chain; the second occurrence is redundant.
526 } else if (header.equals("#:") ||
527 header.equals("#*:") ||
528 header.equals("#:*") ||
529 header.equals("##:") ||
530 header.equals("##*:") ||
531 header.equals("#:*:") ||
532 header.equals("#:*#") ||
533 header.equals("#*:") ||
534 header.equals("*:") ||
535 header.equals("#:::") ||
536 header.equals("#**") ||
537 header.equals("#*:::") ||
538 header.equals("#:#") ||
539 header.equals(":::") ||
540 header.equals("##:*") ||
541 header.equals("###*:")) {
542 StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item);
544 // Example in English
545 } else if (header.equals("#::") ||
546 header.equals("#*::") ||
547 header.equals("#:**") ||
548 header.equals("#*#") ||
549 header.equals("##*::")) {
550 StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item);
// Markers whose items are logged and skipped. NOTE(review): "#**" is already
// matched by the example branch above, so it cannot reach this branch, and
// "#*:*" is listed twice.
553 } else if (header.equals("*") ||
554 header.equals("**") ||
555 header.equals("***") ||
556 header.equals("*#") ||
557 header.equals(":") ||
558 header.equals("::*") ||
559 header.equals("#**") ||
560 header.equals(":*") ||
561 header.equals("#*:*") ||
562 header.equals("#*:**") ||
563 header.equals("#*:#") ||
564 header.equals("#*:*:") ||
565 header.equals("#*:*") ||
566 header.equals(";")) {
567 // might have: * {{seeCites}}
568 // * [[w:Arabic numerals|Arabic numerals]]: 2
569 //assert item.trim().length() == 0;
570 System.err.println("Skipping meaning: " + header + " " + item);
572 if (title.equals("Yellowknife")) {
575 System.err.println("Busted heading: " + title + " "+ header + " " + item);
// Translation list items.
582 if (currentTranslationSense != null) {
// NOTE(review): onTemplate emits "{{trreq}}" for the trreq template; confirm
// that this literal matches what simpleParse leaves in the item.
583 if (item.indexOf("{{[trreq]{}}}") != -1) {
// Attach the sense to the part of speech if no sense has been attached yet.
587 if (currentPartOfSpeech.translationSenses.isEmpty()) {
588 currentPartOfSpeech.translationSenses.add(currentTranslationSense);
// Expected shape: "<language>: <translation>".
591 final int colonPos = item.indexOf(':');
592 if (colonPos == -1) {
593 System.err.println("Invalid translation: title=" + title + ", item=" + item);
596 final String lang = item.substring(0, colonPos);
597 final String trans = item.substring(colonPos + 1).trim();
// Record the translation under each configured language whose pattern is
// found in the label (find(), so a partial match is enough).
598 for (int i = 0; i < 2; ++i) {
599 if (langPatterns[i].matcher(lang).find()) {
600 currentTranslationSense.translations.get(i).add(new Translation(lang, trans));
// Wiki-parser callbacks for which no statements are visible in this view.
// NOTE(review): presumably intentional no-ops. Confirm.
607 public void onNewLine() {
611 public void onNewParagraph() {
614 // ----------------------------------------------------------------------
617 public void onComment(String text) {
621 public void onFormatBold(boolean boldOn) {
625 public void onFormatItalic(boolean italicOn) {
/**
 * Wiki-parser callback for an unterminated construct. Logs the page title,
 * the opening token and the remaining text to stderr.
 */
629 public void onUnterminated(String start, String rest) {
630 System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest);
/**
 * Wiki-parser callback for a malformed heading terminator. Throws a
 * RuntimeException whose message is the remaining text.
 */
633 public void onInvalidHeaderEnd(String rest) {
634 throw new RuntimeException(rest);