1 package com.hughes.android.dictionary.parser;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.LinkedHashSet;
11 import java.util.regex.Pattern;
13 import javax.xml.parsers.ParserConfigurationException;
14 import javax.xml.parsers.SAXParser;
15 import javax.xml.parsers.SAXParserFactory;
17 import org.xml.sax.Attributes;
18 import org.xml.sax.SAXException;
20 import com.hughes.android.dictionary.engine.DictionaryBuilder;
21 import com.hughes.android.dictionary.engine.IndexBuilder;
22 import com.hughes.android.dictionary.parser.WikiWord.FormOf;
23 import com.hughes.android.dictionary.parser.WikiWord.Translation;
24 import com.hughes.util.ListUtil;
25 import com.hughes.util.StringUtil;
27 public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
29 static final Pattern partOfSpeechHeader = Pattern.compile(
30 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
31 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
32 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
33 "Ligature|Idiom|Phrase|" +
34 // These are @deprecated:
35 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
36 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
37 // These are extras I found:
38 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
39 "Particle|Interjection|Pronominal adverb" +
40 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
  // Strips wiki markup tokens -- "[[", "]]" and runs of '' (bold/italic) --
  // from pronunciation strings before they are stored (see onTemplate).
  static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");

  // Destination dictionary being assembled from the parsed dump.
  final DictionaryBuilder dictBuilder;
  // The dictionary's per-language indices, snapshot as an array.
  final IndexBuilder[] indexBuilders;
  // langPatterns[i] decides whether a language-section name belongs to index i
  // (exactly two entries -- see the constructor's assert).
  final Pattern[] langPatterns;
  // Position of the English index within indexBuilders; sentinel value for
  // "no English index" is chosen by the caller -- TODO confirm.
  final int enIndexBuilder;

  // SAX accumulation buffers for the <page> currently being read.
  StringBuilder titleBuilder;
  StringBuilder textBuilder;
  // Whichever buffer characters() should append to, or null when we are not
  // inside a <title> or <text> element.
  StringBuilder currentBuilder = null;
  /**
   * Creates a parser that feeds parsed entries into the given builder.
   *
   * @param dictBuilder    receives the finished WikiWord entries
   * @param langPatterns   exactly two patterns, one per index, matching the
   *                       language names that index covers
   * @param enIndexBuilder position of the English IndexBuilder
   */
  public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) {
    assert langPatterns.length == 2;
    this.dictBuilder = dictBuilder;
    this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
    this.langPatterns = langPatterns;
    this.enIndexBuilder = enIndexBuilder;
  // SAX: element opened.  Resets the character sink; only <page>, <title> and
  // <text> matter -- anything else leaves currentBuilder null so characters()
  // silently discards the content.
  public void startElement(String uri, String localName, String qName,
      Attributes attributes) {
    currentBuilder = null;
    if ("page".equals(qName)) {
      // Fresh buffers per page.
      titleBuilder = new StringBuilder();
      // Start with "\n" to better match certain strings.
      textBuilder = new StringBuilder("\n");
    } else if ("title".equals(qName)) {
      currentBuilder = titleBuilder;
    } else if ("text".equals(qName)) {
      currentBuilder = textBuilder;
  // SAX: character data.  Appended only while inside <title> or <text>.
  public void characters(char[] ch, int start, int length) throws SAXException {
    if (currentBuilder != null) {
      currentBuilder.append(ch, start, length);
  // SAX: element closed.  A closing </page> triggers processing of the
  // accumulated title/text (presumably via endPage(); the call itself is in
  // lines missing from this view -- TODO confirm).
  public void endElement(String uri, String localName, String qName)
    currentBuilder = null;
    if ("page".equals(qName)) {
  /**
   * Entry point: streams the given XML dump file through this SAX handler.
   *
   * @throws ParserConfigurationException if a SAX parser cannot be created
   * @throws SAXException on malformed XML
   * @throws IOException  on read failure
   */
  public void parse(final File file) throws ParserConfigurationException,
      SAXException, IOException {
    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
    parser.parse(file, this);
  /**
   * Runs when a page has been fully read: skips non-article namespaces, parses
   * the page's wiki text through this callback, then flushes the collected
   * words into the dictionary.  NOTE(review): pageCount's increment and the
   * opening of the try matching the catch below are in lines missing from
   * this view.
   */
  private void endPage() {
    title = titleBuilder.toString();
    // Progress indicator for very large dumps.
    if (pageCount % 1000 == 0) {
      System.out.println("pageCount=" + pageCount);
    // Skip all meta namespaces; only main-namespace articles become entries.
    if (title.startsWith("Wiktionary:") ||
        title.startsWith("Template:") ||
        title.startsWith("Appendix:") ||
        title.startsWith("Category:") ||
        title.startsWith("Index:") ||
        title.startsWith("MediaWiki:") ||
        title.startsWith("TransWiki:") ||
        title.startsWith("Citations:") ||
        title.startsWith("Concordance:") ||
        title.startsWith("Help:")) {
    // Reset per-page parse state.
    currentHeading = null;
    insidePartOfSpeech = false;
    // System.err.println("Working on page: " + title);
    WikiParser.parse(textBuilder.toString(), this);
    } catch (Throwable e) {
      // One bad page must not abort the whole dump run; log and continue.
      System.err.println("Failure on page: " + title);
      e.printStackTrace(System.err);
    // Emit everything collected on this page.
    for (final WikiWord word : words) {
      word.wikiWordToQuickDic(dictBuilder, enIndexBuilder);
139 // ------------------------------------------------------------------------
140 // ------------------------------------------------------------------------
141 // ------------------------------------------------------------------------
142 // ------------------------------------------------------------------------
145 * Two things can happen:
147 * We can be in a ==German== section. There we will see English definitions.
148 * Each POS should get its own QuickDic entry. Pretty much everything goes
151 * Or we can be in an ==English== section with English definitions
152 * and maybe see translations for languages we care about.
154 * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
155 * into separate QuickDic entries, but that's tricky--how do we know when we
156 * found a subsection? Just ignore anything containing pronunciation and
159 * How do we decide when to seal the deal on an entry?
161 * Would be nice if the parser told us about leaving sections....
  // --- Per-page wiki-parse state, reset in endPage()/onHeadingStart() ---
  // Text of the most recent heading (e.g. "Pronunciation", "Translations").
  String currentHeading;
  // All words collected on the current page, flushed by endPage().
  final List<WikiWord> words = new ArrayList<WikiWord>();
  // Word opened by the current language heading, or null outside one.
  WikiWord currentWord;
  // Part-of-speech subsection currently open, or null.
  WikiWord.PartOfSpeech currentPartOfSpeech;
  // Translation sense currently being filled (trans-top / Translations), or null.
  WikiWord.TranslationSense currentTranslationSense;
  // True while list items should be treated as meanings/examples of
  // currentPartOfSpeech.
  boolean insidePartOfSpeech;

  // Sink for rendered wiki text of the current heading/list item; null when
  // output is being discarded.
  StringBuilder wikiBuilder = null;
  // WikiCallback: [[link]] encountered.  Renders only the display text (the
  // last |-separated argument); dropped when no output is being collected.
  // (Lines between the null guard and the append are missing from this view.)
  public void onWikiLink(String[] args) {
    if (wikiBuilder == null) {
    wikiBuilder.append(args[args.length - 1]);
185 // ttbc: translations to be checked.
186 static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
187 "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
188 "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
189 "zh-tsp", "zh-zh-p"));
190 static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList(""));
191 static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf"));
192 static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
  /**
   * WikiCallback: a {{template}} was encountered.  Dispatches on the template
   * name (first positional argument) through three groups: pronunciation
   * templates, part-of-speech templates, then translation/inline-formatting
   * templates.  NOTE(review): many lines (returns/closing braces) are missing
   * from this view, so the exact fall-through between branches cannot be
   * fully confirmed here.
   */
  public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
    if (positionalArgs.isEmpty()) {
      // This happens very rarely with special templates.
    final String name = positionalArgs.get(0);
    // Housekeeping args that never carry user-visible content.
    namedArgs.remove("lang");
    namedArgs.remove("nocat");
    namedArgs.remove("sc");
    // ----- Pronunciation templates (only meaningful inside a word) -----
    if (currentWord != null) {
      // {{a|accent}} opens a new accent-specific pronunciation bucket.
      if (name.equals("a")) {
        currentWord.currentPronunciation = new StringBuilder();
        currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
      if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA")|| name.equals("enPR")) {
        namedArgs.remove("lang");
        // Some pages pass pronunciations as numbered named args; fold them
        // back into the positional list (bounded at 100 as a safety valve).
        for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
          final String pron = namedArgs.remove("" + i);
          positionalArgs.add(pron);
        if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) {
          System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString());
        // Lazily create the accent-less ("") bucket when no {{a}} preceded.
        if (currentWord.currentPronunciation == null) {
          currentWord.currentPronunciation = new StringBuilder();
          currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
        if (currentWord.currentPronunciation.length() > 0) {
          currentWord.currentPronunciation.append("; ");
        for (int i = 1; i < positionalArgs.size(); ++i) {
          currentWord.currentPronunciation.append(",");
        // NOTE(review): get(1) here looks suspicious -- if this statement is
        // inside the loop above it re-appends the first pronunciation each
        // iteration; get(i) was likely intended.  Left unchanged because the
        // missing lines make the loop boundary uncertain.
        final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
        currentWord.currentPronunciation.append(pron).append("");
        currentWord.currentPronunciation.append(" (").append(name).append(")");
      // {{qualifier|text}} renders as " (text)".
      if (name.equals("qualifier")) {
        //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
        if (wikiBuilder == null) {
        wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
      if (name.equals("...")) {
        // Skipping any elided text for brevity.
        wikiBuilder.append("...");
      if (passThroughTemplates.contains(name)) {
        assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
        wikiBuilder.append(name);
      // Media/metadata templates: nothing useful to render.
      if (name.equals("audio") || name.equals("rhymes") || name.equals("hyphenation")) {
      if ("Pronunciation".equals(currentHeading)) {
        System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs);
    // ----- Templates inside a part-of-speech section -----
    if (insidePartOfSpeech) {
      // {{form of|grammarForm|target}} records an inflected-form reference.
      if (name.equals("form of")) {
        namedArgs.remove("sc");
        if (positionalArgs.size() < 3 || positionalArgs.size() > 4) {
          System.err.println("Invalid form of.");
        final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3);
        final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1));
        currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token));
      // The fallback plan: append the template!
      if (wikiBuilder != null) {
        wikiBuilder.append("{");
        boolean first = true;
        for (final String arg : positionalArgs) {
          wikiBuilder.append(", ");
          wikiBuilder.append(arg);
        // This one isn't so useful.
        for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
          wikiBuilder.append(", ");
          wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue());
        wikiBuilder.append("}");
      //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs);
    // ----- Translation-section templates -----
    // {{trans-top|sense}} opens a translation sense; attach it to the last
    // part of speech if the section is not nested under one.
    if (name.equals("trans-top")) {
      assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs + title;
      if (currentPartOfSpeech == null) {
        assert !currentWord.partsOfSpeech.isEmpty() : title;
        System.err.println("Assuming last part of speech for non-nested translation section: " + title);
        currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
      currentTranslationSense = new WikiWord.TranslationSense();
      currentPartOfSpeech.translationSenses.add(currentTranslationSense);
      if (positionalArgs.size() > 1) {
        currentTranslationSense.sense = positionalArgs.get(1);
    if (wikiBuilder == null) {
    // ----- Inline templates rendered straight into wikiBuilder -----
    // Gender markers {{m}}/{{f}}/{{n}}/{{c}}, rendered as "{extras...name}".
    if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
      assert positionalArgs.size() >= 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
      wikiBuilder.append("{");
      for (int i = 1; i < positionalArgs.size(); ++i) {
        wikiBuilder.append(i > 1 ? "," : "");
        wikiBuilder.append(positionalArgs.get(i));
      wikiBuilder.append(name).append("}");
    } else if (name.equals("p")) {
      assert positionalArgs.size() == 1 && namedArgs.isEmpty();
      wikiBuilder.append("pl.");
    } else if (name.equals("s")) {
      // The page "dobra" is a known violator of this template's arity.
      assert positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra");
      wikiBuilder.append("sg.");
    } else if (grammarTemplates.contains(name)) {
      assert positionalArgs.size() == 1 && namedArgs.isEmpty();
      wikiBuilder.append(name).append(".");
    } else if (name.equals("l")) {
      // This template is designed to generate a link to a specific language-section on the target page.
      wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
    } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
      // Translation templates: arg 2 is the translated word, args 3+ are
      // gender/extra markers rendered in braces, named arg "tr" is a
      // transliteration rendered in parentheses.
      if (positionalArgs.size() > 2) {
        wikiBuilder.append(positionalArgs.get(2));
      for (int i = 3; i < positionalArgs.size(); ++i) {
        wikiBuilder.append(i == 3 ? " {" : ",");
        wikiBuilder.append(positionalArgs.get(i));
        wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : "");
      final String transliteration = namedArgs.remove("tr");
      if (transliteration != null) {
        wikiBuilder.append(" (").append(transliteration).append(")");
    } else if (name.equals("trreq")) {
      // Translation-request placeholder, kept verbatim for later filtering.
      wikiBuilder.append("{{trreq}}");
    } else if (name.equals("qualifier")) {
      //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
      wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
    } else if (useRemainingArgTemplates.contains(name)) {
      // Script/wrapper templates: copy all remaining positional args through.
      for (int i = 1; i < positionalArgs.size(); ++i) {
        wikiBuilder.append(", ");
        wikiBuilder.append(positionalArgs.get(i));
    } else if (ignoreTemplates.contains(name)) {
    } else if (name.equals("initialism")) {
      assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
      wikiBuilder.append("Initialism");
    } else if (name.equals("abbreviation")) {
      assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
      wikiBuilder.append("Abbreviation");
    } else if (name.equals("acronym")) {
      assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
      wikiBuilder.append("Acronym");
    // Anything unrecognized inside a translation sense is worth logging.
    if (currentTranslationSense != null) {
      System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs);
  // WikiCallback: plain text; collected only while a sink is active.
  public void onText(String text) {
    if (wikiBuilder != null) {
      wikiBuilder.append(text);
  // WikiCallback: a ==heading== opens.  Starts collecting the heading text
  // and closes any part-of-speech/word whose depth the new heading escapes.
  // NOTE(review): currentDepth's declaration and the body of the
  // currentWord-depth branch fall in lines missing from this view.
  public void onHeadingStart(int depth) {
    wikiBuilder = new StringBuilder();
    currentDepth = depth;
    // A heading at or above the current part of speech ends that section.
    if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
      currentPartOfSpeech = null;
      insidePartOfSpeech = false;
    if (currentWord != null && depth <= currentWord.depth) {
    currentHeading = null;
  /**
   * WikiCallback: heading text is complete.  Decides whether the heading
   * starts a new language section (new WikiWord), a part-of-speech section,
   * or a Translations section.  NOTE(review): several returns/closing braces
   * are in lines missing from this view.
   */
  public void onHeadingEnd(int depth) {
    final String name = wikiBuilder.toString().trim();
    currentTranslationSense = null;
    currentHeading = name;
    final boolean lang0 = langPatterns[0].matcher(name).matches();
    final boolean lang1 = langPatterns[1].matcher(name).matches();
    // A language heading opens a new word entry for the current page title.
    if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) {
      currentWord = new WikiWord(title, depth);
      if (lang0 && lang1) {
        System.err.println("Word is indexed in both index1 and index2: " + title);
      currentWord.language = name;
      // -1 marks a language we parse but do not index (English/Translingual).
      currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1);
      words.add(currentWord);
    if (currentWord == null) {
    // Leaving a part-of-speech subtree when the heading is at or above it.
    if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
      currentPartOfSpeech = null;
    insidePartOfSpeech = false;
    if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) {
      currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
      currentWord.partsOfSpeech.add(currentPartOfSpeech);
      insidePartOfSpeech = true;
    // Translations sections are only expected under an English part of speech.
    if (name.equals("Translations")) {
      if (currentWord == null ||
          !currentWord.language.equals("English") ||
          currentPartOfSpeech == null) {
        System.err.println("Unexpected Translations section: " + title);
      currentTranslationSense = new WikiWord.TranslationSense();
  // WikiCallback: a list item opens.  Starts collecting its text and resets
  // the accent-less pronunciation bucket so each item gets a fresh one.
  public void onListItemStart(String header, int[] section) {
    wikiBuilder = new StringBuilder();
    if (currentWord != null) {
      currentWord.currentPronunciation = null;
  /**
   * WikiCallback: a list item's text is complete.  Inside a part-of-speech
   * section, the list header string ("#", "#:", "#*", ...) selects whether
   * the item is a meaning, an example's source, example text, or its English
   * translation.  Inside a translation sense, the item has the shape
   * "Language: translation".  NOTE(review): returns/closing braces are in
   * lines missing from this view, so branch boundaries are partly inferred.
   */
  public void onListItemEnd(String header, int[] section) {
    String item = wikiBuilder.toString().trim();
    final String oldItem = item;
    if (item.length() == 0) {
    item = WikiParser.simpleParse(item);
    if (insidePartOfSpeech) {
      assert currentPartOfSpeech != null : title + item;
      // Numbered definition lines become meanings.
      if (header.equals("#") ||
          header.equals("##") ||
          header.equals("###") ||
          header.equals("####") ||
          header.equals(":#") ||
          header.equals("::") ||
          header.equals(":::*")) {
        // :: should append, probably.
        currentPartOfSpeech.newMeaning().meaning = item;
      // Citation source lines.
      } else if (header.equals("#*") ||
          header.equals("##*") ||
          header.equals("###*")) {
        currentPartOfSpeech.lastMeaning().newExample().source = item;
      // Example text lines.  NOTE(review): "#*:" appears twice in this
      // condition (redundant), and "#**" below also appears in the skip
      // branch further down, where it can never match.
      } else if (header.equals("#:") ||
          header.equals("#*:") ||
          header.equals("#:*") ||
          header.equals("##:") ||
          header.equals("##*:") ||
          header.equals("#:*:") ||
          header.equals("#:*#") ||
          header.equals("#*:") ||
          header.equals("*:") ||
          header.equals("#:::") ||
          header.equals("#**") ||
          header.equals("#*:::") ||
          header.equals("#:#") ||
          header.equals(":::") ||
          header.equals("##:*") ||
          header.equals("###*:")) {
        StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item);
      // Example in English
      } else if (header.equals("#::") ||
          header.equals("#*::") ||
          header.equals("#:**") ||
          header.equals("#*#") ||
          header.equals("##*::")) {
        StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item);
      // Bulleted/odd headers carry no definition content; skip with a log.
      } else if (header.equals("*") ||
          header.equals("**") ||
          header.equals("***") ||
          header.equals("*#") ||
          header.equals(":") ||
          header.equals("::*") ||
          header.equals("#**") ||
          header.equals(":*") ||
          header.equals("#*:*") ||
          header.equals("#*:**") ||
          header.equals("#*:#") ||
          header.equals("#*:*:") ||
          header.equals("#*:*") ||
          header.equals(";")) {
        // might have: * {{seeCites}}
        // * [[w:Arabic numerals|Arabic numerals]]: 2
        //assert item.trim().length() == 0;
        System.err.println("Skipping meaning: " + header + " " + item);
        // Known-broken page singled out; anything else is logged as busted.
        if (title.equals("Yellowknife")) {
        System.err.println("Busted heading: " + title + " "+ header + " " + item);
    // ----- Translation list items: "Language: translation" -----
    if (currentTranslationSense != null) {
      // Drop unfulfilled translation-request placeholders.
      if (item.indexOf("{{[trreq]{}}}") != -1) {
      // Late attachment for senses created outside trans-top.
      if (currentPartOfSpeech.translationSenses.isEmpty()) {
        currentPartOfSpeech.translationSenses.add(currentTranslationSense);
      final int colonPos = item.indexOf(':');
      if (colonPos == -1) {
        System.err.println("Invalid translation: title=" + title + ", item=" + item);
      final String lang = item.substring(0, colonPos);
      final String trans = item.substring(colonPos + 1).trim();
      // File the translation under whichever index matches the language name.
      for (int i = 0; i < 2; ++i) {
        if (langPatterns[i].matcher(lang).find()) {
          currentTranslationSense.translations.get(i).add(new Translation(lang, trans));
  // Remaining WikiCallback notifications.  The first few bodies fall in lines
  // missing from this view; the final two treat malformed markup as fatal so
  // parser bugs surface immediately (caught per-page in endPage).
  public void onNewLine() {
  public void onNewParagraph() {
  // ----------------------------------------------------------------------
  public void onComment(String text) {
  public void onFormatBold(boolean boldOn) {
  public void onFormatItalic(boolean italicOn) {
  public void onUnterminated(String start, String rest) {
    throw new RuntimeException(start + rest);
  public void onInvalidHeaderEnd(String rest) {
    throw new RuntimeException(rest);