1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.LinkedHashSet;
22 import java.util.List;
25 import java.util.regex.Pattern;
27 import javax.xml.parsers.ParserConfigurationException;
28 import javax.xml.parsers.SAXParser;
29 import javax.xml.parsers.SAXParserFactory;
31 import org.xml.sax.Attributes;
32 import org.xml.sax.SAXException;
34 import com.hughes.android.dictionary.engine.DictionaryBuilder;
35 import com.hughes.android.dictionary.engine.IndexBuilder;
36 import com.hughes.android.dictionary.parser.WikiWord.FormOf;
37 import com.hughes.android.dictionary.parser.WikiWord.Translation;
38 import com.hughes.util.ListUtil;
39 import com.hughes.util.StringUtil;
41 public class EnWiktionaryXmlParserOld extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
43 static final Pattern partOfSpeechHeader = Pattern.compile(
44 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
45 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
46 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
47 "Ligature|Idiom|Phrase|" +
48 // These are @deprecated:
49 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
50 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
51 // These are extras I found:
52 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
53 "Particle|Interjection|Pronominal adverb" +
54 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
// Matches "[[", "]]" and runs of two or more apostrophes; onTemplate() removes
// these from pronunciation strings.
56 static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
// Builder that receives the finished words (passed to wikiWordToQuickDic in endPage()).
58 final DictionaryBuilder dictBuilder;
// Array copy of dictBuilder.indexBuilders, taken in the constructor.
60 final IndexBuilder[] indexBuilders;
// Exactly two patterns (checked in the constructor); tested against section
// headings and against the language prefix of translation list items.
61 final Pattern[] langPatterns;
// Passed through unchanged to WikiWord.wikiWordToQuickDic().
62 final int enIndexBuilder;
// Per-page accumulators, re-created when a <page> element starts.
64 StringBuilder titleBuilder;
65 StringBuilder textBuilder;
// Builder that characters() appends to; null while outside <title>/<text>.
66 StringBuilder currentBuilder = null;
68 static void assertTrue(final boolean condition) {
69 assertTrue(condition, "");
72 static void assertTrue(final boolean condition, final String message) {
74 System.err.println("Assertion failed, message: " + message);
75 new RuntimeException().printStackTrace(System.err);
/**
 * Creates a parser that feeds {@code dictBuilder}.
 *
 * @param dictBuilder destination for parsed words; its index builders are copied into an array
 * @param langPatterns exactly two language-name patterns (a wrong length is reported via
 *     the soft {@code assertTrue}, not thrown)
 * @param enIndexBuilder value handed to {@code WikiWord.wikiWordToQuickDic} for every word
 */
public EnWiktionaryXmlParserOld(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) {
  assertTrue(langPatterns.length == 2);
  this.dictBuilder = dictBuilder;
  this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
  this.langPatterns = langPatterns;
  this.enIndexBuilder = enIndexBuilder;
}
88 public void startElement(String uri, String localName, String qName,
89 Attributes attributes) {
90 currentBuilder = null;
91 if ("page".equals(qName)) {
92 titleBuilder = new StringBuilder();
94 // Start with "\n" to better match certain strings.
95 textBuilder = new StringBuilder("\n");
96 } else if ("title".equals(qName)) {
97 currentBuilder = titleBuilder;
98 } else if ("text".equals(qName)) {
99 currentBuilder = textBuilder;
104 public void characters(char[] ch, int start, int length) throws SAXException {
105 if (currentBuilder != null) {
106 currentBuilder.append(ch, start, length);
111 public void endElement(String uri, String localName, String qName)
112 throws SAXException {
113 currentBuilder = null;
114 if ("page".equals(qName)) {
120 public void parse(final File file) throws ParserConfigurationException,
121 SAXException, IOException {
122 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
123 parser.parse(file, this);
// Processes one collected page: takes the title from titleBuilder, runs
// WikiParser over the page text with this object as the callback, and hands
// every WikiWord in `words` to the dictionary builder.
// NOTE(review): short lines of this method are absent from this view (closing
// braces, whatever follows the namespace test, the opener matching the
// `catch` below) -- confirm against the complete file before editing logic.
127 private void endPage() {
128 title = titleBuilder.toString();
// Progress report every 1000 pages.
130 if (pageCount % 1000 == 0) {
131 System.out.println("pageCount=" + pageCount);
// Titles in non-article namespaces. The branch body is not visible here;
// presumably it returns early so these pages are skipped -- TODO confirm.
133 if (title.startsWith("Wiktionary:") ||
134 title.startsWith("Template:") ||
135 title.startsWith("Appendix:") ||
136 title.startsWith("Category:") ||
137 title.startsWith("Index:") ||
138 title.startsWith("MediaWiki:") ||
139 title.startsWith("TransWiki:") ||
140 title.startsWith("Citations:") ||
141 title.startsWith("Concordance:") ||
142 title.startsWith("Help:")) {
// Reset heading state before parsing the page.
147 currentHeading = null;
148 insidePartOfSpeech = false;
149 // System.err.println("Working on page: " + title);
// Any failure inside the wiki parser is logged with the page title rather than
// propagated, so a single bad page does not stop the run.
151 WikiParser.parse(textBuilder.toString(), this);
152 } catch (Throwable e) {
153 System.err.println("Failure on page: " + title);
154 e.printStackTrace(System.err);
// Emit every word collected in `words`.
157 for (final WikiWord word : words) {
158 word.wikiWordToQuickDic(dictBuilder, enIndexBuilder);
164 // ------------------------------------------------------------------------
165 // ------------------------------------------------------------------------
166 // ------------------------------------------------------------------------
167 // ------------------------------------------------------------------------
/*
 * Two things can happen:
 *
 * We can be in a ==German== section. There we will see English definitions.
 * Each POS should get its own QuickDic entry. Pretty much everything goes
 * in.
 *
 * Or we can be in an ==English== section with English definitions
 * and maybe see translations for languages we care about.
 *
 * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
 * into separate QuickDic entries, but that's tricky--how do we know when we
 * found a subsection? Just ignore anything containing pronunciation and
 * etymology?
 *
 * How do we decide when to seal the deal on an entry?
 *
 * Would be nice if the parser told us about leaving sections....
 */
// Text of the most recently completed heading; set to null while a heading is being read.
192 String currentHeading;
// Words collected for the current page; endPage() emits them.
194 final List<WikiWord> words = new ArrayList<WikiWord>();
// Word for the language section being parsed (created in onHeadingEnd).
195 WikiWord currentWord;
// Part-of-speech subsection being parsed, or null.
196 WikiWord.PartOfSpeech currentPartOfSpeech;
// Translation sense that translation list items are attached to, or null.
197 WikiWord.TranslationSense currentTranslationSense;
// True after a heading that matched partOfSpeechHeader, until the next heading.
198 boolean insidePartOfSpeech;
// Collects the text of the heading or list item being read; null when nothing is captured.
200 StringBuilder wikiBuilder = null;
203 public void onWikiLink(String[] args) {
204 if (wikiBuilder == null) {
207 wikiBuilder.append(args[args.length - 1]);
// Templates that onTemplate() renders as their arguments (from index 1) joined with ", ".
210 // ttbc: translations to be checked.
211 static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
212 "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
213 "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
214 "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx"));
// Templates that onTemplate() tests for and produces no output for.
215 static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g"));
// Grammar tags that onTemplate() renders as the template name followed by ".".
216 static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf", "pf.", "indeclinable"));
// Templates rendered as their bare name. NOTE(review): the single entry looks
// like a placeholder that no real template matches -- confirm.
217 static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
// Handles one template invocation. positionalArgs.get(0) is the template name;
// the other positional entries and namedArgs are its arguments. Depending on
// the template and the parser state, the call records pronunciation on the
// current word, records a form-of or a translation sense on the current part
// of speech, or appends a textual rendering to wikiBuilder.
// Both argument collections are mutated (entries are removed / added).
// NOTE(review): this view of the method is missing short lines (closing
// braces, `return;` statements, a few guards). The text is kept as found,
// except for the single change marked FIX below.
220 public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
221 if (positionalArgs.isEmpty()) {
222 // This happens very rarely with special templates.
225 final String name = positionalArgs.get(0);
// Bookkeeping arguments that never affect the rendered text.
227 namedArgs.remove("lang");
228 namedArgs.remove("nocat");
229 namedArgs.remove("nocap");
230 namedArgs.remove("sc");
// ---- Templates handled whenever a word is open ----
233 if (currentWord != null) {
// {{a|accent}}: start a fresh pronunciation buffer keyed by the accent name.
234 if (name.equals("a")) {
236 currentWord.currentPronunciation = new StringBuilder();
237 currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
// Pronunciation notations.
241 if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) {
242 namedArgs.remove("lang");
// Numerically named arguments ("0", "1", ...) are moved into positionalArgs.
243 for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
244 final String pron = namedArgs.remove("" + i);
246 positionalArgs.add(pron);
253 if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) {
254 System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString());
// No preceding {{a|...}}: file the pronunciation under the empty accent.
256 if (currentWord.currentPronunciation == null) {
257 currentWord.currentPronunciation = new StringBuilder();
258 currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
// Separate from pronunciations already collected for this accent.
260 if (currentWord.currentPronunciation.length() > 0) {
261 currentWord.currentPronunciation.append("; ");
263 for (int i = 1; i < positionalArgs.size(); ++i) {
265 currentWord.currentPronunciation.append(",");
// FIX: read the i-th argument. The original read argument 1 on every
// iteration, so each additional pronunciation repeated the first one.
267 final String pron = wikiMarkup.matcher(positionalArgs.get(i)).replaceAll("");
268 currentWord.currentPronunciation.append(pron).append("");
// Tag the group with the notation name, e.g. " (IPA)".
270 currentWord.currentPronunciation.append(" (").append(name).append(")");
274 if (name.equals("qualifier")) {
275 //assertTrue(positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString());
276 if (wikiBuilder == null) {
279 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
// NOTE(review): no null check on wikiBuilder is visible for the next two
// branches -- confirm whether they can run while nothing is being collected.
283 if (name.equals("...")) {
284 // Skipping any elided text for brevity.
285 wikiBuilder.append("...");
289 if (passThroughTemplates.contains(name)) {
290 assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs);
291 wikiBuilder.append(name);
295 if (ignoreTemplates.contains(name)) {
299 if ("Pronunciation".equals(currentHeading)) {
300 System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs);
// ---- Templates inside a part-of-speech section ----
306 if (insidePartOfSpeech) {
// {{form of|<grammar form>|<token>}}; with four positional entries the token is the last one.
309 if (name.equals("form of")) {
310 namedArgs.remove("sc");
311 if (positionalArgs.size() < 3 || positionalArgs.size() > 4) {
312 System.err.println("Invalid form of.");
314 final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3);
315 final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1));
316 currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token));
320 // The fallback plan: append the template!
321 if (wikiBuilder != null) {
322 wikiBuilder.append("{");
323 boolean first = true;
324 for (final String arg : positionalArgs) {
326 wikiBuilder.append(", ");
329 wikiBuilder.append(arg);
331 // This one isn't so useful.
332 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
334 wikiBuilder.append(", ");
337 wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue());
339 wikiBuilder.append("}");
342 //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs);
// ---- Translation tables ----
// {{trans-top|sense}} opens a new translation sense on the current part of speech.
348 if (name.equals("trans-top")) {
349 assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs + title);
351 if (currentPartOfSpeech == null) {
352 assertTrue(currentWord != null && !currentWord.partsOfSpeech.isEmpty(), title);
353 System.err.println("Assuming last part of speech for non-nested translation section: " + title);
354 currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
357 currentTranslationSense = new WikiWord.TranslationSense();
358 currentPartOfSpeech.translationSenses.add(currentTranslationSense);
359 if (positionalArgs.size() > 1) {
360 currentTranslationSense.sense = positionalArgs.get(1);
// Everything below only renders text into wikiBuilder.
365 if (wikiBuilder == null) {
// Gender tags render as "{<args>,<name>}".
368 if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
369 assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs.toString());
370 wikiBuilder.append("{");
371 for (int i = 1; i < positionalArgs.size(); ++i) {
372 wikiBuilder.append(i > 1 ? "," : "");
373 wikiBuilder.append(positionalArgs.get(i));
375 wikiBuilder.append(name).append("}");
377 } else if (name.equals("p")) {
378 assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty());
379 wikiBuilder.append("pl.");
381 } else if (name.equals("s")) {
382 assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"), title);
383 wikiBuilder.append("sg.");
385 } else if (grammarTemplates.contains(name)) {
386 assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
387 wikiBuilder.append(name).append(".");
389 } else if (name.equals("l")) {
390 // This template is designed to generate a link to a specific language-section on the target page.
391 wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
// Translation templates: entry 2 is the translated text, later entries are
// rendered in braces, and tr= is appended in parentheses.
393 } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
394 if (positionalArgs.size() > 2) {
395 wikiBuilder.append(positionalArgs.get(2));
397 for (int i = 3; i < positionalArgs.size(); ++i) {
398 wikiBuilder.append(i == 3 ? " {" : ",");
399 wikiBuilder.append(positionalArgs.get(i));
400 wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : "");
402 final String transliteration = namedArgs.remove("tr");
403 if (transliteration != null) {
404 wikiBuilder.append(" (").append(transliteration).append(")");
// Translation requests are kept as a literal marker in the collected text.
407 } else if (name.equals("trreq")) {
408 wikiBuilder.append("{{trreq}}");
410 } else if (name.equals("qualifier")) {
411 //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
412 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
414 } else if (useRemainingArgTemplates.contains(name)) {
415 for (int i = 1; i < positionalArgs.size(); ++i) {
417 wikiBuilder.append(", ");
419 wikiBuilder.append(positionalArgs.get(i));
421 } else if (ignoreTemplates.contains(name)) {
424 } else if (name.equals("initialism")) {
425 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
426 wikiBuilder.append("Initialism");
427 } else if (name.equals("abbreviation")) {
428 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
429 wikiBuilder.append("Abbreviation");
430 } else if (name.equals("acronym")) {
431 assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
432 wikiBuilder.append("Acronym");
// Unknown templates are reported only while inside a translation sense.
434 if (currentTranslationSense != null) {
435 System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs);
441 public void onText(String text) {
442 if (wikiBuilder != null) {
443 wikiBuilder.append(text);
// Called when a heading of the given depth opens: starts collecting the
// heading text and closes any part-of-speech section at the same or a deeper level.
// NOTE(review): closing braces and the body of the second `if` are not visible
// in this view -- confirm against the complete file.
449 public void onHeadingStart(int depth) {
450 wikiBuilder = new StringBuilder();
451 currentDepth = depth;
// A heading at or above the part-of-speech level ends that section.
452 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
453 currentPartOfSpeech = null;
454 insidePartOfSpeech = false;
// Same depth test for the current word; what the branch does is not visible here.
456 if (currentWord != null && depth <= currentWord.depth) {
// The heading name is unknown until onHeadingEnd() sets it.
460 currentHeading = null;
// Called when a heading closes. The collected text becomes the heading name and
// is classified: a language heading starts a new WikiWord, a part-of-speech
// heading starts a new PartOfSpeech on the current word, and "Translations"
// prepares a translation sense.
// NOTE(review): closing braces and early returns between the branches are not
// visible in this view -- confirm control flow against the complete file.
464 public void onHeadingEnd(int depth) {
465 final String name = wikiBuilder.toString().trim();
467 currentTranslationSense = null;
468 currentHeading = name;
// Language section: "English", "Translingual", or either configured pattern.
470 final boolean lang0 = langPatterns[0].matcher(name).matches();
471 final boolean lang1 = langPatterns[1].matcher(name).matches();
472 if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) {
473 currentWord = new WikiWord(title, depth);
474 if (lang0 && lang1) {
475 System.err.println("Word is indexed in both index1 and index2: " + title);
477 currentWord.language = name;
// Index 0 or 1 by matching pattern (pattern 0 wins when both match), else -1.
478 currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1);
479 words.add(currentWord);
483 if (currentWord == null) {
// A heading at or above the part-of-speech level ends that section.
487 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
488 currentPartOfSpeech = null;
491 insidePartOfSpeech = false;
// A part-of-speech heading opens a section only when none is currently open.
492 if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) {
493 currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
494 currentWord.partsOfSpeech.add(currentPartOfSpeech);
495 insidePartOfSpeech = true;
// A Translations heading outside an English part-of-speech section is reported.
499 if (name.equals("Translations")) {
500 if (currentWord == null ||
501 !currentWord.language.equals("English") ||
502 currentPartOfSpeech == null) {
503 System.err.println("Unexpected Translations section: " + title);
506 currentTranslationSense = new WikiWord.TranslationSense();
512 public void onListItemStart(String header, int[] section) {
513 wikiBuilder = new StringBuilder();
514 if (currentWord != null) {
515 currentWord.currentPronunciation = null;
// Called when a list item closes. Inside a part-of-speech section the item is
// filed by its list marker (`header`) as a meaning, an example source, an
// example, or an English rendering of an example; otherwise, while a
// translation sense is open, the item is read as "<language>: <translation>".
// NOTE(review): closing braces and early returns are not visible in this view
// -- confirm control flow against the complete file.
521 public void onListItemEnd(String header, int[] section) {
522 String item = wikiBuilder.toString().trim();
523 if (item.length() == 0) {
526 item = WikiParser.simpleParse(item);
530 if (insidePartOfSpeech) {
531 assert currentPartOfSpeech != null : title + item;
// Markers that start a new meaning.
532 if (header.equals("#") ||
533 header.equals("##") ||
534 header.equals("###") ||
535 header.equals("####") ||
536 header.equals(":#") ||
537 header.equals("::") ||
538 header.equals(":::*")) {
540 // :: should append, probably.
541 currentPartOfSpeech.newMeaning().meaning = item;
// Markers whose text becomes the source of a new example.
544 } else if (header.equals("#*") ||
545 header.equals("##*") ||
546 header.equals("###*")) {
547 currentPartOfSpeech.lastMeaning().newExample().source = item;
// Markers whose text becomes a new example.
// NOTE(review): "#*:" is tested twice in this condition; the repeat is redundant.
550 } else if (header.equals("#:") ||
551 header.equals("#*:") ||
552 header.equals("#:*") ||
553 header.equals("##:") ||
554 header.equals("##*:") ||
555 header.equals("#:*:") ||
556 header.equals("#:*#") ||
557 header.equals("#*:") ||
558 header.equals("*:") ||
559 header.equals("#:::") ||
560 header.equals("#**") ||
561 header.equals("#*:::") ||
562 header.equals("#:#") ||
563 header.equals(":::") ||
564 header.equals("##:*") ||
565 header.equals("###*:")) {
566 StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item);
568 // Example in English
569 } else if (header.equals("#::") ||
570 header.equals("#*::") ||
571 header.equals("#:**") ||
572 header.equals("#*#") ||
573 header.equals("##*::")) {
574 StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item);
// Markers that are logged and skipped.
// NOTE(review): "#**" can never reach this branch because the example branch
// above already matches it, and "#*:*" is tested twice here.
577 } else if (header.equals("*") ||
578 header.equals("**") ||
579 header.equals("***") ||
580 header.equals("*#") ||
581 header.equals(":") ||
582 header.equals("::*") ||
583 header.equals("#**") ||
584 header.equals(":*") ||
585 header.equals("#*:*") ||
586 header.equals("#*:**") ||
587 header.equals("#*:#") ||
588 header.equals("#*:*:") ||
589 header.equals("#*:*") ||
590 header.equals(";")) {
591 // might have: * {{seeCites}}
592 // * [[w:Arabic numerals|Arabic numerals]]: 2
593 //assert item.trim().length() == 0;
594 System.err.println("Skipping meaning: " + header + " " + item);
// Unrecognized marker; the page titled "Yellowknife" is special-cased before the report.
596 if (title.equals("Yellowknife")) {
599 System.err.println("Busted heading: " + title + " "+ header + " " + item);
// ---- Translation list items ----
606 if (currentTranslationSense != null) {
// NOTE(review): onTemplate() renders translation requests as "{{trreq}}"; this
// literal "{{[trreq]{}}}" would not match that text -- confirm the intended marker.
607 if (item.indexOf("{{[trreq]{}}}") != -1) {
// Attach the sense if the part of speech has no senses yet.
611 if (currentPartOfSpeech.translationSenses.isEmpty()) {
612 currentPartOfSpeech.translationSenses.add(currentTranslationSense);
// Split "<language>: <translation>" at the first colon.
615 final int colonPos = item.indexOf(':');
616 if (colonPos == -1) {
617 System.err.println("Invalid translation: title=" + title + ", item=" + item);
620 final String lang = item.substring(0, colonPos);
621 final String trans = item.substring(colonPos + 1).trim();
// File the translation under each configured language whose pattern is found in the label.
622 for (int i = 0; i < 2; ++i) {
623 if (langPatterns[i].matcher(lang).find()) {
624 currentTranslationSense.translations.get(i).add(new Translation(lang, trans));
631 public void onNewLine() {
635 public void onNewParagraph() {
638 // ----------------------------------------------------------------------
641 public void onComment(String text) {
645 public void onFormatBold(boolean boldOn) {
649 public void onFormatItalic(boolean italicOn) {
653 public void onUnterminated(String start, String rest) {
654 System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest);
657 public void onInvalidHeaderEnd(String rest) {
658 throw new RuntimeException(rest);