1 package com.hughes.android.dictionary.parser;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.LinkedHashSet;
11 import java.util.regex.Pattern;
13 import javax.xml.parsers.ParserConfigurationException;
14 import javax.xml.parsers.SAXParser;
15 import javax.xml.parsers.SAXParserFactory;
17 import org.xml.sax.Attributes;
18 import org.xml.sax.SAXException;
20 import com.hughes.android.dictionary.engine.DictionaryBuilder;
21 import com.hughes.android.dictionary.engine.IndexBuilder;
22 import com.hughes.android.dictionary.parser.WikiWord.TranslationSection;
24 public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
26 static final Pattern partOfSpeechHeader = Pattern.compile(
27 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
28 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
29 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
30 "Ligature|Idiom|Phrase|" +
31 // These are @deprecated:
32 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
33 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
35 static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
38 final DictionaryBuilder dict;
40 final IndexBuilder[] indexBuilders;
41 final Pattern[] langPatterns;
43 StringBuilder titleBuilder;
44 StringBuilder textBuilder;
45 StringBuilder currentBuilder = null;
/**
 * Creates a parser that feeds the given dictionary builder.
 *
 * @param builder dictionary builder whose index builders are used by this parser
 * @param langPatterns exactly two patterns identifying the two languages of interest
 * @param enIndexBuilder not read by this constructor; kept for interface compatibility
 */
public EnWiktionaryXmlParser(final DictionaryBuilder builder, final Pattern[] langPatterns, final int enIndexBuilder) {
  assert langPatterns.length == 2;
  // dict is final and is dereferenced on the next line, so it must be assigned first.
  this.dict = builder;
  this.indexBuilders = dict.indexBuilders.toArray(new IndexBuilder[0]);
  this.langPatterns = langPatterns;
}
55 public void startElement(String uri, String localName, String qName,
56 Attributes attributes) {
57 currentBuilder = null;
58 if ("page".equals(qName)) {
59 titleBuilder = new StringBuilder();
61 // Start with "\n" to better match certain strings.
62 textBuilder = new StringBuilder("\n");
63 } else if ("title".equals(qName)) {
64 currentBuilder = titleBuilder;
65 } else if ("text".equals(qName)) {
66 currentBuilder = textBuilder;
71 public void characters(char[] ch, int start, int length) throws SAXException {
72 if (currentBuilder != null) {
73 currentBuilder.append(ch, start, length);
78 public void endElement(String uri, String localName, String qName)
80 currentBuilder = null;
81 if ("page".equals(qName)) {
87 public void parse(final File file) throws ParserConfigurationException,
88 SAXException, IOException {
89 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
90 parser.parse(file, this);
93 private void endPage() {
94 title = titleBuilder.toString();
97 currentHeading = null;
98 WikiParser.parse(textBuilder.toString(), this);
100 for (final WikiWord word : words) {
101 System.out.println("\n" + title + ", " + word.language + ", pron=" + word.accentToPronunciation);
102 if (word.partsOfSpeech.isEmpty() && title.indexOf(":") == -1) {
103 System.err.println("Word with no POS: " + title);
105 for (final WikiWord.PartOfSpeech partOfSpeech : word.partsOfSpeech) {
106 System.out.println(" pos: " + partOfSpeech.name);
108 for (final TranslationSection translationSection : partOfSpeech.translationSections) {
109 System.out.println(" sense: " + translationSection.sense);
117 // ------------------------------------------------------------------------
118 // ------------------------------------------------------------------------
119 // ------------------------------------------------------------------------
120 // ------------------------------------------------------------------------
123 * Two things can happen:
125 * We can be in a ==German== section. There we will see English definitions.
126 * Each POS should get its own QuickDic entry. Pretty much everything goes
129 * Or we can be in an ==English== section with English definitions
130 * and maybe see translations for languages we care about.
132 * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
133 * into separate QuickDic entries, but that's tricky--how do we know when we
134 * found a subsection? Just ignore anything containing pronunciation and
137 * How do we decide when to seal the deal on an entry?
139 * Would be nice if the parser told us about leaving sections....
145 String currentHeading;
147 final List<WikiWord> words = new ArrayList<WikiWord>();
148 WikiWord currentWord;
149 WikiWord.PartOfSpeech currentPartOfSpeech;
150 WikiWord.TranslationSection currentTranslationSection;
152 StringBuilder wikiBuilder = null;
155 public void onWikiLink(String[] args) {
156 if (wikiBuilder == null) {
159 wikiBuilder.append(args[args.length - 1]);
162 // ttbc: translations to be checked.
163 static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
164 "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
165 "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
166 "zh-tsp", "zh-zh-p"));
167 static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList(""));
168 static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf"));
171 public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
172 final String name = positionalArgs.get(0);
175 if (name.equals("a")) {
177 currentWord.currentPronunciation = new StringBuilder();
178 currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
181 if (name.equals("IPA") || name.equals("SAMPA") || name.equals("enPR") || name.equals("rhymes")) {
182 namedArgs.remove("lang");
183 assert positionalArgs.size() >= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
184 if (currentWord.currentPronunciation == null) {
185 currentWord.currentPronunciation = new StringBuilder();
186 currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
188 currentWord.currentPronunciation.append(name).append(": ");
189 for (int i = 1; i < positionalArgs.size(); ++i) {
191 currentWord.currentPronunciation.append(", ");
193 final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
194 currentWord.currentPronunciation.append(pron).append("");
198 if (name.equals("audio")) {
201 if ("Pronunciation".equals(currentHeading)) {
202 System.err.println("Unhandled template: " + name);
206 if (name.equals("trans-top")) {
207 assert positionalArgs.size() == 2 && namedArgs.isEmpty();
208 currentTranslationSection = new WikiWord.TranslationSection();
209 currentPartOfSpeech.translationSections.add(currentTranslationSection);
210 if (positionalArgs.size() > 1) {
211 currentTranslationSection.sense = positionalArgs.get(1);
216 if (wikiBuilder == null) {
220 } else if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
221 wikiBuilder.append("{").append(name).append("}");
222 } else if (name.equals("p")) {
223 wikiBuilder.append("pl.");
224 } else if (name.equals("s")) {
225 wikiBuilder.append("sg.");
226 } else if (grammarTemplates.contains(name)) {
227 wikiBuilder.append(name).append(".");
228 } else if (name.equals("l")) {
229 wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
230 } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
231 if (positionalArgs.size() >= 2) {
232 wikiBuilder.append(positionalArgs.get(1));
234 if (positionalArgs.size() >= 3) {
235 wikiBuilder.append(" {").append(positionalArgs.get(1)).append("}");
237 final String transliteration = namedArgs.remove("tr");
238 if (transliteration != null) {
239 wikiBuilder.append(" (").append(transliteration).append(")");
241 } else if (name.equals("trreq")) {
242 wikiBuilder.append("{{trreq}}");
243 } else if (name.equals("qualifier")) {
244 wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
245 } else if (useRemainingArgTemplates.contains(name)) {
246 for (int i = 1; i < positionalArgs.size(); ++i) {
248 wikiBuilder.append(", ");
250 wikiBuilder.append(positionalArgs.get(i));
252 } else if (ignoreTemplates.contains(name)) {
253 } else if (name.equals("initialism")) {
254 wikiBuilder.append("Initialism");
256 if (currentTranslationSection != null) {
257 System.err.println("Unhandled template: " + name);
263 public void onText(String text) {
264 if (wikiBuilder != null) {
265 wikiBuilder.append(text);
271 public void onHeadingStart(int depth) {
272 wikiBuilder = new StringBuilder();
273 currentDepth = depth;
274 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
275 currentPartOfSpeech = null;
277 if (currentWord != null && depth <= currentWord.depth) {
283 public void onHeadingEnd(int depth) {
284 final String name = wikiBuilder.toString().trim();
286 currentTranslationSection = null;
287 currentHeading = name;
289 final boolean lang1 = langPatterns[0].matcher(name).matches();
290 final boolean lang2 = langPatterns[1].matcher(name).matches();
291 if (name.equalsIgnoreCase("English") || lang1 || lang2) {
292 currentWord = new WikiWord(depth);
293 currentWord.language = name;
294 currentWord.isLang1 = lang1;
295 currentWord.isLang2 = lang2;
296 words.add(currentWord);
300 if (currentWord == null) {
304 if (partOfSpeechHeader.matcher(name).matches()) {
305 currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
306 currentWord.partsOfSpeech.add(currentPartOfSpeech);
310 if (name.equals("Translations")) {
311 if (currentWord == null ||
312 !currentWord.language.equals("English") ||
313 currentPartOfSpeech == null) {
314 System.out.println("Unexpected Translations section: " + title);
317 currentTranslationSection = new WikiWord.TranslationSection();
318 currentPartOfSpeech.translationSections.add(currentTranslationSection);
321 if (name.equals("Translations")) {
322 if (currentWord == null ||
323 !currentWord.language.equals("English") ||
324 currentPartOfSpeech == null) {
325 System.out.println("Unexpected Translations section: " + title);
328 currentTranslationSection = new WikiWord.TranslationSection();
329 currentPartOfSpeech.translationSections.add(currentTranslationSection);
335 public void onListItemStart(String header, int[] section) {
336 wikiBuilder = new StringBuilder();
337 if (currentWord != null) {
338 currentWord.currentPronunciation = null;
344 public void onListItemEnd(String header, int[] section) {
345 final String item = wikiBuilder.toString();
348 if (item.indexOf("{{trreq}}") != -1) {
352 if (currentTranslationSection != null) {
353 final int colonPos = item.indexOf(':');
354 if (colonPos == -1) {
355 System.err.println("Invalid translation: " + item);
358 final String lang = item.substring(0, colonPos);
359 final String trans = item.substring(colonPos + 1);
360 for (int i = 0; i < 2; ++i) {
361 if (langPatterns[i].matcher(lang).find()) {
362 currentTranslationSection.translations.get(i).add(trans);
369 public void onNewLine() {
373 public void onNewParagraph() {
376 // ----------------------------------------------------------------------
379 public void onComment(String text) {
383 public void onFormatBold(boolean boldOn) {
387 public void onFormatItalic(boolean italicOn) {
391 public void onUnterminated(String start, String rest) {
392 throw new RuntimeException(rest);
395 public void onInvalidHeaderEnd(String rest) {
396 throw new RuntimeException(rest);