1 package com.hughes.android.dictionary.parser;
4 import java.io.IOException;
5 import java.util.ArrayList;
7 import java.util.regex.Pattern;
9 import javax.xml.parsers.ParserConfigurationException;
10 import javax.xml.parsers.SAXParser;
11 import javax.xml.parsers.SAXParserFactory;
13 import org.xml.sax.Attributes;
14 import org.xml.sax.SAXException;
16 import com.hughes.android.dictionary.engine.DictionaryBuilder;
17 import com.hughes.android.dictionary.engine.IndexBuilder;
19 public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
21 final DictionaryBuilder dict;
23 final IndexBuilder[] indexBuilders;
24 final Pattern[] langPatterns;
26 StringBuilder titleBuilder;
27 StringBuilder textBuilder;
28 StringBuilder currentBuilder = null;
/**
 * Creates a parser bound to the given dictionary builder.
 *
 * @param builder dictionary builder; stored in {@code dict}, and its index builders are
 *     copied into the {@code indexBuilders} array
 * @param langPatterns patterns used to recognise the two languages of interest; must have
 *     exactly two elements (checked with an {@code assert}, so only when assertions are enabled)
 * @param enIndexBuilder not used by this constructor
 */
public EnWiktionaryXmlParser(final DictionaryBuilder builder, final Pattern[] langPatterns, final int enIndexBuilder) {
  assert langPatterns.length == 2;
  // The final field must be assigned before it is read; read the index builders from the
  // parameter so the snapshot does not depend on field initialisation order.
  this.dict = builder;
  this.indexBuilders = builder.indexBuilders.toArray(new IndexBuilder[0]);
  this.langPatterns = langPatterns;
}
38 public void startElement(String uri, String localName, String qName,
39 Attributes attributes) {
40 currentBuilder = null;
41 if ("page".equals(qName)) {
42 titleBuilder = new StringBuilder();
44 // Start with "\n" to better match certain strings.
45 textBuilder = new StringBuilder("\n");
46 } else if ("title".equals(qName)) {
47 currentBuilder = titleBuilder;
48 } else if ("text".equals(qName)) {
49 currentBuilder = textBuilder;
54 public void characters(char[] ch, int start, int length) throws SAXException {
55 if (currentBuilder != null) {
56 currentBuilder.append(ch, start, length);
// SAX callback for a closing tag: stops capturing character data and reacts to the end of a
// "page" element.
// NOTE(review): the remainder of the signature and the body of the "page" branch are not
// visible in this listing; presumably the branch finishes the page via endPage() -- confirm.
61 public void endElement(String uri, String localName, String qName)
// The element that selected the builder has ended, so stop accumulating text.
63 currentBuilder = null;
64 if ("page".equals(qName)) {
70 public void parse(final File file) throws ParserConfigurationException,
71 SAXException, IOException {
72 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
73 parser.parse(file, this);
// Handles a completed page: stores the accumulated title text in the `title` field, then
// passes the accumulated page text to WikiParser.parse with this object as the callback.
// NOTE(review): statements between the visible lines are missing from this listing.
76 private void endPage() {
77 title = titleBuilder.toString();
// textBuilder was seeded with "\n" when the page started, so the text passed on begins with a newline.
80 WikiParser.parse(textBuilder.toString(), this);
84 * Two things can happen:
86 * We can be in a ==German== section. There we will see English definitions.
87 * Each POS should get its own QuickDic entry. Pretty much everything goes
90 * Or we can be in an ==English== section with English definitions
91 * and maybe see translations for languages we care about.
93 * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
94 * into separate QuickDic entries, but that's tricky--how do we know when we
95 * found a subsection? Just ignore anything containing pronunciation and
98 * How do we decide when to seal the deal on an entry?
100 * Would be nice if the parser told us about leaving sections....
107 final List<WikiWord> words = new ArrayList<WikiWord>();
108 WikiWord currentWord;
109 WikiWord.PartOfSpeech currentPartOfSpeech;
110 WikiWord.TranslationSection currentTranslationSection;
112 StringBuilder wikiBuilder = null;
114 // ------------------------------------------------------------------------
117 public void onWikiLink(String[] args) {
118 if (wikiBuilder != null) {
119 wikiBuilder.append(args[args.length - 1]);
// Callback for a wiki template.
// The template name is read from args[0][1]; this throws if args is empty or args[0] has
// fewer than two elements.
// NOTE(review): the code that acts on `name` is not visible in this listing; presumably
// known templates are routed to dedicated handlers (e.g. onTransTrop) -- confirm.
124 public void onTemplate(String[][] args) {
125 final String name = args[0][1];
129 //System.out.println("Unhandled template: " + name);
134 public void onText(String text) {
135 if (wikiBuilder != null) {
136 wikiBuilder.append(text);
// Callback at the start of a heading of the given depth.
// Starts a fresh wikiBuilder so the heading's text can be collected, and records the depth.
142 public void onHeadingStart(int depth) {
143 wikiBuilder = new StringBuilder();
144 currentDepth = depth;
// A heading whose depth is less than or equal to that of the open part of speech ends it.
145 if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
146 currentPartOfSpeech = null;
// Same depth comparison for the current word.
// NOTE(review): the body of this branch is not visible in this listing; presumably it clears currentWord -- confirm.
148 if (currentWord != null && depth <= currentWord.depth) {
153 final Pattern partOfSpeechHeader = Pattern.compile(
154 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
155 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
156 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
157 "Ligature|Idiom|Phrase|" +
158 // These are @deprecated:
159 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
160 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
// Callback at the end of a heading. Classifies the heading by its collected text and updates
// the current word / part of speech / translation section accordingly.
// NOTE(review): several lines of this method (returns, closing braces, possibly an else) are
// missing from this listing, so the exact control flow between the sections below cannot be
// confirmed from here.
163 public void onHeadingEnd(int depth) {
// Heading text = everything collected in wikiBuilder since the heading started, trimmed.
164 final String name = wikiBuilder.toString().trim();
// Full-match the heading against the two caller-supplied language patterns.
167 final boolean lang1 = langPatterns[0].matcher(name).matches();
168 final boolean lang2 = langPatterns[1].matcher(name).matches();
// Language heading ("English" in any letter case, or either configured language):
// start a new WikiWord at this depth and record it in `words`.
169 if (name.equalsIgnoreCase("English") || lang1 || lang2) {
170 currentWord = new WikiWord(depth);
171 currentWord.language = name;
172 currentWord.isLang1 = lang1;
173 currentWord.isLang2 = lang2;
174 words.add(currentWord);
// NOTE(review): the body of this null check is not visible; presumably headings outside a
// recognised language section are ignored -- confirm.
178 if (currentWord == null) {
// Part-of-speech heading: open a new PartOfSpeech under the current word.
182 if (partOfSpeechHeader.matcher(name).matches()) {
183 currentPartOfSpeech = new WikiWord.PartOfSpeech(depth);
184 currentWord.partsOfSpeech.add(currentPartOfSpeech);
// "Translations" heading (exact, case-sensitive match).
188 if (name.equals("Translations")) {
// Reported as unexpected unless we are inside a part of speech of a word whose language is
// exactly "English". NOTE(review): this comparison is case-sensitive, whereas the language
// heading check above ignores case -- confirm that is intended.
189 if (currentWord == null ||
190 !currentWord.language.equals("English") ||
191 currentPartOfSpeech == null) {
192 System.out.println("Unexpected Translations section: " + title);
// Open a new translation section under the current part of speech.
195 currentTranslationSection = new WikiWord.TranslationSection();
196 currentPartOfSpeech.translationSections.add(currentTranslationSection);
// NOTE(review): the branch this reset belongs to is not visible; presumably it is the
// else-branch for headings other than "Translations" -- confirm.
198 currentTranslationSection = null;
203 public void onListItemStart(String header, int[] section) {
204 wikiBuilder = new StringBuilder();
// Callback at the end of a list item. While a translation section is open, the item is read
// as "<language>: <translation>" and the translation is filed under each matching language.
// NOTE(review): lines are missing from this listing after the log statement and at the end
// of the method, so the handling of invalid items and any cleanup cannot be confirmed.
209 public void onListItemEnd(String header, int[] section) {
// Item text = everything collected in wikiBuilder since the list item started.
210 final String item = wikiBuilder.toString();
213 if (currentTranslationSection != null) {
// Split at the first colon; items without one are reported.
214 final int colonPos = item.indexOf(':');
215 if (colonPos == -1) {
216 System.out.println("Invalid translation: " + item);
// Neither part is trimmed, so `trans` keeps any whitespace that follows the colon.
219 final String lang = item.substring(0, colonPos);
220 final String trans = item.substring(colonPos + 1);
// find() accepts a pattern match anywhere in the language prefix (headings use matches()).
221 for (int i = 0; i < 2; ++i) {
222 if (langPatterns[i].matcher(lang).find()) {
223 currentTranslationSection.translations.get(i).add(trans);
230 public void onNewLine() {
234 public void onNewParagraph() {
237 // ----------------------------------------------------------------------
239 public void onTransTrop(final String[][] args) {
240 currentTranslationSection = new WikiWord.TranslationSection();
241 currentPartOfSpeech.translationSections.add(currentTranslationSection);
243 if (args.length > 1) {
244 currentTranslationSection.sense = args[1][1];
249 // ----------------------------------------------------------------------
252 public void onComment(String text) {
256 public void onFormatBold(boolean boldOn) {
260 public void onFormatItalic(boolean italicOn) {
264 public void onUnterminated(String start, String rest) {
265 throw new RuntimeException(rest);
// Callback for an invalid header end. Aborts parsing by throwing a RuntimeException whose
// message is `rest`.
// NOTE(review): what exactly counts as an invalid header end is decided by the caller of
// this callback and is not visible here.
268 public void onInvalidHeaderEnd(String rest) {
269 throw new RuntimeException(rest);