1 package com.hughes.android.dictionary.parser;
3 import java.io.BufferedInputStream;
4 import java.io.DataInputStream;
5 import java.io.EOFException;
7 import java.io.FileInputStream;
8 import java.io.IOException;
9 import java.util.Arrays;
10 import java.util.LinkedHashSet;
12 import java.util.regex.Pattern;
14 import com.hughes.android.dictionary.engine.DictionaryBuilder;
15 import com.hughes.android.dictionary.engine.EntryTypeName;
16 import com.hughes.android.dictionary.engine.IndexBuilder;
17 import com.hughes.android.dictionary.engine.IndexedEntry;
18 import com.hughes.android.dictionary.engine.PairEntry;
19 import com.hughes.android.dictionary.engine.PairEntry.Pair;
21 public class EnWiktionaryXmlParser {
23 static final Pattern partOfSpeechHeader = Pattern.compile(
24 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
25 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
26 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
27 "Ligature|Idiom|Phrase|" +
28 // These are @deprecated:
29 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
30 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
31 // These are extras I found:
32 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
33 "Particle|Interjection|Pronominal adverb" +
34 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
// Receives entries for English text: doTranslationLine registers the page title and the
// translation sense here.
36 final IndexBuilder enIndexBuilder;
// Receives entries for the other language: translated words, transliterations and other
// text found on a translation line.
37 final IndexBuilder otherIndexBuilder;
// Tested with find() against the language label of a translation bullet line.
38 final Pattern langPattern;
// Tested with matches() against the first argument (language code) of {{t}}-style templates.
39 final Pattern langCodePattern;
/**
 * Creates a parser that writes into the two given index builders.
 *
 * @param enIndexBuilder index for the English side of each entry
 * @param otherIndexBuilder index for the other language's side of each entry
 * @param langPattern pattern tested against the language label of each translation line
 * @param langCodePattern pattern tested against the language code of translation templates
 * @param swap NOTE(review): doTranslationLine passes a value named swap to the Pair
 *     constructor -- confirm that this argument is what ends up there
 */
42 public EnWiktionaryXmlParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
43 this.enIndexBuilder = enIndexBuilder;
44 this.otherIndexBuilder = otherIndexBuilder;
45 this.langPattern = langPattern;
46 this.langCodePattern = langCodePattern;
/**
 * Reads page records from {@code file} and passes each one to parseSection.
 *
 * A record is read as a UTF string (the title), a UTF string (the heading), an int byte
 * count, and a byte array of that size which is decoded as UTF8 text.
 *
 * @param file input file containing the page records
 * @param pageLimit page limit; the limit check is skipped when this is negative
 * @throws IOException declared for the stream operations
 */
51 public void parse(final File file, final int pageLimit) throws IOException {
53 final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
// The limit only applies when pageLimit is non-negative.
55 if (pageLimit >= 0 && pageCount >= pageLimit) {
61 title = dis.readUTF();
// NOTE(review): EOF while reading a title is presumably the normal end of input -- confirm.
62 } catch (EOFException e) {
66 final String heading = dis.readUTF();
67 final int bytesLength = dis.readInt();
68 final byte[] bytes = new byte[bytesLength];
70 final String text = new String(bytes, "UTF8");
72 parseSection(title, heading, text);
// Progress report every 1000 pages.
75 if (pageCount % 1000 == 0) {
76 System.out.println("pageCount=" + pageCount);
/**
 * Routes one section of a page to the English or the foreign-word handler.
 *
 * @param title page title; titles carrying one of the namespace prefixes below are tested first
 * @param heading section heading; every '=' is removed before it is compared with "English"
 * @param text wiki text of the section
 */
81 private void parseSection(final String title, final String heading, final String text) {
// Pages in these namespaces are not dictionary entries.
// NOTE(review): presumably they are skipped here -- confirm.
82 if (title.startsWith("Wiktionary:") ||
83 title.startsWith("Template:") ||
84 title.startsWith("Appendix:") ||
85 title.startsWith("Category:") ||
86 title.startsWith("Index:") ||
87 title.startsWith("MediaWiki:") ||
88 title.startsWith("TransWiki:") ||
89 title.startsWith("Citations:") ||
90 title.startsWith("Concordance:") ||
91 title.startsWith("Help:")) {
// Strip the '=' heading markup before comparing the heading name.
95 if (heading.replaceAll("=", "").equals("English")) {
96 doEnglishWord(title, text);
98 doForeignWord(title, text);
103 // -------------------------------------------------------------------------
/**
 * Scans the English section of a page, heading by heading.
 * A part-of-speech heading updates pos and posDepth, a "Translations" heading hands the
 * line reader to doTranslations, and a "Pronunciation" heading is currently ignored.
 *
 * @param title page title
 * @param text wiki text of the English section
 */
108 private void doEnglishWord(String title, String text) {
109 final WikiLineReader wikiLineReader = new WikiLineReader(text);
111 while ((line = wikiLineReader.readLine()) != null) {
112 final WikiHeading wikiHeading = WikiHeading.getHeading(line);
113 if (wikiHeading != null) {
// A heading at or above the depth of the current part of speech.
// NOTE(review): presumably this clears the remembered part of speech -- confirm.
115 if (wikiHeading.depth <= posDepth) {
// Remember the part of speech and its depth for the headings that follow.
120 if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
121 posDepth = wikiHeading.depth;
122 pos = wikiHeading.name;
123 } else if (wikiHeading.name.equals("Translations")) {
124 doTranslations(title, wikiLineReader);
125 } else if (wikiHeading.name.equals("Pronunciation")) {
126 //doPronunciation(wikiLineReader);
133 private static Set<String> encodings = new LinkedHashSet<String>(Arrays.asList("zh-ts",
134 "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai",
135 "fa-Arab", "Khmr", "zh-tsp", "Cyrl", "IPAchar", "ug-Arab", "ko-inline",
136 "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
137 "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
/**
 * Consumes the lines of a "Translations" subsection.
 * Template lines (starting with "{{") drive the current sense, bullet lines (starting with
 * "*") whose language label matches langPattern are forwarded to doTranslationLine, and a
 * few other known line shapes are accepted silently. Anything else is reported on stderr.
 *
 * @param title page title, used for the entries and in diagnostics
 * @param wikiLineReader reader positioned just after the "Translations" heading
 */
139 private void doTranslations(final String title, final WikiLineReader wikiLineReader) {
142 boolean done = false;
143 while ((line = wikiLineReader.readLine()) != null) {
// The next heading belongs to the caller, so it is pushed back onto the reader.
144 if (WikiHeading.getHeading(line) != null) {
145 wikiLineReader.stuffLine(line);
152 // Check whether we care about this line:
154 //line = WikiLineReader.removeSquareBrackets(line);
156 if (line.startsWith("{{")) {
158 WikiFunction wikiFunction;
159 while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
160 if (wikiFunction.name.equals("trans-top")) {
// The first argument of trans-top becomes the current sense.
162 if (wikiFunction.args.size() >= 1) {
163 sense = wikiFunction.args.get(0);
164 //System.out.println("Sense: " + sense);
166 } else if (wikiFunction.name.equals("trans-bottom")) {
168 } else if (wikiFunction.name.equals("trans-mid")) {
169 } else if (wikiFunction.name.equals("trans-see")) {
170 } else if (wikiFunction.name.startsWith("checktrans")) {
173 System.err.println("Unexpected translation wikifunction: " + line + ", title=" + title);
// NOTE(review): presumably removes the handled template so the loop can find the next one.
175 line = wikiFunction.replaceWith(line, "");
179 } else if (line.startsWith("*")) {
180 // This line could produce an output...
182 // First strip the language and check whether it matches.
183 // And hold onto it for sub-lines.
184 final int colonIndex = line.indexOf(":");
185 if (colonIndex == -1) {
// Everything before the first ':' is the language label; it still includes the leading '*'.
189 final String lang = line.substring(0, colonIndex);
190 if (!this.langPattern.matcher(lang).find()) {
194 String rest = line.substring(colonIndex + 1).trim();
195 doTranslationLine(line, title, sense, rest);
// The line shapes below are recognised and deliberately produce no output.
197 } else if (line.equals("")) {
198 } else if (line.startsWith(":")) {
199 } else if (line.startsWith("[[") && line.endsWith("]]")) {
200 } else if (line.startsWith("''See''")) {
201 } else if (line.startsWith("''")) {
202 } else if (line.equals("----")) {
204 System.err.println("Unexpected translation line: " + line + ", title=" + title);
/**
 * Builds one PairEntry from a single translation line and registers it in both indices.
 * The other-language text is assembled by walking the wiki templates in {@code rest};
 * the English text is the title followed by the sense and the part of speech in parentheses.
 *
 * @param line the full original line, used in diagnostic messages
 * @param title the English page title
 * @param sense the current translation sense, as set from trans-top by doTranslations
 * @param rest the text after the language label; it is consumed template by template
 */
211 private void doTranslationLine(final String line, final String title, final String sense, String rest) {
213 // Good chance we'll actually file this one...
214 final PairEntry pairEntry = new PairEntry();
215 final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
217 final StringBuilder otherText = new StringBuilder();
219 WikiFunction wikiFunction;
220 while ((wikiFunction = WikiFunction.getFunction(rest)) != null) {
// Plain text in front of the template is kept and indexed as other-language text.
221 if (wikiFunction.start > 0) {
222 String plainText = rest.substring(0, wikiFunction.start);
223 otherText.append("").append(plainText);
224 otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
// Continue with whatever follows the template.
226 rest = rest.substring(wikiFunction.end);
// Translation templates: arg 0 is the language code, arg 1 the word, arg 2 the gender,
// and the named argument "tr" the transliteration.
228 if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
229 if (wikiFunction.args.size() < 2) {
230 System.err.println("{{t}} with too few args: " + line + ", title=" + title);
233 final String langCode = wikiFunction.getArg(0);
234 if (this.langCodePattern.matcher(langCode).matches()) {
235 final String word = wikiFunction.getArg(1);
236 final String gender = wikiFunction.getArg(2);
237 final String transliteration = wikiFunction.getNamedArg("tr");
238 if (otherText.length() > 0) {
// NOTE(review): appending "" is a no-op; a separator may have been intended -- confirm.
239 otherText.append("");
241 otherText.append(word);
242 otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
243 if (gender != null) {
244 otherText.append(String.format(" {%s}", gender));
246 if (transliteration != null) {
247 otherText.append(String.format(" (tr. %s)", transliteration));
248 otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
// Qualifier: shown in parentheses, not indexed.
251 } else if (wikiFunction.name.equals("qualifier")) {
252 String qualifier = wikiFunction.getArg(0);
253 if (!wikiFunction.namedArgs.isEmpty() || wikiFunction.args.size() > 1) {
254 System.err.println("weird qualifier: " + line);
256 otherText.append("(").append(qualifier).append(")");
// Script/encoding templates: the first argument is kept and indexed.
257 } else if (encodings.contains(wikiFunction.name)) {
258 otherText.append("").append(wikiFunction.getArg(0));
259 otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
// Gender templates are rendered as {name|arg|...} and not indexed.
260 } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
261 otherText.append("{");
262 otherText.append(wikiFunction.name);
263 for (int i = 0; i < wikiFunction.args.size(); ++i) {
264 otherText.append("|").append(wikiFunction.getArg(i));
266 otherText.append("}");
267 } else if (wikiFunction.name.equals("g")) {
268 otherText.append("{g}");
269 } else if (wikiFunction.name.equals("l")) {
270 // encodes text in various langs.
272 otherText.append("").append(wikiFunction.getArg(1));
273 otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(1), EntryTypeName.WIKTIONARY_OTHER_TEXT);
274 // TODO: transliteration
275 } else if (wikiFunction.name.equals("term")) {
276 // cross-reference to another dictionary
277 otherText.append("").append(wikiFunction.getArg(0));
278 otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
279 // TODO: transliteration
280 } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
281 // TODO: put this text aside to use it.
282 otherText.append("[").append(wikiFunction.getArg(0)).append("]");
283 otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
// ttbc and trreq are recognised and contribute nothing.
284 } else if (wikiFunction.name.equals("ttbc")) {
285 } else if (wikiFunction.name.equals("trreq")) {
286 } else if (wikiFunction.name.equals("not used")) {
287 otherText.append("(not used)");
288 } else if (wikiFunction.name.equals("t-image")) {
289 // American sign language
// An argument-less template that is not known above is rendered as a visible placeholder.
290 } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
291 otherText.append("{UNK. FUNC.: ").append(wikiFunction.name).append("}");
293 System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
// Whatever remains after the last template is kept and indexed as plain text.
296 String plainText = rest;
297 otherText.append("").append(plainText);
298 otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
// English side: title, then " (sense)" and " (part of speech)".
300 StringBuilder englishText = new StringBuilder();
302 englishText.append(title);
304 englishText.append(" (").append(sense).append(")");
// NOTE(review): the same EntryTypeName is passed for both type arguments here, unlike the
// SINGLE/MULTI pairs used elsewhere in this method -- confirm that this is intended.
305 enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
308 englishText.append(" (").append(pos.toLowerCase()).append(")");
310 enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
312 final Pair pair = new Pair(englishText.toString(), WikiParser.simpleParse(otherText.toString()), swap);
313 pairEntry.pairs.add(pair);
// The add() sits inside the assert, so pairsAdded is only filled when assertions are
// enabled; in that case adding a pair whose string form was already seen fails the assertion.
314 assert (pairsAdded.add(pair.toString()));
// NOTE(review): looks like a leftover debugging hook for one specific pair.
315 if (pair.toString().equals("libero {m} :: free (adjective)")) {
316 System.out.println();
// String forms of the pairs created so far. It is only written from inside an assert
// statement in doTranslationLine, so it stays empty when assertions are disabled.
321 Set<String> pairsAdded = new LinkedHashSet<String>();
323 // -------------------------------------------------------------------------
/**
 * Scans a non-English section of a page, heading by heading.
 * A "Translations" heading is reported on stderr as unexpected, a "Pronunciation" heading
 * is currently ignored, and a part-of-speech heading is passed to doPartOfSpeech.
 *
 * @param title page title
 * @param text wiki text of the section
 */
325 private void doForeignWord(String title, String text) {
326 final WikiLineReader wikiLineReader = new WikiLineReader(text);
328 while ((line = wikiLineReader.readLine()) != null) {
329 final WikiHeading wikiHeading = WikiHeading.getHeading(line);
330 if (wikiHeading != null) {
331 if (wikiHeading.name.equals("Translations")) {
332 System.err.println("Translations not in English section: " + title);
333 } else if (wikiHeading.name.equals("Pronunciation")) {
334 //doPronunciation(wikiLineReader);
335 } else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
336 doPartOfSpeech(title, wikiHeading, wikiLineReader);
/**
 * Handles the lines under one part-of-speech heading of a non-English section.
 * Currently it prints the title, the heading name and lines to standard output. A heading
 * whose depth is not greater than that of {@code posHeading} is pushed back onto the reader.
 *
 * @param title page title
 * @param posHeading the part-of-speech heading that introduced these lines
 * @param wikiLineReader reader positioned just after that heading
 */
343 private void doPartOfSpeech(String title, final WikiHeading posHeading, WikiLineReader wikiLineReader) {
345 System.out.println("***" + title);
346 System.out.println(posHeading.name);
347 while ((line = wikiLineReader.readLine()) != null) {
348 WikiHeading heading = WikiHeading.getHeading(line);
349 if (heading != null) {
350 if (heading.depth <= posHeading.depth) {
351 wikiLineReader.stuffLine(line);
355 System.out.println(line);