1 package com.hughes.android.dictionary.parser;
3 import java.io.BufferedInputStream;
4 import java.io.DataInputStream;
5 import java.io.EOFException;
7 import java.io.FileInputStream;
8 import java.io.IOException;
9 import java.util.Arrays;
10 import java.util.LinkedHashSet;
12 import java.util.regex.Pattern;
14 import com.hughes.android.dictionary.engine.DictionaryBuilder;
15 import com.hughes.android.dictionary.engine.IndexBuilder;
17 public class EnWiktionaryXmlParser {
19 static final Pattern partOfSpeechHeader = Pattern.compile(
20 "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
21 "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
22 "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
23 "Ligature|Idiom|Phrase|" +
24 // These are @deprecated:
25 "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
26 "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
27 // These are extras I found:
28 "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
29 "Particle|Interjection|Pronominal adverb" +
30 "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
32 final DictionaryBuilder dictBuilder;
34 final IndexBuilder[] indexBuilders;
35 final Pattern langPattern;
36 final Pattern langCodePattern;
37 final int enIndexBuilder;
39 public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern langPattern, final Pattern langCodePattern, final int enIndexBuilder) {
40 this.dictBuilder = dictBuilder;
41 this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
42 this.langPattern = langPattern;
43 this.langCodePattern = langCodePattern;
44 this.enIndexBuilder = enIndexBuilder;
48 public void parse(final File file, final int pageLimit) throws IOException {
50 final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
52 if (pageLimit >= 0 && pageCount >= pageLimit) {
58 title = dis.readUTF();
59 } catch (EOFException e) {
63 final String heading = dis.readUTF();
64 final int bytesLength = dis.readInt();
65 final byte[] bytes = new byte[bytesLength];
67 final String text = new String(bytes, "UTF8");
69 parseSection(title, heading, text);
72 if (pageCount % 1000 == 0) {
73 System.out.println("pageCount=" + pageCount);
78 private void parseSection(final String title, final String heading, final String text) {
79 if (title.startsWith("Wiktionary:") ||
80 title.startsWith("Template:") ||
81 title.startsWith("Appendix:") ||
82 title.startsWith("Category:") ||
83 title.startsWith("Index:") ||
84 title.startsWith("MediaWiki:") ||
85 title.startsWith("TransWiki:") ||
86 title.startsWith("Citations:") ||
87 title.startsWith("Concordance:") ||
88 title.startsWith("Help:")) {
92 if (heading.replaceAll("=", "").equals("English")) {
93 doEnglishWord(title, text);
95 //doForeignWord(title, text);
100 // -------------------------------------------------------------------------
105 private void doEnglishWord(String title, String text) {
106 final WikiLineReader wikiLineReader = new WikiLineReader(text);
108 while ((line = wikiLineReader.readLine()) != null) {
109 final WikiHeading wikiHeading = WikiHeading.getHeading(line);
110 if (wikiHeading != null) {
112 if (wikiHeading.depth <= posDepth) {
117 if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
118 posDepth = wikiHeading.depth;
119 pos = wikiHeading.name;
120 } else if (wikiHeading.name.equals("Translations")) {
121 doTranslations(title, wikiLineReader);
122 } else if (wikiHeading.name.equals("Pronunciation")) {
123 //doPronunciation(wikiLineReader);
130 private static Set<String> encodings = new LinkedHashSet<String>(Arrays.asList("zh-ts",
131 "sd-Arab", "ku-Arab", "Arab", "unicode", "Laoo", "ur-Arab", "Thai",
132 "fa-Arab", "Khmr", "zh-tsp", "Cyrl", "IPAchar", "ug-Arab", "ko-inline",
133 "Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
134 "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
136 private void doTranslations(final String title, final WikiLineReader wikiLineReader) {
139 boolean done = false;
140 while ((line = wikiLineReader.readLine()) != null) {
141 if (WikiHeading.getHeading(line) != null) {
142 wikiLineReader.stuffLine(line);
149 // Check whether we care about this line:
151 //line = WikiLineReader.removeSquareBrackets(line);
153 if (line.startsWith("{{")) {
155 WikiFunction wikiFunction;
156 while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
157 if (wikiFunction.name.equals("trans-top")) {
159 if (wikiFunction.args.size() >= 2) {
160 sense = wikiFunction.args.get(1);
161 //System.out.println("Sense: " + sense);
163 } else if (wikiFunction.name.equals("trans-bottom")) {
165 } else if (wikiFunction.name.equals("trans-mid")) {
166 } else if (wikiFunction.name.equals("trans-see")) {
167 } else if (wikiFunction.name.startsWith("checktrans")) {
170 System.err.println("Unexpected translation wikifunction: " + line + ", title=" + title);
172 line = wikiFunction.replaceWith(line, "");
176 } else if (line.startsWith("*")) {
177 // This line could produce an output...
179 // First strip the language and check whether it matches.
180 // And hold onto it for sub-lines.
181 final int colonIndex = line.indexOf(":");
182 if (colonIndex == -1) {
185 final String lang = line.substring(0, colonIndex);
186 if (!this.langPattern.matcher(lang).find()) {
190 String rest = line.substring(colonIndex + 1);
191 final StringBuilder lineText = new StringBuilder();
193 boolean ttbc = false;
194 WikiFunction wikiFunction;
195 while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
196 if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
197 if (wikiFunction.args.size() < 2) {
198 System.err.println("{{t}} with too few args: " + line + ", title=" + title);
201 final String langCode = wikiFunction.getArg(0);
202 if (this.langCodePattern.matcher(langCode).matches()) {
203 final String word = wikiFunction.getArg(1);
204 final String gender = wikiFunction.getArg(2);
205 final String transliteration = wikiFunction.getNamedArg("tr");
207 } else if (wikiFunction.name.equals("qualifier")) {
208 String qualifier = wikiFunction.getArg(0);
209 } else if (encodings.contains(wikiFunction.name)) {
210 rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0));
212 } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
213 String gender = wikiFunction.name;
214 for (int i = 0; i < wikiFunction.args.size(); ++i) {
215 gender += "|" + wikiFunction.getArg(i);
217 rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}");
219 } else if (wikiFunction.name.equals("g")) {
220 rest = wikiFunction.replaceWith(rest, "{g}");
222 } else if (wikiFunction.name.equals("l")) {
223 // encodes text in various langs.
224 rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(1));
225 // TODO: transliteration
227 } else if (wikiFunction.name.equals("term")) {
228 // cross-reference to another dictionary
229 rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0));
230 // TODO: transliteration
232 } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
233 // TODO: put this text aside to use it.
234 rest = wikiFunction.replaceWith(rest, "[" + wikiFunction.getArg(0) + "]");
236 } else if (wikiFunction.name.equals("ttbc")) {
238 } else if (wikiFunction.name.equals("trreq")) {
239 } else if (wikiFunction.name.equals("not used")) {
240 rest = wikiFunction.replaceWith(rest, "[not used]");
242 } else if (wikiFunction.name.equals("t-image")) {
243 // American sign language
244 } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
245 rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}");
248 System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
250 if (wikiFunction != null) {
251 rest = wikiFunction.replaceWith(rest, "");
254 } else if (line.equals("")) {
255 } else if (line.startsWith(":")) {
256 } else if (line.startsWith("[[") && line.endsWith("]]")) {
257 } else if (line.startsWith("''See''")) {
258 } else if (line.startsWith("''")) {
259 } else if (line.equals("----")) {
261 System.err.println("Unexpected translation line: " + line + ", title=" + title);
268 // -------------------------------------------------------------------------
270 private void doForeignWord(String title, String text) {
271 final WikiLineReader wikiLineReader = new WikiLineReader(text);
273 while ((line = wikiLineReader.readLine()) != null) {
274 final WikiHeading wikiHeading = WikiHeading.getHeading(line);
275 if (wikiHeading != null) {
277 if (wikiHeading.name.equals("Translations")) {
278 System.err.println("Translations not in English section: " + title);
279 } else if (wikiHeading.name.equals("Pronunciation")) {
280 //doPronunciation(wikiLineReader);
281 } else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {