1 package com.hughes.android.dictionary.parser;
3 import java.util.ArrayList;
4 import java.util.LinkedHashMap;
5 import java.util.LinkedHashSet;
9 import java.util.regex.Pattern;
11 import com.hughes.android.dictionary.engine.DictionaryBuilder;
12 import com.hughes.android.dictionary.engine.IndexedEntry;
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.IndexBuilder;
15 import com.hughes.android.dictionary.engine.PairEntry;
16 import com.hughes.android.dictionary.engine.PairEntry.Pair;
17 import com.hughes.util.ListUtil;
/**
 * Data collected for one titled word (pronunciations and parts of speech),
 * together with the methods that write it into a DictionaryBuilder as
 * PairEntry rows and index tokens.
 */
19 public class WikiWord {
// Pronunciation text keyed by accent label; LinkedHashMap keeps insertion order.
// wikiWordToQuickDic emits one Pair per entry, prefixing non-empty accents with "accent: ".
27 final Map<String, StringBuilder> accentToPronunciation = new LinkedHashMap<String, StringBuilder>();
// Not referenced by the conversion methods of this class; presumably the
// builder the parser is currently appending to -- TODO confirm against the parser.
28 StringBuilder currentPronunciation = null;
// Each element is converted by partOfSpeechToQuickDic.
30 final List<PartOfSpeech> partsOfSpeech = new ArrayList<WikiWord.PartOfSpeech>();
/**
 * @param title word title; stored interned.
 * @param depth NOTE(review): meaning not evident from the conversion code -- confirm with the caller.
 */
32 public WikiWord(final String title, int depth) {
33 this.title = title.intern();
/**
 * One part-of-speech section of a word: its meanings, translation senses
 * and "form of" references.
 */
37 static class PartOfSpeech {
// Turned into meaning/example pairs by partOfSpeechToQuickDic.
41 final List<Meaning> meanings = new ArrayList<WikiWord.Meaning>();
// Turned into translation entries by translationSensesToQuickDic.
43 final List<TranslationSense> translationSenses = new ArrayList<WikiWord.TranslationSense>();
// Each becomes its own PairEntry in partOfSpeechToQuickDic.
45 final List<FormOf> formOfs = new ArrayList<WikiWord.FormOf>();
/**
 * @param depth NOTE(review): meaning not evident from the conversion code -- confirm.
 * @param name part-of-speech name; stored interned.
 */
47 public PartOfSpeech(final int depth, String name) {
49 this.name = name.intern();
/** Creates a new Meaning and appends it to {@link #meanings}. */
52 public Meaning newMeaning() {
53 final Meaning meaning = new Meaning();
54 meanings.add(meaning);
/** Returns the last meaning, first creating one via newMeaning() if the list is empty. */
58 public Meaning lastMeaning() {
59 return meanings.isEmpty() ? newMeaning() : ListUtil.getLast(meanings);
/** The translations recorded for one sense of a part of speech. */
63 static class TranslationSense {
// Holds two lists; translationSensesToQuickDic reads translations.get(0) and
// translations.get(1) as the two sides of the resulting entry.
65 List<List<Translation>> translations = new ArrayList<List<Translation>>();
67 translations.add(new ArrayList<Translation>());
68 translations.add(new ArrayList<Translation>());
/** A single translation: a language label and the translated text. */
72 static class Translation {
76 public Translation(final String language, final String text) {
77 this.language = language;
/** Renders as "language: text". */
82 public String toString() {
83 return language + ": " + text;
// Grammar-form label; partOfSpeechToQuickDic prints it as "title: grammarForm: target".
88 final String grammarForm;
/**
 * @param grammarForm grammar-form label stored on this object.
 * @param token NOTE(review): presumably stored as the {@code target} field that
 *     partOfSpeechToQuickDic reads -- confirm.
 */
91 public FormOf(final String grammarForm, final String token) {
92 this.grammarForm = grammarForm;
/** One meaning of a part of speech, with the examples attached to it. */
97 static class Meaning {
99 final List<Example> examples = new ArrayList<WikiWord.Example>();
/** Creates a new Example and appends it to {@link #examples}. */
101 public Example newExample() {
102 final Example example = new Example();
103 this.examples.add(example);
/** Returns the last example, first creating one via newExample() if the list is empty. */
107 public Example lastExample() {
108 return examples.isEmpty() ? newExample() : ListUtil.getLast(examples);
/**
 * A usage example and its English rendering. Both are mutable builders:
 * partOfSpeechToQuickDic may move text from {@code example} into
 * {@code exampleInEnglish} when splitting at a dash.
 */
112 static class Example {
114 final StringBuilder example = new StringBuilder();
115 final StringBuilder exampleInEnglish = new StringBuilder();
118 // -------------------------------------------------------------------------
/**
 * Writes this word into {@code dictBuilder}: every part of speech is converted
 * via partOfSpeechToQuickDic, then a pronunciation entry is built from
 * {@link #accentToPronunciation}.
 *
 * @param dictBuilder receives the new pair entries and index registrations.
 * @param enIndexBuilder position of the English index in
 *     {@code dictBuilder.indexBuilders}; it is compared against -1 downstream,
 *     so -1 presumably means "no English index" -- confirm with the caller.
 */
120 void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) {
121 //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation);
// Warn about a word without any part of speech, unless the title contains ':'
// or the language is "Translingual".
122 if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) {
123 System.err.println("Word with no POS: " + title);
125 for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) {
126 partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
// One PairEntry collects all pronunciations, one Pair per accent.
131 final PairEntry pronEntry = new PairEntry();
132 for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
133 String accent = accentToPron.getKey();
134 if (accent.length() > 0) {
135 accent = accent + ": ";
// Second text is left empty; the third argument is true when index != 0.
137 pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0));
// Only store and index the entry when at least one pronunciation was added.
139 if (pronEntry.pairs.size() > 0) {
// The entry's number is the current size of pairEntries, taken before the add below.
140 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pronEntry);
141 dictBuilder.dictionary.pairEntries.add(pronEntry);
// Index the pronunciation entry under the tokens of the title.
142 final Set<String> tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
143 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION);
// Matches "{" followed by any run of non-comma characters and the comma that ends it.
// partOfSpeechToQuickDic removes these matches from a meaning before tokenizing it,
// so template names are not indexed.
149 static final Pattern templateName = Pattern.compile("\\{[^,]*,");
/**
 * Converts one part of speech into dictionary entries: one entry per "form of"
 * reference, one entry per meaning (with its examples), and finally the
 * translation senses via translationSensesToQuickDic.
 *
 * @param dictBuilder receives the new pair entries and index registrations.
 * @param enIndexBuilder position of the English index in {@code dictBuilder.indexBuilders}.
 * @param partOfSpeech the part of speech to convert.
 */
150 private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder,
151 final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
152 //System.out.println(" pos: " + partOfSpeech.name);
// Walk over meanings and examples; the statements here are commented-out debug prints.
154 for (final WikiWord.Meaning meaning : partOfSpeech.meanings) {
155 //System.out.println(" meaning: " + meaning.meaning);
156 for (final WikiWord.Example example : meaning.examples) {
157 if (example.example.length() > 0) {
158 //System.out.println(" example: " + example.example);
160 if (example.exampleInEnglish.length() > 0) {
161 //System.out.println(" exampleInEnglish: " + example.exampleInEnglish);
// "Form of" entries: text is "title: grammarForm: target" with an empty second side.
167 final boolean formOfSwap = index != 0;
168 for (final FormOf formOf : partOfSpeech.formOfs) {
169 final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap);
170 final PairEntry pairEntry = new PairEntry();
171 pairEntry.pairs.add(pair);
172 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
173 dictBuilder.dictionary.pairEntries.add(pairEntry);
175 // File under the tokens of the form-of target.
176 final Set<String> tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR);
177 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF);
// Meaning entries need both an English index and this word's own index, and the two must differ.
182 if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) {
183 final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")";
184 final boolean swap = enIndexBuilder == 1;
187 for (final Meaning meaning : partOfSpeech.meanings) {
188 final PairEntry pairEntry = new PairEntry();
189 final List<Pair> pairs = pairEntry.pairs;
// Tokens seen in examples, kept separately for side 0 and side 1.
191 final List<Set<String>> exampleTokens = new ArrayList<Set<String>>();
192 exampleTokens.add(new LinkedHashSet<String>());
193 exampleTokens.add(new LinkedHashSet<String>());
// The meaning text is paired with "title (part of speech)".
195 if (meaning.meaning != null && meaning.meaning.length() > 0) {
196 final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap);
197 pairs.add(meaningPair);
199 System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name);
203 for (final Example example : meaning.examples) {
// No English text yet but the example contains an em dash: the text after the
// dash (trimmed) becomes the English side, and the example is cut at the dash.
204 final int dashIndex = example.example.indexOf("—");
205 if (example.exampleInEnglish.length() == 0 && dashIndex != -1) {
206 System.out.println("Splitting example: title=" + title + ", "+ example.example);
207 example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim());
208 example.example.delete(dashIndex, example.example.length());
// Only examples that have both texts produce a pair.
211 if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) {
212 final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap);
// Collect the tokens of each side of the pair for indexing further down.
215 for (int i = 0; i < 2; ++i) {
216 exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR));
221 // Create EntryData with the PairEntry.
222 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
223 dictBuilder.dictionary.pairEntries.add(pairEntry);
225 // File under title token.
// A title with exactly one token is typed ONE_WORD, otherwise MULTI_WORD.
226 final Set<String> titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
227 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD);
229 // File under the meaning tokens (English):
230 if (meaning.meaning != null) {
231 // If the meaning contains any templates, strip out the template name
232 // so we don't index it.
233 final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll("");
234 final Set<String> meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR);
235 dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD);
238 // File under other tokens that we saw.
// Example tokens for side i go into index builder i.
239 for (int i = 0; i < 2; ++i) {
240 dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS);
248 translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
/**
 * Builds one dictionary entry per translation sense of {@code partOfSpeech}.
 * Each entry is a single Pair whose two texts are assembled per side from the
 * sense's two translation lists; the English side additionally starts with
 * "title (part of speech[: sense])".
 *
 * @param dictBuilder receives the new pair entries and index registrations.
 * @param enIndexBuilder side (0 or 1) that holds the English text, or -1 for none.
 * @param partOfSpeech the part of speech whose translation senses are converted.
 */
252 private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder,
253 final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
// Translation sections are only expected on English words; anything else is reported on stderr.
254 if (!partOfSpeech.translationSenses.isEmpty()) {
255 if (!language.equals("English")) {
256 System.err.println("Translation sections not in English.");
// The %s placeholder is filled per sense with ": <sense>" or with the empty string.
259 final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)";
261 for (final TranslationSense translationSense : partOfSpeech.translationSenses) {
262 //System.out.println(" sense: " + translationSense.sense);
263 if (translationSense.sense == null) {
264 //System.err.println(" null sense: " + title);
266 String englishSense = String.format(englishBase, translationSense.sense != null ? (": " + translationSense.sense) : "");
// sideBuilders[i] accumulates the display text of side i;
// sideTokens.get(i) groups the tokens to index for side i by entry type.
268 final StringBuilder[] sideBuilders = new StringBuilder[2];
269 final List<Map<EntryTypeName, List<String>>> sideTokens = new ArrayList<Map<EntryTypeName,List<String>>>();
270 for (int i = 0; i < 2; ++i) {
271 sideBuilders[i] = new StringBuilder();
272 sideTokens.add(new LinkedHashMap<EntryTypeName, List<String>>());
// The English side starts with the sense header and is indexed under the title tokens.
275 if (enIndexBuilder != -1) {
276 sideBuilders[enIndexBuilder].append(englishSense);
277 addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD);
280 // Get the entries from the translation section.
281 for (int i = 0; i < 2; ++i) {
282 //System.out.println(" lang: " + i);
283 for (final Translation translation : translationSense.translations.get(i)) {
284 //System.out.println(" translation: " + translation);
// Separate successive pieces of a side's text with a newline.
285 sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : "");
// The language label is shown only when this side has more than one translation.
286 if (translationSense.translations.get(i).size() > 1) {
287 sideBuilders[i].append(translation.language).append(": ");
289 sideBuilders[i].append(translation.text);
291 // TODO: Don't index {m}, {f}
292 // TODO: Don't even show: (1), (1-2), etc.
293 addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD);
297 // Construct the Translations-based QuickDic entry for this TranslationSense.
// Skipped when either side ended up without text.
298 if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) {
299 final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString());
300 final PairEntry pairEntry = new PairEntry();
301 pairEntry.pairs.add(pair);
302 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
303 dictBuilder.dictionary.pairEntries.add(pairEntry);
305 // Add the EntryData to the indices under the correct tokens.
306 for (int i = 0; i < 2; ++i) {
307 final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i);
308 for (final Map.Entry<EntryTypeName, List<String>> entry : sideTokens.get(i).entrySet()) {
309 for (final String token : entry.getValue()) {
310 final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, entry.getKey());
311 entries.add(entryData);
/**
 * Tokenizes {@code text} and appends the tokens to the list stored in
 * {@code map} under the entry type, creating that list on first use.
 *
 * @param text text to tokenize with DictFileParser.NON_CHAR.
 * @param map token lists keyed by entry type; modified in place.
 * @param entryTypeName requested entry type. WIKTIONARY_TITLE_ONE_WORD is replaced
 *     by WIKTIONARY_TITLE_MULTI_WORD when the text yields more than one token;
 *     every other type is used as given.
 */
323 static void addTokens(final String text, final Map<EntryTypeName, List<String>> map,
324 EntryTypeName entryTypeName) {
325 final Set<String> tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR);
326 if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) {
327 entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD;
// Get-or-create the token list for this entry type.
329 List<String> tokenList = map.get(entryTypeName);
330 if (tokenList == null) {
331 tokenList = new ArrayList<String>();
332 map.put(entryTypeName, tokenList);
334 tokenList.addAll(tokens);