1 package com.hughes.android.dictionary.parser;
3 import java.util.ArrayList;
4 import java.util.LinkedHashMap;
5 import java.util.LinkedHashSet;
9 import java.util.regex.Pattern;
11 import com.hughes.android.dictionary.engine.DictionaryBuilder;
12 import com.hughes.android.dictionary.engine.EntryData;
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.IndexBuilder;
15 import com.hughes.android.dictionary.engine.PairEntry;
16 import com.hughes.android.dictionary.engine.PairEntry.Pair;
17 import com.hughes.util.ListUtil;
// Data collected for one wiki word: its title, pronunciations keyed by accent,
// and its parts of speech.
// NOTE(review): several lines of this class (e.g. the declarations of the
// title and language fields) are not visible in this view.
19 public class WikiWord {
// Pronunciation text keyed by accent; a LinkedHashMap, so iteration follows insertion order.
27 final Map<String, StringBuilder> accentToPronunciation = new LinkedHashMap<String, StringBuilder>();
// Starts out null. Presumably the pronunciation buffer currently being filled - TODO confirm against the parser.
28 StringBuilder currentPronunciation = null;
// Parts of speech of this word, in the order they were added.
30 final List<PartOfSpeech> partsOfSpeech = new ArrayList<WikiWord.PartOfSpeech>();
// Stores the interned title. What is done with depth is not visible in this view.
32 public WikiWord(final String title, int depth) {
33 this.title = title.intern();
// One part-of-speech section of a word: an interned name plus the meanings,
// translation senses and form-of records attached to it.
37 static class PartOfSpeech {
// Meanings, in the order they were added.
41 final List<Meaning> meanings = new ArrayList<WikiWord.Meaning>();
// Translation senses, in the order they were added.
43 final List<TranslationSense> translationSenses = new ArrayList<WikiWord.TranslationSense>();
// Form-of records, in the order they were added.
45 final List<FormOf> formOfs = new ArrayList<WikiWord.FormOf>();
// Stores the interned name. What is done with depth is not visible in this view.
47 public PartOfSpeech(final int depth, String name) {
49 this.name = name.intern();
// Creates a new Meaning and appends it to meanings.
52 public Meaning newMeaning() {
53 final Meaning meaning = new Meaning();
54 meanings.add(meaning);
// Returns the result of newMeaning() when no meaning exists yet; otherwise
// ListUtil.getLast(meanings) (presumably the most recently added one).
58 public Meaning lastMeaning() {
59 return meanings.isEmpty() ? newMeaning() : ListUtil.getLast(meanings);
// One translation sense: a list of translation lists.
63 static class TranslationSense {
// Outer list is read at positions 0 and 1 elsewhere in this file.
// Presumably one slot per dictionary side - TODO confirm.
65 List<List<Translation>> translations = new ArrayList<List<Translation>>();
// Two empty inner lists are added up front. The enclosing constructor or
// initializer line is not visible in this view.
67 translations.add(new ArrayList<Translation>());
68 translations.add(new ArrayList<Translation>());
// A single translation: a language name and the translated text.
72 static class Translation {
// Stores the language. The assignment of text is not visible in this view.
76 public Translation(final String language, final String text) {
77 this.language = language;
// Renders as "<language>: <text>".
82 public String toString() {
83 return language + ": " + text;
// NOTE(review): the declaration line of the enclosing class (FormOf, judging
// by the constructor below) is not visible in this view.
// Grammar-form label; it is placed between the title and the target when the
// form-of entry text is built elsewhere in this file.
88 final String grammarForm;
// Stores grammarForm. What is done with token is not visible in this view.
91 public FormOf(final String grammarForm, final String token) {
92 this.grammarForm = grammarForm;
// One meaning of a part of speech together with its usage examples.
97 static class Meaning {
// Examples, in the order they were added.
99 final List<Example> examples = new ArrayList<WikiWord.Example>();
// Creates a new Example and appends it to examples.
101 public Example newExample() {
102 final Example example = new Example();
103 this.examples.add(example);
// Returns the result of newExample() when no example exists yet; otherwise
// ListUtil.getLast(examples) (presumably the most recently added one).
107 public Example lastExample() {
108 return examples.isEmpty() ? newExample() : ListUtil.getLast(examples);
// A usage example: the example text and its English counterpart, both held in
// mutable buffers that start out empty.
112 static class Example {
// Example text.
114 final StringBuilder example = new StringBuilder();
// English text for the example; may stay empty.
115 final StringBuilder exampleInEnglish = new StringBuilder();
118 // -------------------------------------------------------------------------
// Converts this word into dictionary entries by handing each part of speech to
// partOfSpeechToQuickDic.
// dictBuilder: builder passed through to partOfSpeechToQuickDic.
// enIndexBuilder: passed through unchanged; downstream code compares it with -1
// and uses it as a position in dictBuilder.indexBuilders.
120 void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) {
121 //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation);
// Report words that have no part of speech, unless the title contains ':' or
// the language is "Translingual".
122 if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) {
123 System.err.println("Word with no POS: " + title);
125 for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) {
126 partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
// Matches a '{' followed by any run of non-comma characters and then a ','.
// Used in this file to remove template names from meaning text before indexing.
131 static final Pattern templateName = Pattern.compile("\\{[^,]*,");
// Emits dictionary entries for one part of speech: one entry per form-of
// record, one entry per meaning (with its examples), and finally the
// translation senses via translationSensesToQuickDic.
// NOTE(review): several lines of this method, including the definition of
// `index`, are not visible in this view.
132 private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder,
133 final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
134 //System.out.println(" pos: " + partOfSpeech.name);
136 for (final WikiWord.Meaning meaning : partOfSpeech.meanings) {
137 //System.out.println(" meaning: " + meaning.meaning);
138 for (final WikiWord.Example example : meaning.examples) {
139 if (example.example.length() > 0) {
140 //System.out.println(" example: " + example.example);
142 if (example.exampleInEnglish.length() > 0) {
143 //System.out.println(" exampleInEnglish: " + example.exampleInEnglish);
// Form-of entries: the first Pair string is "<title>: <grammarForm>: <target>",
// the second is empty, and formOfSwap is true whenever index is not 0.
150 final boolean formOfSwap = index != 0;
151 for (final FormOf formOf : partOfSpeech.formOfs) {
152 final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap);
153 final PairEntry pairEntry = new PairEntry(new Pair[] {pair});
// The EntryData number is the position the PairEntry gets in pairEntries.
154 final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
155 dictBuilder.dictionary.pairEntries.add(pairEntry);
157 // File under the tokens of the form-of target (formOf.target), not the title.
158 final Set<String> tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR);
159 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF);
// Meaning entries are only produced when both index positions are known (not -1)
// and differ from each other.
164 if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) {
165 final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")";
// swap is true when enIndexBuilder is 1.
166 final boolean swap = enIndexBuilder == 1;
169 for (final Meaning meaning : partOfSpeech.meanings) {
170 final List<Pair> pairs = new ArrayList<PairEntry.Pair>();
// Tokens gathered from example text, one set for each of the two pair sides.
172 final List<Set<String>> exampleTokens = new ArrayList<Set<String>>();
173 exampleTokens.add(new LinkedHashSet<String>());
174 exampleTokens.add(new LinkedHashSet<String>());
// A non-empty meaning becomes the first pair: meaning text against "<title> (<pos>)".
176 if (meaning.meaning != null && meaning.meaning.length() > 0) {
177 final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap);
178 pairs.add(meaningPair);
// Presumably the else branch of the check above (the connecting line is not visible here).
180 System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name);
184 for (final Example example : meaning.examples) {
// An example with no separate English text that contains a '—' is split at the
// first '—': the trimmed remainder becomes the English text and is cut from the example.
185 final int dashIndex = example.example.indexOf("—");
186 if (example.exampleInEnglish.length() == 0 && dashIndex != -1) {
187 System.out.println("Splitting example: title=" + title + ", "+ example.example);
188 example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim());
189 example.example.delete(dashIndex, example.example.length());
// Only examples that have text on both sides produce a pair.
192 if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) {
193 final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap);
// Collect the tokens of each side of the example pair.
196 for (int i = 0; i < 2; ++i) {
197 exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR));
202 // Create EntryData with the PairEntry.
203 final PairEntry pairEntry = new PairEntry(pairs.toArray(new Pair[0]));
204 final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
205 dictBuilder.dictionary.pairEntries.add(pairEntry);
207 // File under title token.
// The entry type depends on whether the title tokenizes to exactly one token.
208 final Set<String> titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
209 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD);
211 // File under the meaning tokens (English):
212 if (meaning.meaning != null) {
213 // If the meaning contains any templates, strip out the template name
214 // so we don't index it.
215 final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll("");
216 final Set<String> meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR);
217 dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD);
220 // File under other tokens that we saw.
221 for (int i = 0; i < 2; ++i) {
222 dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS);
// Finally handle the translation senses of this part of speech.
231 translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
// Builds one two-sided entry per translation sense of the given part of speech
// and registers it in both index builders under the tokens collected per side.
// NOTE(review): some lines of this method are not visible in this view.
235 private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder,
236 final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
237 if (!partOfSpeech.translationSenses.isEmpty()) {
// Translation senses on a word whose language is not "English" are reported on stderr.
238 if (!language.equals("English")) {
239 System.err.println("Translation sections not in English.");
// Format template for the English side; the %s is filled per sense below.
// NOTE(review): title is concatenated into the template, so a '%' inside a title
// would be interpreted by String.format as a format specifier - confirm titles cannot contain '%'.
242 final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)";
// Pronunciations: each one starts on a new line and is prefixed with
// "<accent>: " when the accent key is non-empty.
244 final StringBuilder englishPron = new StringBuilder();
245 for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
246 englishPron.append("\n");
247 if (accentToPron.getKey().length() > 0) {
248 englishPron.append(accentToPron.getKey()).append(": ");
250 englishPron.append(accentToPron.getValue());
253 for (final TranslationSense translationSense : partOfSpeech.translationSenses) {
254 //System.out.println(" sense: " + translationSense.sense);
255 if (translationSense.sense == null) {
256 //System.err.println(" null sense: " + title);
// A null sense contributes an empty string; otherwise ": <sense>" is inserted.
258 String englishSense = String.format(englishBase, translationSense.sense != null ? (": " + translationSense.sense) : "");
259 englishSense += englishPron.toString();
// Per-side text buffers and per-side token lists grouped by entry type.
261 final StringBuilder[] sideBuilders = new StringBuilder[2];
262 final List<Map<EntryTypeName, List<String>>> sideTokens = new ArrayList<Map<EntryTypeName,List<String>>>();
263 for (int i = 0; i < 2; ++i) {
264 sideBuilders[i] = new StringBuilder();
265 sideTokens.add(new LinkedHashMap<EntryTypeName, List<String>>());
// The English text and the title tokens go on the enIndexBuilder side, when it is known.
268 if (enIndexBuilder != -1) {
269 sideBuilders[enIndexBuilder].append(englishSense);
270 addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD);
273 // Get the entries from the translation section.
274 for (int i = 0; i < 2; ++i) {
275 //System.out.println(" lang: " + i);
276 for (final Translation translation : translationSense.translations.get(i)) {
277 //System.out.println(" translation: " + translation);
// Start a new line unless this side's buffer is still empty.
278 sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : "");
// The language prefix is only written when this side has more than one translation.
279 if (translationSense.translations.get(i).size() > 1) {
280 sideBuilders[i].append(translation.language).append(": ");
282 sideBuilders[i].append(translation.text);
284 // TODO: Don't index {m}, {f}
285 // TODO: Don't even show: (1), (1-2), etc.
286 addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD);
290 // Construct the Translations-based QuickDic entry for this TranslationSense.
// An entry is only created when both sides ended up with some text.
291 if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) {
292 final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString());
293 final PairEntry pairEntry = new PairEntry(new Pair[] { pair });
// The EntryData number is the position the PairEntry gets in pairEntries.
294 final EntryData entryData = new EntryData(dictBuilder.dictionary.pairEntries.size(), pairEntry);
295 dictBuilder.dictionary.pairEntries.add(pairEntry);
297 // Add the EntryData to the indices under the correct tokens.
298 for (int i = 0; i < 2; ++i) {
299 final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i);
300 for (final Map.Entry<EntryTypeName, List<String>> entry : sideTokens.get(i).entrySet()) {
301 for (final String token : entry.getValue()) {
302 final List<EntryData> entries = indexBuilder.getOrCreateEntries(token, entry.getKey());
303 entries.add(entryData);
// Tokenizes text and appends the tokens to the list stored in map under
// entryTypeName, creating and registering that list on first use.
// text: string to tokenize with DictFileParser.tokenize and DictFileParser.NON_CHAR.
// map: per-entry-type token lists; modified in place.
// entryTypeName: requested entry type (see the multi-word adjustment below).
315 static void addTokens(final String text, final Map<EntryTypeName, List<String>> map,
316 EntryTypeName entryTypeName) {
317 final Set<String> tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR);
// A title that yields more than one token is filed as a multi-word title;
// no other entry type is adjusted here.
318 if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) {
319 entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD;
321 List<String> tokenList = map.get(entryTypeName);
322 if (tokenList == null) {
323 tokenList = new ArrayList<String>();
324 map.put(entryTypeName, tokenList);
326 tokenList.addAll(tokens);