1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.util.ArrayList;
18 import java.util.LinkedHashMap;
19 import java.util.LinkedHashSet;
20 import java.util.List;
23 import java.util.regex.Pattern;
25 import com.hughes.android.dictionary.engine.DictionaryBuilder;
26 import com.hughes.android.dictionary.engine.IndexedEntry;
27 import com.hughes.android.dictionary.engine.EntryTypeName;
28 import com.hughes.android.dictionary.engine.IndexBuilder;
29 import com.hughes.android.dictionary.engine.PairEntry;
30 import com.hughes.android.dictionary.engine.PairEntry.Pair;
31 import com.hughes.util.ListUtil;
// In-memory model of one word, identified by its title, as collected by the
// parser: pronunciations keyed by accent plus a list of parts of speech. The
// *ToQuickDic methods of this class write that data into a DictionaryBuilder
// using the WIKTIONARY_* entry types.
33 public class WikiWord {
// Pronunciation text per accent label. LinkedHashMap iterates in insertion
// order, so pronunciations are emitted in the order they were recorded.
41 final Map<String, StringBuilder> accentToPronunciation = new LinkedHashMap<String, StringBuilder>();
// NOTE(review): presumably the pronunciation buffer the parser is currently
// appending to; nothing in this class reads or writes it -- confirm at the caller.
42 StringBuilder currentPronunciation = null;
// Parts of speech recorded for this word, in insertion order.
44 final List<PartOfSpeech> partsOfSpeech = new ArrayList<WikiWord.PartOfSpeech>();
// Creates a word for the given title; the title string is interned before
// being stored.
// NOTE(review): what is done with 'depth' is not visible in this excerpt.
46 public WikiWord(final String title, int depth) {
47 this.title = title.intern();
// One part-of-speech section of a word. It groups the meanings, translation
// senses and form-of records that were collected under that section.
51 static class PartOfSpeech {
// Meanings of this part of speech, in insertion order.
55 final List<Meaning> meanings = new ArrayList<WikiWord.Meaning>();
// Translation groups; consumed by WikiWord.translationSensesToQuickDic.
57 final List<TranslationSense> translationSenses = new ArrayList<WikiWord.TranslationSense>();
// "Form of" records; consumed by WikiWord.partOfSpeechToQuickDic.
59 final List<FormOf> formOfs = new ArrayList<WikiWord.FormOf>();
// Creates a part of speech with the given name; the name is interned.
// NOTE(review): what is done with 'depth' is not visible in this excerpt.
61 public PartOfSpeech(final int depth, String name) {
63 this.name = name.intern();
// Creates an empty Meaning and appends it to 'meanings'.
// NOTE(review): the return statement is not visible here; lastMeaning() uses
// the result as a Meaning, so presumably the new instance is returned.
66 public Meaning newMeaning() {
67 final Meaning meaning = new Meaning();
68 meanings.add(meaning);
// Never fails on an empty list: when no meaning exists yet, one is created
// via newMeaning(); otherwise the result of ListUtil.getLast(meanings) is
// returned (by its name, the most recently added element).
72 public Meaning lastMeaning() {
73 return meanings.isEmpty() ? newMeaning() : ListUtil.getLast(meanings);
// A group of translations belonging to one sense of a word.
77 static class TranslationSense {
// Two translation lists addressed by index 0 and 1;
// WikiWord.translationSensesToQuickDic reads translations.get(i) to build
// side i of the resulting entry.
79 List<List<Translation>> translations = new ArrayList<List<Translation>>();
// Pre-populate both slots so that get(0) and get(1) are always valid.
81 translations.add(new ArrayList<Translation>());
82 translations.add(new ArrayList<Translation>());
// A single translation: a language label together with the translated text.
86 static class Translation {
// Stores the language label.
// NOTE(review): the assignment of 'text' is not visible in this excerpt.
90 public Translation(final String language, final String text) {
91 this.language = language;
// Renders the translation as "<language>: <text>".
96 public String toString() {
97 return language + ": " + text;
// Records that a word is a grammatical form of another word.
101 static class FormOf {
// Description of the grammatical form; WikiWord.partOfSpeechToQuickDic prints
// it between the title and the target.
102 final String grammarForm;
// NOTE(review): 'token' presumably populates the 'target' field that
// WikiWord.partOfSpeechToQuickDic reads; that assignment is not visible here.
105 public FormOf(final String grammarForm, final String token) {
106 this.grammarForm = grammarForm;
// One meaning (definition) of a part of speech together with its examples.
111 static class Meaning {
// Usage examples attached to this meaning, in insertion order.
113 final List<Example> examples = new ArrayList<WikiWord.Example>();
// Creates an empty Example and appends it to 'examples'.
// NOTE(review): the return statement is not visible here; lastExample() uses
// the result as an Example, so presumably the new instance is returned.
115 public Example newExample() {
116 final Example example = new Example();
117 this.examples.add(example);
// Never fails on an empty list: when no example exists yet, one is created
// via newExample(); otherwise the result of ListUtil.getLast(examples) is
// returned (by its name, the most recently added element).
121 public Example lastExample() {
122 return examples.isEmpty() ? newExample() : ListUtil.getLast(examples);
// A usage example made of two mutable text buffers. Both start empty.
// WikiWord.partOfSpeechToQuickDic may move the part after an em dash from
// 'example' into 'exampleInEnglish' when the latter is still empty.
126 static class Example {
// The example text itself.
128 final StringBuilder example = new StringBuilder();
// The English rendering of the example.
129 final StringBuilder exampleInEnglish = new StringBuilder();
132 // -------------------------------------------------------------------------
// Writes this word into dictBuilder: every part of speech is handed to
// partOfSpeechToQuickDic, then the collected pronunciations are stored as a
// single PairEntry that is indexed under the tokens of the title.
//
// dictBuilder    - destination; its dictionary.pairEntries list and its
//                  indexBuilders are appended to.
// enIndexBuilder - passed through unchanged to partOfSpeechToQuickDic, where
//                  it is compared with -1 and used as a position in
//                  dictBuilder.indexBuilders.
134 void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) {
135 //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation);
// Diagnostic only: report words without any part of speech, unless the title
// contains ':' or the language is "Translingual".
136 if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) {
137 System.err.println("Word with no POS: " + title);
139 for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) {
140 partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
// Pronunciations: one Pair per accent. The first side is
// "<accent>: <pronunciation>" (the prefix is omitted for an empty accent) and
// the second side is empty.
// NOTE(review): lines are missing from this excerpt just before this point,
// so any guard around the pronunciation section cannot be seen.
145 final PairEntry pronEntry = new PairEntry();
146 for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
147 String accent = accentToPron.getKey();
148 if (accent.length() > 0) {
149 accent = accent + ": ";
// NOTE(review): the third Pair argument is presumably a "swap sides" flag
// (similar arguments elsewhere in this class are named 'swap') -- confirm.
151 pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0));
// Store the entry only when at least one pronunciation was recorded.
153 if (pronEntry.pairs.size() > 0) {
// size() is read before add(), so the number handed to IndexedEntry equals
// the position the entry occupies in pairEntries after the add.
154 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pronEntry);
155 dictBuilder.dictionary.pairEntries.add(pronEntry);
// Index the pronunciation entry under the title's tokens in this word's own
// index builder.
156 final Set<String> tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
157 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION);
// Matches a '{', then any run of characters that are not commas, then the
// first following comma. Used below to drop a template name from a meaning
// before the meaning is tokenized for indexing.
// NOTE(review): the character class does not exclude '}', so a match can run
// past the end of a template up to the next comma -- confirm this is intended.
163 static final Pattern templateName = Pattern.compile("\\{[^,]*,");
// Writes one part of speech into dictBuilder. In order of appearance:
//   1. a debugging walk over meanings/examples (all output is commented out),
//   2. one entry per FormOf record,
//   3. one entry per Meaning (the meaning plus its examples), indexed under
//      title, meaning and example tokens,
//   4. the translation senses, via translationSensesToQuickDic.
//
// dictBuilder    - destination for entries and index tokens.
// enIndexBuilder - position of the English index in dictBuilder.indexBuilders;
//                  -1 is treated as "none" by the guard before step 3.
// partOfSpeech   - the section to convert.
164 private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder,
165 final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
166 //System.out.println(" pos: " + partOfSpeech.name);
// Debug-only traversal: every print statement inside is commented out, so
// the visible part of this loop has no effect.
168 for (final WikiWord.Meaning meaning : partOfSpeech.meanings) {
169 //System.out.println(" meaning: " + meaning.meaning);
170 for (final WikiWord.Example example : meaning.examples) {
171 if (example.example.length() > 0) {
172 //System.out.println(" example: " + example.example);
174 if (example.exampleInEnglish.length() > 0) {
175 //System.out.println(" exampleInEnglish: " + example.exampleInEnglish);
// Form-of records: each becomes a single-pair entry whose text is
// "<title>: <grammarForm>: <target>" with an empty second side.
181 final boolean formOfSwap = index != 0;
182 for (final FormOf formOf : partOfSpeech.formOfs) {
183 final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap);
184 final PairEntry pairEntry = new PairEntry();
185 pairEntry.pairs.add(pair);
// size() is read before add(), so the number handed to IndexedEntry equals
// the position the entry occupies in pairEntries after the add.
186 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
187 dictBuilder.dictionary.pairEntries.add(pairEntry);
189 // File under the tokens of the form-of target (not the title).
190 final Set<String> tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR);
191 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF);
// Meaning entries are only produced when an English index exists, this word
// has an index of its own, and the two are different.
196 if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) {
// Text placed opposite each meaning: "<title> (<part of speech, lower case>)".
197 final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")";
198 final boolean swap = enIndexBuilder == 1;
201 for (final Meaning meaning : partOfSpeech.meanings) {
202 final PairEntry pairEntry = new PairEntry();
203 final List<Pair> pairs = pairEntry.pairs;
// Tokens gathered from example pairs, one insertion-ordered set per pair side.
205 final List<Set<String>> exampleTokens = new ArrayList<Set<String>>();
206 exampleTokens.add(new LinkedHashSet<String>());
207 exampleTokens.add(new LinkedHashSet<String>());
// A non-empty meaning becomes the first pair of the entry.
209 if (meaning.meaning != null && meaning.meaning.length() > 0) {
210 final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap);
211 pairs.add(meaningPair);
// NOTE(review): presumably the else-branch of the check above; the line that
// introduces it is missing from this excerpt.
213 System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name);
217 for (final Example example : meaning.examples) {
// When no English text was recorded and the example contains an em dash, the
// example is split in place: the trimmed text after the dash becomes the
// English text, and the dash plus everything after it is removed from the
// example buffer. This mutates the Example object itself.
218 final int dashIndex = example.example.indexOf("—");
219 if (example.exampleInEnglish.length() == 0 && dashIndex != -1) {
220 System.out.println("Splitting example: title=" + title + ", "+ example.example);
221 example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim());
222 example.example.delete(dashIndex, example.example.length());
// Only examples that have text on both sides produce a pair.
225 if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) {
226 final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap);
// Collect the tokens of both sides of the pair as reported by pair.get(i).
229 for (int i = 0; i < 2; ++i) {
230 exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR));
235 // Create EntryData with the PairEntry.
236 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
237 dictBuilder.dictionary.pairEntries.add(pairEntry);
239 // File under title token.
240 final Set<String> titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
241 dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD);
243 // File under the meaning tokens (English):
244 if (meaning.meaning != null) {
245 // If the meaning contains any templates, strip out the template name
246 // so we don't index it.
247 final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll("");
248 final Set<String> meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR);
249 dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD);
252 // File under other tokens that we saw.
// Side i of the example tokens goes into index builder i.
253 for (int i = 0; i < 2; ++i) {
254 dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS);
// Finally convert the translation sections of this part of speech.
262 translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
// Converts the translation senses of one part of speech into dictionary
// entries. For every TranslationSense a two-sided text is assembled (side i
// collects translationSense.translations.get(i)); when both sides end up
// non-empty, one PairEntry is stored and indexed under the collected tokens.
//
// dictBuilder    - destination for entries and index tokens.
// enIndexBuilder - side that receives the English headword text; -1 skips
//                  that step.
// partOfSpeech   - supplies the translation senses and the name shown in the
//                  headword text.
266 private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder,
267 final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
268 if (!partOfSpeech.translationSenses.isEmpty()) {
// Diagnostic for translation sections on a word whose language is not
// "English".
// NOTE(review): whether processing continues after this warning depends on
// lines that are missing from this excerpt.
269 if (!language.equals("English")) {
270 System.err.println("Translation sections not in English.");
// Format string "<title> (<part of speech, lower case>%s)"; the %s is filled
// with the sense suffix below.
// NOTE(review): the title is embedded in a format string, so a '%' in the
// title would be interpreted by String.format -- confirm titles cannot
// contain '%'.
273 final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)";
275 for (final TranslationSense translationSense : partOfSpeech.translationSenses) {
276 //System.out.println(" sense: " + translationSense.sense);
// The only visible statement in this branch is commented out.
277 if (translationSense.sense == null) {
278 //System.err.println(" null sense: " + title);
// The sense suffix is ": <sense>", or empty when the sense is null.
280 String englishSense = String.format(englishBase, translationSense.sense != null ? (": " + translationSense.sense) : "");
// Per side: the text being assembled, and the tokens to index grouped by
// entry type (insertion-ordered).
282 final StringBuilder[] sideBuilders = new StringBuilder[2];
283 final List<Map<EntryTypeName, List<String>>> sideTokens = new ArrayList<Map<EntryTypeName,List<String>>>();
284 for (int i = 0; i < 2; ++i) {
285 sideBuilders[i] = new StringBuilder();
286 sideTokens.add(new LinkedHashMap<EntryTypeName, List<String>>());
// Seed the English side with the headword text and index it under the title
// tokens. enIndexBuilder is used directly as a position in the two-element
// structures above, so any value other than -1 must be 0 or 1.
289 if (enIndexBuilder != -1) {
290 sideBuilders[enIndexBuilder].append(englishSense);
291 addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD);
294 // Get the entries from the translation section.
295 for (int i = 0; i < 2; ++i) {
296 //System.out.println(" lang: " + i);
297 for (final Translation translation : translationSense.translations.get(i)) {
298 //System.out.println(" translation: " + translation);
// Put each translation on its own line: a newline is added unless the side
// is still empty.
299 sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : "");
// The language label is shown only when this side has several translations.
300 if (translationSense.translations.get(i).size() > 1) {
301 sideBuilders[i].append(translation.language).append(": ");
303 sideBuilders[i].append(translation.text);
305 // TODO: Don't index {m}, {f}
306 // TODO: Don't even show: (1), (1-2), etc.
// addTokens only upgrades the title entry type for multi-token text, so
// multi-word translations are still filed as WIKTIONARY_TRANSLATION_ONE_WORD.
307 addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD);
311 // Construct the Translations-based QuickDic entry for this TranslationSense.
// Senses with an empty side produce no entry.
312 if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) {
313 final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString());
314 final PairEntry pairEntry = new PairEntry();
315 pairEntry.pairs.add(pair);
// size() is read before add(), so the number handed to IndexedEntry equals
// the position the entry occupies in pairEntries after the add.
316 final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
317 dictBuilder.dictionary.pairEntries.add(pairEntry);
319 // Add the EntryData to the indices under the correct tokens.
// Tokens collected for side i are filed in index builder i, keyed by token
// and entry type.
320 for (int i = 0; i < 2; ++i) {
321 final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i);
322 for (final Map.Entry<EntryTypeName, List<String>> entry : sideTokens.get(i).entrySet()) {
323 for (final String token : entry.getValue()) {
324 final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, entry.getKey());
325 entries.add(entryData);
337 static void addTokens(final String text, final Map<EntryTypeName, List<String>> map,
338 EntryTypeName entryTypeName) {
339 final Set<String> tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR);
340 if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) {
341 entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD;
343 List<String> tokenList = map.get(entryTypeName);
344 if (tokenList == null) {
345 tokenList = new ArrayList<String>();
346 map.put(entryTypeName, tokenList);
348 tokenList.addAll(tokens);