X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FEnWiktionaryXmlParser.java;h=554cd0062c2626ac8bbb7770671d4bb42184cdc2;hb=aab26458fc0a45d2a59d69cb6932090aca3b044f;hp=f985084e9b031b6e8dee8a88f69042cfb2f50b3b;hpb=203c2f927c16a1f4629e2c9504a5a3031a0f130d;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index f985084..554cd00 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -38,6 +38,8 @@ import com.hughes.android.dictionary.engine.PairEntry.Pair; public class EnWiktionaryXmlParser { + private static final String TRANSLITERATION_FORMAT = " (tr. %s)"; + static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName()); // TODO: process {{ttbc}} lines @@ -56,14 +58,14 @@ public class EnWiktionaryXmlParser { "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); final IndexBuilder enIndexBuilder; - final IndexBuilder otherIndexBuilder; + final IndexBuilder foreignIndexBuilder; final Pattern langPattern; final Pattern langCodePattern; final boolean swap; public EnWiktionaryXmlParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { this.enIndexBuilder = enIndexBuilder; - this.otherIndexBuilder = otherIndexBuilder; + this.foreignIndexBuilder = otherIndexBuilder; this.langPattern = langPattern; this.langCodePattern = langCodePattern; this.swap = swap; @@ -117,8 +119,8 @@ public class EnWiktionaryXmlParser { heading = heading.replaceAll("=", "").trim(); if (heading.equals("English")) { doEnglishWord(title, text); - } else if (langPattern.matcher(heading).matches()){ - doForeignWord(title, text); + } else if (langPattern.matcher(heading).find()){ + doForeignWord(heading, title, text); } } // endPage() @@ -144,6 +146,8 @@ public class EnWiktionaryXmlParser { if (partOfSpeechHeader.matcher(headerName).matches()) { posDepth = wikiTokenizer.headingDepth(); pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + } else if (headerName.equals("Translations")) { if (pos == null) { LOG.warning("Translations without POS: " + title); @@ -270,29 +274,101 @@ public class EnWiktionaryXmlParser { } } + private static T get(final List list, final int index, final T defaultValue) { + return index < list.size() ? list.get(index) : defaultValue; + } + private static T get(final List list, final int index) { - return index < list.size() ? list.get(index) : null; + return get(list, index, null); } + + private static T remove(final List list, final int index, final T defaultValue) { + return index < list.size() ? list.remove(index) : defaultValue; + } + + static final class Callback implements WikiTokenizer.Callback { + public Callback(IndexedEntry indexedEntry, IndexBuilder defaultIndexBuilder, + StringBuilder builder, Map functionCallbacks) { + this.indexedEntry = indexedEntry; + this.defaultIndexBuilder = defaultIndexBuilder; + this.builder = builder; + this.functionCallbacks = functionCallbacks; + } + + final IndexedEntry indexedEntry; + final IndexBuilder defaultIndexBuilder; + final StringBuilder builder; + final Map functionCallbacks; + + // TODO: the classes of text are wrong.... + + @Override + public void onPlainText(WikiTokenizer wikiTokenizer) { + final String plainText = wikiTokenizer.token(); + builder.append(plainText); + defaultIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + } + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String plainText = wikiTokenizer.wikiLinkText(); + builder.append(plainText); + // TODO: should check for English before appending. + defaultIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT); + } + + @Override + public void onFunction(String functionName, + List functionPositionArgs, Map functionNamedArgs) { + } + + @Override + public void onMarkup(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public void onComment(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public void onNewline(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onHeading(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + assert false; + } + } + private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - final StringBuilder otherText = new StringBuilder(); + final StringBuilder foreignText = new StringBuilder(); final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest, false); while (wikiTokenizer.nextToken() != null) { if (wikiTokenizer.isPlainText()) { final String plainText = wikiTokenizer.token(); - otherText.append("").append(plainText); - otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + foreignText.append(plainText); + foreignIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); } else if (wikiTokenizer.isWikiLink()) { final String plainText = wikiTokenizer.wikiLinkText(); - otherText.append("").append(plainText); - otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT); - + foreignText.append(plainText); + // TODO: should check for English before appending. + foreignIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT); + } else if (wikiTokenizer.isFunction()) { final String functionName = wikiTokenizer.functionName(); final List args = wikiTokenizer.functionPositionArgs(); @@ -304,83 +380,81 @@ public class EnWiktionaryXmlParser { continue; } final String langCode = get(args, 0); - //if (this.langCodePattern.matcher(langCode).matches()) { - final String word = get(args, 1); - final String gender = get(args, 2); - final String transliteration = namedArgs.get("tr"); - if (otherText.length() > 0) { - otherText.append(""); - } - otherText.append(word); - otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); - if (gender != null) { - otherText.append(String.format(" {%s}", gender)); - } - if (transliteration != null) { - otherText.append(String.format(" (tr. %s)", transliteration)); - otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); - } - //} + final String word = get(args, 1); + final String gender = get(args, 2); + final String transliteration = namedArgs.get("tr"); + if (foreignText.length() > 0) { + foreignText.append(""); + } + foreignText.append(word); + foreignIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + if (gender != null) { + foreignText.append(String.format(" {%s}", gender)); + } + if (transliteration != null) { + foreignText.append(String.format(TRANSLITERATION_FORMAT, transliteration)); + foreignIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + } } else if (functionName.equals("qualifier")) { if (args.size() == 0) { - otherText.append(wikiTokenizer.token()); + foreignText.append(wikiTokenizer.token()); } else { String qualifier = args.get(0); if (!namedArgs.isEmpty() || args.size() > 1) { LOG.warning("weird qualifier: " + line); } // Unindexed! - otherText.append("(").append(qualifier).append(")"); + foreignText.append("(").append(qualifier).append(")"); } } else if (encodings.contains(functionName)) { - otherText.append("").append(args.get(0)); - otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + foreignText.append("").append(args.get(0)); + foreignIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); } else if (isGender(functionName)) { - appendGender(otherText, functionName, args); + appendGender(foreignText, functionName, args); } else if (functionName.equals("g")) { - otherText.append("{g}"); + foreignText.append("{g}"); } else if (functionName.equals("l")) { // encodes text in various langs. // lang is arg 0. - otherText.append("").append(args.get(1)); - otherIndexBuilder.addEntryWithString(indexedEntry, args.get(1), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + foreignText.append("").append(args.get(1)); + foreignIndexBuilder.addEntryWithString(indexedEntry, args.get(1), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); // TODO: transliteration } else if (functionName.equals("term")) { // cross-reference to another dictionary - otherText.append("").append(args.get(0)); - otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + foreignText.append("").append(args.get(0)); + foreignIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); // TODO: transliteration } else if (functionName.equals("italbrac") || functionName.equals("gloss")) { // TODO: put this text aside to use it. - otherText.append("[").append(args.get(0)).append("]"); - otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + foreignText.append("[").append(args.get(0)).append("]"); + foreignIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); } else if (functionName.equals("ttbc")) { LOG.warning("Unexpected {{ttbc}}"); } else if (functionName.equals("trreq")) { } else if (functionName.equals("not used")) { - otherText.append("(not used)"); + foreignText.append("(not used)"); } else if (functionName.equals("t-image")) { // American sign language } else { // Unindexed! - otherText.append(wikiTokenizer.token()); + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + WikiTokenizer.appendFunction(foreignText.append("{{"), functionName, args, namedArgs).append("}}"); } } else if (wikiTokenizer.isNewline()) { - assert false; } else if (wikiTokenizer.isComment()) { } else if (wikiTokenizer.isMarkup()) { } else { LOG.warning("Bad translation token: " + wikiTokenizer.token()); } } - if (otherText.length() == 0) { - LOG.warning("Empty otherText: " + line); + if (foreignText.length() == 0) { + LOG.warning("Empty foreignText: " + line); return; } if (lang != null) { - otherText.insert(0, "(" + lang + ") "); + foreignText.insert(0, String.format("(%s) ", lang)); } StringBuilder englishText = new StringBuilder(); @@ -395,7 +469,7 @@ public class EnWiktionaryXmlParser { } enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); - final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap); + final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); pairEntry.pairs.add(pair); if (!pairsAdded.add(pair.toString())) { LOG.warning("Duplicate pair: " + pair.toString()); @@ -426,7 +500,7 @@ public class EnWiktionaryXmlParser { // ------------------------------------------------------------------------- - private void doForeignWord(final String title, final String text) { + private void doForeignWord(final String lang, final String title, final String text) { final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); while (wikiTokenizer.nextToken() != null) { if (wikiTokenizer.isHeading()) { @@ -436,7 +510,7 @@ public class EnWiktionaryXmlParser { } else if (headingName.equals("Pronunciation")) { //doPronunciation(wikiLineReader); } else if (partOfSpeechHeader.matcher(headingName).matches()) { - doForeignPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + doForeignPartOfSpeech(lang, title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); } } else { } @@ -462,14 +536,15 @@ public class EnWiktionaryXmlParser { int foreignCount = 0; - private void doForeignPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { + private void doForeignPartOfSpeech(final String lang, String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { if (++foreignCount % 1000 == 0) { - LOG.info("***" + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); + LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); } if (title.equals("moro")) { System.out.println(); } + boolean titleAppended = false; final StringBuilder foreignBuilder = new StringBuilder(); final Collection wordForms = new ArrayList(); final List listSections = new ArrayList(); @@ -511,19 +586,84 @@ public class EnWiktionaryXmlParser { // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) // for the conjugation table from "fa". // Would like to be able to link to a lang#token. - if (isGender(name)) { - appendGender(foreignBuilder, name, args); - } else if (name.equals("wikipedia")) { - namedArgs.remove("lang"); - if (args.size() > 1 || !namedArgs.isEmpty()) { - // Unindexed! - foreignBuilder.append(wikiTokenizer.token()); - } else if (args.size() == 1) { - foreignBuilder.append(wikiTokenizer.token()); - } else { - //foreignBuilder.append(title); - } - } else if (name.equals("it-noun")) { + if (isGender(name)) { + appendGender(foreignBuilder, name, args); + } else if (name.equals("wikipedia")) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + } else if (args.size() == 1) { + foreignBuilder.append(wikiTokenizer.token()); + } else { + //foreignBuilder.append(title); + } + } else if (name.equals("attention") || name.equals("zh-attention")) { + // See: http://en.wiktionary.org/wiki/Template:attention + // Ignore these. + // TODO: head } else if (name.equals("head")) { + } else if (name.equals("infl")) { + // See: http://en.wiktionary.org/wiki/Template:infl + final String langCode = get(args, 0); + String head = namedArgs.remove("head"); + if (head == null) { + head = namedArgs.remove("title"); // Bug + } + if (head == null) { + head = title; + } else { + head = WikiTokenizer.toPlainText(head); + } + titleAppended = true; + + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + + final String tr = namedArgs.remove("tr"); + String g = namedArgs.remove("g"); + if (g == null) { + g = namedArgs.remove("gender"); + } + final String g2 = namedArgs.remove("g2"); + final String g3 = namedArgs.remove("g3"); + + foreignBuilder.append(head); + + if (g != null) { + foreignBuilder.append(" {").append(g); + if (g2 != null) { + foreignBuilder.append("|").append(g2); + } + if (g3 != null) { + foreignBuilder.append("|").append(g3); + } + foreignBuilder.append("}"); + } + + if (tr != null) { + foreignBuilder.append(String.format(TRANSLITERATION_FORMAT, tr)); + wordForms.add(tr); + } + + final String pos = get(args, 1); + if (pos != null) { + foreignBuilder.append(" (").append(pos).append(")"); + } + for (int i = 2; i < args.size(); i += 2) { + final String inflName = get(args, i); + final String inflValue = get(args, i + 1); + foreignBuilder.append(", ").append(WikiTokenizer.toPlainText(inflName)); + if (inflValue != null && inflValue.length() > 0) { + foreignBuilder.append(": ").append(WikiTokenizer.toPlainText(inflValue)); + wordForms.add(inflValue); + } + } + for (final String key : namedArgs.keySet()) { + final String value = WikiTokenizer.toPlainText(namedArgs.get(key)); + foreignBuilder.append(" ").append(key).append("=").append(value); + wordForms.add(value); + } + } else if (name.equals("it-noun")) { + titleAppended = true; final String base = get(args, 0); final String gender = get(args, 1); final String singular = base + get(args, 2); @@ -531,6 +671,9 @@ public class EnWiktionaryXmlParser { foreignBuilder.append(String.format(" %s {%s}, %s {pl}", singular, gender, plural, plural)); wordForms.add(singular); wordForms.add(plural); + if (!namedArgs.isEmpty() || args.size() > 4) { + LOG.warning("Invalid it-noun: " + wikiTokenizer.token()); + } } else if (name.equals("it-proper noun")) { foreignBuilder.append(wikiTokenizer.token()); } else if (name.equals("it-adj")) { @@ -548,7 +691,6 @@ public class EnWiktionaryXmlParser { foreignBuilder.append(wikiTokenizer.token()); // LOG.warning("Unknown function: " + wikiTokenizer.token()); } - } else if (wikiTokenizer.isListItem()) { final String prefix = wikiTokenizer.listItemPrefix(); if (lastListSection != null && @@ -581,8 +723,11 @@ public class EnWiktionaryXmlParser { // Here's where we exit. // Should we make an entry even if there are no foreign list items? String foreign = foreignBuilder.toString().trim(); - if (!foreign.toLowerCase().startsWith(title.toLowerCase())) { - foreign = title + " " + foreign; + if (!titleAppended && !foreign.toLowerCase().startsWith(title.toLowerCase())) { + foreign = String.format("%s %s", title, foreign); + } + if (!langPattern.matcher(lang).matches()) { + foreign = String.format("(%s) %s", lang, foreign); } for (final ListSection listSection : listSections) { doForeignListItem(foreign, title, wordForms, listSection); @@ -597,6 +742,8 @@ public class EnWiktionaryXmlParser { "imperative" ); + // Might only want to remove "lang" if it's equal to "zh", for example. + static final Set USELESS_WIKI_ARGS = new LinkedHashSet(Arrays.asList("lang", "sc", "sort", "cat")); private void doForeignListItem(final String foreignText, String title, final Collection forms, final ListSection listSection) { @@ -613,7 +760,6 @@ public class EnWiktionaryXmlParser { final StringBuilder englishBuilder = new StringBuilder(); final String mainLine = listSection.firstLine; - final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); while (englishTokenizer.nextToken() != null) { // TODO handle form of.... @@ -629,7 +775,7 @@ public class EnWiktionaryXmlParser { enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); } else if (link.contains("#") && this.langPattern.matcher(link).find()) { englishBuilder.append(text); - otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + foreignIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); } else if (link.equals("plural")) { englishBuilder.append(text); } else { @@ -646,20 +792,59 @@ public class EnWiktionaryXmlParser { } } else if (englishTokenizer.isFunction()) { final String name = englishTokenizer.functionName(); - if (name.contains("conjugation of ") || - name.contains("form of ") || - name.contains("feminine of ") || - name.contains("plural of ")) { - // Ignore these in the index, they're really annoying.... - englishBuilder.append(englishTokenizer.token()); + final List args = englishTokenizer.functionPositionArgs(); + final Map namedArgs = englishTokenizer.functionNamedArgs(); + + if ( + name.equals("form of") || + name.contains("conjugation of") || + name.contains("participle of") || + name.contains("gerund of") || + name.contains("feminine of") || + name.contains("plural of")) { + String formName = name; + if (name.equals("form of")) { + formName = remove(args, 0, null); + } + if (formName == null) { + LOG.warning("Missing form name: " + title); + formName = "form of"; + } + String baseForm = get(args, 1, ""); + if ("".equals(baseForm)) { + baseForm = get(args, 0, null); + remove(args, 1, ""); + } else { + remove(args, 0, null); + } + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + WikiTokenizer.appendFunction(englishBuilder.append("{"), formName, args, namedArgs).append("}"); + if (baseForm != null) { + foreignIndexBuilder.addEntryWithString(indexedEntry, baseForm, EntryTypeName.WIKTIONARY_BASE_FORM_SINGLE, EntryTypeName.WIKTIONARY_BASE_FORM_MULTI); + } else { + // null baseForm happens in Danish. + LOG.warning("Null baseform: " + title); + } +// } else if (name.equals("defn")) { + // TODO: test me! + // Do nothing. + // http://en.wiktionary.org/wiki/Wiktionary:Requests_for_deletion/Others#Template:defn + // Redundant, used for the same purpose as {{rfdef}}, but this + // doesn't produce the "This word needs a definition" text. + // Delete or redirect. } else { - englishBuilder.append(englishTokenizer.token()); + namedArgs.keySet().removeAll(USELESS_WIKI_ARGS); + if (args.size() == 0 && namedArgs.isEmpty()) { + englishBuilder.append("{").append(name).append("}"); + } else { + WikiTokenizer.appendFunction(englishBuilder.append("{{"), name, args, namedArgs).append("}}"); + } // LOG.warning("Unexpected function: " + englishTokenizer.token()); } } else { if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { } else { - LOG.warning("Unexpected definition text: " + englishTokenizer.token()); + LOG.warning("Unexpected definition type: " + englishTokenizer.token()); } } } @@ -668,9 +853,9 @@ public class EnWiktionaryXmlParser { if (english.length() > 0) { final Pair pair = new Pair(english, trim(foreignText), this.swap); pairEntry.pairs.add(pair); - otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + foreignIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); for (final String form : forms) { - otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI); + foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTD_FORM_SINGLE, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI); } } @@ -693,7 +878,7 @@ public class EnWiktionaryXmlParser { if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) { final String foreignEx = nextLine.substring(0, dash); final String englishEx = nextLine.substring(dash + mdashLen); - final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, otherIndexBuilder, indexedEntry), swap); + final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, foreignIndexBuilder, indexedEntry), swap); if (pair.lang1 != "--" && pair.lang1 != "--") { pairEntry.pairs.add(pair); } @@ -707,25 +892,28 @@ public class EnWiktionaryXmlParser { } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { if (lastForeign != null && pairEntry.pairs.size() > 0) { pairEntry.pairs.remove(pairEntry.pairs.size() - 1); - final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap); - if (pair.lang1 != "--" && pair.lang1 != "--") { + final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, foreignIndexBuilder, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { pairEntry.pairs.add(pair); } lastForeign = null; } else { LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); - // TODO: add something. + final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); + if (pair.lang1 != "--" || pair.lang2 != "--") { + pairEntry.pairs.add(pair); + } } } else if (nextPrefix.equals("#*")) { // Can't really index these. final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); lastForeign = nextLine; - if (pair.lang1 != "--" && pair.lang1 != "--") { + if (pair.lang1 != "--" || pair.lang2 != "--") { pairEntry.pairs.add(pair); } } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) { final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap); - if (pair.lang1 != "--" && pair.lang1 != "--") { + if (pair.lang1 != "--" || pair.lang2 != "--") { pairEntry.pairs.add(pair); } // } else {