X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FEnToTranslationParser.java;h=73b2db697ce86a8a92179b30baaaeefac38e5dea;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=6c1bbec8ebbbacadfc25f5c17a96f0233d84ac2b;hpb=7d5ada9329d101b59b55691dd2f63ce3e3860011;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java index 6c1bbec..73b2db6 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnToTranslationParser.java @@ -27,204 +27,205 @@ import com.hughes.android.dictionary.parser.WikiTokenizer; public final class EnToTranslationParser extends EnParser { public EnToTranslationParser(final IndexBuilder enIndexBuilder, - final IndexBuilder otherIndexBuilder, final Pattern langPattern, - final Pattern langCodePattern, final boolean swap) { - super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); + final IndexBuilder otherIndexBuilder, final Pattern langPattern, + final Pattern langCodePattern, final boolean swap) { + super(enIndexBuilder, otherIndexBuilder, langPattern, langCodePattern, swap); } @Override void parseSection(String heading, String text) { - if (isIgnorableTitle(title)) { - return; - } - heading = heading.replaceAll("=", "").trim(); - if (!heading.contains("English")) { - return; - } - - String pos = null; - int posDepth = -1; - - final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); - while (wikiTokenizer.nextToken() != null) { - - if (wikiTokenizer.isHeading()) { - final String headerName = wikiTokenizer.headingWikiText(); - - if (wikiTokenizer.headingDepth() <= posDepth) { - pos = null; - posDepth = -1; - } - - if (partOfSpeechHeader.matcher(headerName).matches()) { - posDepth = wikiTokenizer.headingDepth(); - pos = wikiTokenizer.headingWikiText(); - // TODO: if we're inside the POS section, we should handle the first title line... - - } else if (headerName.equals("Translations")) { - if (pos == null) { - LOG.info("Translations without POS (but using anyway): " + title); + if (isIgnorableTitle(title)) { + return; + } + heading = heading.replace("=", "").trim(); + if (!heading.contains("English")) { + return; + } + + String pos = null; + int posDepth = -1; + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(text); + while (wikiTokenizer.nextToken() != null) { + + if (wikiTokenizer.isHeading()) { + final String headerName = wikiTokenizer.headingWikiText(); + + if (wikiTokenizer.headingDepth() <= posDepth) { + pos = null; + posDepth = -1; + } + + if (partOfSpeechHeader.matcher(headerName).matches()) { + posDepth = wikiTokenizer.headingDepth(); + pos = wikiTokenizer.headingWikiText(); + // TODO: if we're inside the POS section, we should handle the first title line... + + } else if (headerName.equals("Translations")) { + if (pos == null) { + LOG.info("Translations without POS (but using anyway): " + title); + } + doTranslations(wikiTokenizer, pos); + } else if (headerName.equals("Pronunciation")) { + //doPronunciation(wikiLineReader); + } + } else if (wikiTokenizer.isFunction()) { + final String name = wikiTokenizer.functionName(); + if (name.equals("head") && pos == null) { + LOG.warning("{{head}} without POS: " + title); + } } - doTranslations(wikiTokenizer, pos); - } else if (headerName.equals("Pronunciation")) { - //doPronunciation(wikiLineReader); - } - } else if (wikiTokenizer.isFunction()) { - final String name = wikiTokenizer.functionName(); - if (name.equals("head") && pos == null) { - LOG.warning("{{head}} without POS: " + title); - } } - } } private void doTranslations(final WikiTokenizer wikiTokenizer, final String pos) { - if (title.equals("absolutely")) { - //System.out.println(); - } - - String topLevelLang = null; - String sense = null; - boolean done = false; - while (wikiTokenizer.nextToken() != null) { - if (wikiTokenizer.isHeading()) { - wikiTokenizer.returnToLineStart(); - return; - } - if (done) { - continue; + if (title.equals("absolutely")) { + //System.out.println(); } - - // Check whether we care about this line: - - if (wikiTokenizer.isFunction()) { - final String functionName = wikiTokenizer.functionName(); - final List positionArgs = wikiTokenizer.functionPositionArgs(); - - if (functionName.equals("trans-top")) { - sense = null; - if (wikiTokenizer.functionPositionArgs().size() >= 1) { - sense = positionArgs.get(0); - sense = WikiTokenizer.toPlainText(sense); - //LOG.info("Sense: " + sense); + + String topLevelLang = null; + String sense = null; + boolean done = false; + while (wikiTokenizer.nextToken() != null) { + if (wikiTokenizer.isHeading()) { + wikiTokenizer.returnToLineStart(); + return; + } + if (done) { + continue; } - } else if (functionName.equals("trans-bottom")) { - sense = null; - } else if (functionName.equals("trans-mid")) { - } else if (functionName.equals("trans-see")) { - incrementCount("WARNING:trans-see"); - } else if (functionName.startsWith("picdic")) { - } else if (functionName.startsWith("checktrans")) { - done = true; - } else if (functionName.startsWith("ttbc")) { - wikiTokenizer.nextLine(); - // TODO: would be great to handle ttbc - // TODO: Check this: done = true; - } else { - LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isListItem()) { - final String line = wikiTokenizer.listItemWikiText(); - // This line could produce an output... - + + // Check whether we care about this line: + + if (wikiTokenizer.isFunction()) { + final String functionName = wikiTokenizer.functionName(); + final List positionArgs = wikiTokenizer.functionPositionArgs(); + + if (functionName.equals("trans-top")) { + sense = null; + if (wikiTokenizer.functionPositionArgs().size() >= 1) { + sense = positionArgs.get(0); + sense = WikiTokenizer.toPlainText(sense); + //LOG.info("Sense: " + sense); + } + } else if (functionName.equals("trans-bottom")) { + sense = null; + } else if (functionName.equals("trans-mid")) { + } else if (functionName.equals("trans-see")) { + incrementCount("WARNING:trans-see"); + } else if (functionName.startsWith("picdic")) { + } else if (functionName.startsWith("checktrans")) { + done = true; + } else if (functionName.startsWith("ttbc")) { + wikiTokenizer.nextLine(); + // TODO: would be great to handle ttbc + // TODO: Check this: done = true; + } else { + LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isListItem()) { + final String line = wikiTokenizer.listItemWikiText(); + // This line could produce an output... + // if (line.contains("ich hoan dich gear")) { // //System.out.println(); // } - - // First strip the language and check whether it matches. - // And hold onto it for sub-lines. - final int colonIndex = line.indexOf(":"); - if (colonIndex == -1) { - continue; - } - - final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); - incrementCount("tCount:" + lang); - final boolean appendLang; - if (wikiTokenizer.listItemPrefix().length() == 1) { - topLevelLang = lang; - final boolean thisFind = langPattern.matcher(lang).find(); - if (!thisFind) { - continue; - } - appendLang = !langPattern.matcher(lang).matches(); - } else if (topLevelLang == null) { - continue; - } else { - // Two-level -- the only way we won't append is if this second level matches exactly. - if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { - continue; + + // First strip the language and check whether it matches. + // And hold onto it for sub-lines. + final int colonIndex = line.indexOf(":"); + if (colonIndex == -1) { + continue; + } + + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + incrementCount("tCount:" + lang); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { + continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } + + String rest = line.substring(colonIndex + 1).trim(); + if (rest.length() > 0) { + doTranslationLine(line, appendLang ? lang : null, pos, sense, rest); + } + + } else if (wikiTokenizer.remainderStartsWith("''See''")) { + wikiTokenizer.nextLine(); + incrementCount("WARNING: ''See''" ); + LOG.fine("Skipping See line: " + wikiTokenizer.token()); + } else if (wikiTokenizer.isWikiLink()) { + final String wikiLink = wikiTokenizer.wikiLinkText(); + if (wikiLink.contains(":") && wikiLink.contains(title)) { + } else if (wikiLink.contains("Category:")) { + } else { + incrementCount("WARNING: Unexpected wikiLink" ); + LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); + } + } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { + } else { + final String token = wikiTokenizer.token(); + if (token.equals("----")) { + } else { + LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); + incrementCount("WARNING: Unexpected translation token" ); + } } - appendLang = !langPattern.matcher(lang).matches(); - } - - String rest = line.substring(colonIndex + 1).trim(); - if (rest.length() > 0) { - doTranslationLine(line, appendLang ? lang : null, pos, sense, rest); - } - - } else if (wikiTokenizer.remainderStartsWith("''See''")) { - wikiTokenizer.nextLine(); - incrementCount("WARNING: ''See''" ); - LOG.fine("Skipping See line: " + wikiTokenizer.token()); - } else if (wikiTokenizer.isWikiLink()) { - final String wikiLink = wikiTokenizer.wikiLinkText(); - if (wikiLink.contains(":") && wikiLink.contains(title)) { - } else if (wikiLink.contains("Category:")) { - } else { - incrementCount("WARNING: Unexpected wikiLink" ); - LOG.warning("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title); - } - } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) { - } else { - final String token = wikiTokenizer.token(); - if (token.equals("----")) { - } else { - LOG.warning("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title); - incrementCount("WARNING: Unexpected translation token" ); - } + } - - } } - + private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) { - state = State.TRANSLATION_LINE; - // Good chance we'll actually file this one... - final PairEntry pairEntry = new PairEntry(entrySource); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - final StringBuilder foreignText = new StringBuilder(); - appendAndIndexWikiCallback.reset(foreignText, indexedEntry); - appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - - if (foreignText.length() == 0) { - LOG.warning("Empty foreignText: " + line); - incrementCount("WARNING: Empty foreignText" ); - return; - } - - if (lang != null) { - foreignText.insert(0, String.format("(%s) ", lang)); - } - - StringBuilder englishText = new StringBuilder(); - - englishText.append(title); - if (sense != null) { - englishText.append(" (").append(sense).append(")"); - enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); - } - if (pos != null) { - englishText.append(" (").append(pos.toLowerCase()).append(")"); - } - enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); - - final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); - pairEntry.pairs.add(pair); - if (!pairsAdded.add(pair.toString())) { - LOG.warning("Duplicate pair: " + pair.toString()); - incrementCount("WARNING: Duplicate pair" ); - } + state = State.TRANSLATION_LINE; + // Good chance we'll actually file this one... + final PairEntry pairEntry = new PairEntry(entrySource); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + indexedEntry.isValid = true; + + final StringBuilder foreignText = new StringBuilder(); + appendAndIndexWikiCallback.reset(foreignText, indexedEntry); + appendAndIndexWikiCallback.dispatch(rest, foreignIndexBuilder, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); + + if (foreignText.length() == 0) { + LOG.warning("Empty foreignText: " + line); + incrementCount("WARNING: Empty foreignText" ); + return; + } + + if (lang != null) { + foreignText.insert(0, String.format("(%s) ", lang)); + } + + StringBuilder englishText = new StringBuilder(); + + englishText.append(title); + if (sense != null) { + englishText.append(" (").append(sense).append(")"); + enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); + } + if (pos != null) { + englishText.append(" (").append(pos.toLowerCase()).append(")"); + } + enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + final Pair pair = new Pair(trim(englishText.toString()), trim(foreignText.toString()), swap); + pairEntry.pairs.add(pair); + if (!pairsAdded.add(pair.toString())) { + LOG.warning("Duplicate pair: " + pair); + incrementCount("WARNING: Duplicate pair" ); + } } - } // EnToTranslationParser \ No newline at end of file +} // EnToTranslationParser