From ca23379690d2ec13909fc52044d8d65166bde27c Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Mon, 19 Dec 2011 13:10:13 -0800 Subject: [PATCH] Fixed handling of non top level languages inside Translations section. --- .../engine/DictionaryBuilderTest.java | 76 ++++++++++--------- .../parser/EnWiktionaryXmlParser.java | 37 ++++++--- 2 files changed, 70 insertions(+), 43 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 8059a1e..99e4e84 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -33,51 +33,60 @@ public class DictionaryBuilderTest extends TestCase { public static final String TEST_OUTPUTS = "testdata/outputs/"; - public void testWiktionaryItalianFromItalian() throws Exception { - final String name = "wiktionary.it_it.quickdic"; - final File result = new File(TEST_OUTPUTS + name); - System.out.println("Writing to: " + result); - DictionaryBuilder.main(new String[] { - "--dictOut=" + result.getAbsolutePath(), - "--lang1=IT", - "--lang2=EN", - "--lang1Stoplist=" + STOPLISTS + "it.txt", - "--lang2Stoplist=" + STOPLISTS + "en.txt", - "--dictInfo=SomeWikiData", + // Chinese + public void testWiktionary_ZH_ZH() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.zh_zh.quickdic", "ZH", "empty.txt", + "ZH.data", "enwiktionary.chinese", "Chinese|Mandarin|Cantonese", "zh"); + } - "--input4=" + WIKISPLIT + "IT.data", - "--input4Name=enwiktionary.italian", - "--input4Format=enwiktionary", - "--input4LangPattern=Italian", - "--input4LangCodePattern=it", - "--input4EnIndex=2", - "--input4PageLimit=1000", + public void testWiktionary_ZH_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt", + "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantonese", "zh"); + } - "--print=" + result.getPath() + ".text", - }); - - checkGolden(name, result); + + // German + public void testWiktionary_DE_DE() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt", + "DE.data", "enwiktionary.german", "German", "it"); + } + + public void testWiktionary_DE_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt", + "EN.data", "enwiktionary.english", "German", "it"); + } + + // Italian + public void testWiktionary_IT_IT() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt", + "IT.data", "enwiktionary.italian", "Italian", "it"); } - public void testWiktionaryItalianFromEnglish() throws Exception { - final String name = "wiktionary.it_en.quickdic"; + public void testWiktionary_IT_EN() throws Exception { + wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt", + "EN.data", "enwiktionary.english", "Italian", "it"); + } + + public void wiktionaryTestWithLangToEn(final String name, final String lang1, + final String stoplist, final String data, final String dictName, + final String langPattern, final String langCode) throws Exception { final File result = new File(TEST_OUTPUTS + name); System.out.println("Writing to: " + result); DictionaryBuilder.main(new String[] { "--dictOut=" + result.getAbsolutePath(), - "--lang1=IT", + "--lang1=" + lang1, "--lang2=EN", - "--lang1Stoplist=" + STOPLISTS + "it.txt", + "--lang1Stoplist=" + STOPLISTS + stoplist, "--lang2Stoplist=" + STOPLISTS + "en.txt", "--dictInfo=SomeWikiData", - "--input3=" + WIKISPLIT + "EN.data", - "--input3Name=enwiktionary.english", - "--input3Format=enwiktionary", - "--input3LangPattern=Italian", - "--input3LangCodePattern=it", - "--input3EnIndex=2", - "--input3PageLimit=1000", + "--input4=" + WIKISPLIT + data, + "--input4Name=" + dictName, + "--input4Format=enwiktionary", + "--input4LangPattern=" + langPattern, + "--input4LangCodePattern=" + langCode, + "--input4EnIndex=2", + "--input4PageLimit=1000", "--print=" + result.getPath() + ".text", }); @@ -85,7 +94,6 @@ public class DictionaryBuilderTest extends TestCase { checkGolden(name, result); } - public void testGermanCombined() throws Exception { final String name = "de-en.quickdic"; final File result = new File(TEST_OUTPUTS + name); diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 6a6f438..a7bf170 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -165,9 +165,10 @@ public class EnWiktionaryXmlParser { private void doTranslations(final String title, final WikiTokenizer wikiTokenizer, final String pos) { if (title.equals("absolutely")) { - System.out.println(); + //System.out.println(); } + String topLevelLang = null; String sense = null; boolean done = false; while (wikiTokenizer.nextToken() != null) { @@ -181,8 +182,6 @@ public class EnWiktionaryXmlParser { // Check whether we care about this line: - //line = WikiLineReader.removeSquareBrackets(line); - if (wikiTokenizer.isFunction()) { final String functionName = wikiTokenizer.functionName(); final List positionArgs = wikiTokenizer.functionPositionArgs(); @@ -213,6 +212,10 @@ public class EnWiktionaryXmlParser { final String line = wikiTokenizer.listItemWikiText(); // This line could produce an output... + if (line.contains("ich hoan dich gear")) { + System.out.println(); + } + // First strip the language and check whether it matches. // And hold onto it for sub-lines. final int colonIndex = line.indexOf(":"); @@ -220,16 +223,28 @@ public class EnWiktionaryXmlParser { continue; } - final String lang = line.substring(0, colonIndex); - if (!this.langPattern.matcher(lang).find()) { + final String lang = trim(WikiTokenizer.toPlainText(line.substring(0, colonIndex))); + final boolean appendLang; + if (wikiTokenizer.listItemPrefix().length() == 1) { + topLevelLang = lang; + final boolean thisFind = langPattern.matcher(lang).find(); + if (!thisFind) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); + } else if (topLevelLang == null) { continue; + } else { + // Two-level -- the only way we won't append is if this second level matches exactly. + if (!langPattern.matcher(lang).matches() && !langPattern.matcher(topLevelLang).find()) { + continue; + } + appendLang = !langPattern.matcher(lang).matches(); } String rest = line.substring(colonIndex + 1).trim(); if (rest.length() > 0) { - doTranslationLine(line, title, pos, sense, rest); - } else { - // TODO: do lines that are like "Greek:" + doTranslationLine(line, appendLang ? lang : null, title, pos, sense, rest); } } else if (wikiTokenizer.remainderStartsWith("''See''")) { @@ -258,7 +273,7 @@ public class EnWiktionaryXmlParser { return index < list.size() ? list.get(index) : null; } - private void doTranslationLine(final String line, final String title, final String pos, final String sense, final String rest) { + private void doTranslationLine(final String line, final String lang, final String title, final String pos, final String sense, final String rest) { // Good chance we'll actually file this one... final PairEntry pairEntry = new PairEntry(); final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); @@ -363,6 +378,10 @@ public class EnWiktionaryXmlParser { return; } + if (lang != null) { + otherText.insert(0, "(" + lang + ") "); + } + StringBuilder englishText = new StringBuilder(); englishText.append(title); -- 2.43.0