From: Thad Hughes Date: Wed, 14 Dec 2011 19:09:34 +0000 (-0800) Subject: Reworking handling of foreign section. X-Git-Url: http://gitweb.fperrin.net/?a=commitdiff_plain;h=d49dab6bd67bd257fc05dba9f288c5ada3bd2071;p=DictionaryPC.git Reworking handling of foreign section. --- diff --git a/bugs b/bugs new file mode 100644 index 0000000..b146b34 --- /dev/null +++ b/bugs @@ -0,0 +1,7 @@ + Alfredo {{it-proper noun|g=m}} :: , equivalent to English Alfred. + + +in wiktionary + futurismo :: futurism () (noun) + + \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 90c4b88..cf5a5d1 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -42,9 +42,9 @@ public class DictionaryBuilderMain extends TestCase { //new Lang("^German$", "DE"), }; Lang[] langs2 = new Lang[] { -// new Lang("^.*Greek.*$", "EL"), - new Lang("^.*Spanish.*$", "ES"), new Lang("^.*Italian.*$", "IT"), + new Lang("^.*Greek.*$", "EL"), + new Lang("^.*Spanish.*$", "ES"), /* new Lang("^German$", "DE"), new Lang("^Afrikaans$", "AF"), diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 77b240a..c6dcccb 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -15,6 +15,7 @@ package com.hughes.android.dictionary.engine; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.io.RandomAccessFile; @@ -30,10 +31,10 @@ public class DictionaryBuilderTest extends TestCase { public static final String GOLDENS = "../DictionaryData/testdata/goldens/"; public static final String TEST_OUTPUTS = "../DictionaryData/testdata/outputs/"; - public static final String OUTPUTS = "../DictionaryData/outputs/"; - public void testWiktionaryItalian() throws Exception { - final File result = new File(TEST_OUTPUTS + "wiktionary.it.quickdic"); + public void testWiktionaryItalianFromItalian() throws Exception { + final String name = "wiktionary.it_it.quickdic"; + final File result = new File(TEST_OUTPUTS + name); System.out.println("Writing to: " + result); DictionaryBuilder.main(new String[] { "--dictOut=" + result.getAbsolutePath(), @@ -41,15 +42,6 @@ public class DictionaryBuilderTest extends TestCase { "--lang2=EN", "--dictInfo=SomeWikiData", - /* - "--input3=" + WIKISPLIT + "english.data", - "--input3Name=enwiktionary.english", - "--input3Format=enwiktionary", - "--input3LangPattern=Italian", - "--input3LangCodePattern=it", - "--input3EnIndex=2", - "--input3PageLimit=1000", -*/ "--input4=" + WIKISPLIT + "italian.data", "--input4Name=enwiktionary.italian", "--input4Format=enwiktionary", @@ -58,24 +50,40 @@ public class DictionaryBuilderTest extends TestCase { "--input4EnIndex=2", "--input4PageLimit=1000", - "--print=" + result.getName() + ".text", + "--print=" + result.getPath() + ".text", }); - // Check it once: - assertFilesEqual(GOLDENS + "wiktionary.it_it.quickdic.text", result.getName() + ".text"); - - // Check it again. - final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r")); - final PrintStream out = new PrintStream(new File(result.getName() + ".text")); - dict.print(out); - out.close(); + checkGolden(name, result); + } + + public void testWiktionaryItalianFromEnglish() throws Exception { + final String name = "wiktionary.it_en.quickdic"; + final File result = new File(TEST_OUTPUTS + name); + System.out.println("Writing to: " + result); + DictionaryBuilder.main(new String[] { + "--dictOut=" + result.getAbsolutePath(), + "--lang1=IT", + "--lang2=EN", + "--dictInfo=SomeWikiData", + + "--input3=" + WIKISPLIT + "english.data", + "--input3Name=enwiktionary.english", + "--input3Format=enwiktionary", + "--input3LangPattern=Italian", + "--input3LangCodePattern=it", + "--input3EnIndex=2", + "--input3PageLimit=1000", + + "--print=" + result.getPath() + ".text", + }); - assertFilesEqual(GOLDENS + "wiktionary.it_it.quickdic.text", result.getName() + ".text"); + checkGolden(name, result); } public void testGermanCombined() throws Exception { - final File result = new File(TEST_OUTPUTS + "de-en.quickdic"); + final String name = "de-en.quickdic"; + final File result = new File(TEST_OUTPUTS + name); System.out.println("Writing to: " + result); DictionaryBuilder.main(new String[] { "--dictOut=" + result.getAbsolutePath(), @@ -93,23 +101,26 @@ public class DictionaryBuilderTest extends TestCase { "--input2Charset=UTF8", "--input2Format=dictcc", - "--print=" + result.getName() + ".text", + "--print=" + result.getPath() + ".text", }); + checkGolden(name, result); + } + + private void checkGolden(final String dictName, final File dictFile) + throws IOException, FileNotFoundException { // Check it once: - assertFilesEqual(GOLDENS + "de-en.quickdic.text", result.getName() + ".text"); - + assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); + // Check it again. - final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r")); - final PrintStream out = new PrintStream(result.getName() + ".text"); + final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r")); + final PrintStream out = new PrintStream(new File(dictFile.getName() + ".text")); dict.print(out); out.close(); - - assertFilesEqual(GOLDENS + "de-en.quickdic.text", result.getName() + ".text"); + assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text"); } - void assertFilesEqual(final String expected, final String actual) throws IOException { final String expectedString = FileUtil.readToString(new File(expected)); final String actualString = FileUtil.readToString(new File(actual)); diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index 1d6300b..0451c7e 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -21,13 +21,12 @@ import java.util.concurrent.atomic.AtomicBoolean; import junit.framework.TestCase; import com.hughes.android.dictionary.engine.Index.IndexEntry; -import com.hughes.android.dictionary.engine.PairEntry.Row; public class DictionaryTest extends TestCase { static final String TEST_OUTPUTS = com.hughes.android.dictionary.engine.DictionaryBuilderTest.TEST_OUTPUTS; - static final String OUTPUTS = com.hughes.android.dictionary.engine.DictionaryBuilderTest.OUTPUTS; + public static final String OUTPUTS = "../DictionaryData/outputs/"; @Override protected void setUp() { diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index f6204f6..54a11e3 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -162,6 +162,10 @@ public class EnWiktionaryXmlParser { "yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j")); private void doTranslations(final String title, final WikiTokenizer wikiTokenizer) { + if (title.equals("absolutely")) { + System.out.println(); + } + String sense = null; boolean done = false; while (wikiTokenizer.nextToken() != null) { @@ -304,17 +308,13 @@ public class EnWiktionaryXmlParser { if (!namedArgs.isEmpty() || args.size() > 1) { LOG.warning("weird qualifier: " + line); } + // Unindexed! otherText.append("(").append(qualifier).append(")"); } else if (encodings.contains(functionName)) { otherText.append("").append(args.get(0)); otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); - } else if (functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p")) { - otherText.append("{"); - otherText.append(functionName); - for (int i = 0; i < args.size(); ++i) { - otherText.append("|").append(args.get(i)); - } - otherText.append("}"); + } else if (isGender(functionName)) { + appendGender(otherText, functionName, args); } else if (functionName.equals("g")) { otherText.append("{g}"); } else if (functionName.equals("l")) { @@ -333,15 +333,15 @@ public class EnWiktionaryXmlParser { otherText.append("[").append(args.get(0)).append("]"); otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT); } else if (functionName.equals("ttbc")) { + LOG.warning("Unexpected {{ttbc}}"); } else if (functionName.equals("trreq")) { } else if (functionName.equals("not used")) { otherText.append("(not used)"); } else if (functionName.equals("t-image")) { // American sign language - } else if (args.isEmpty() && namedArgs.isEmpty()) { - otherText.append("{UNK. FUNC.: ").append(functionName).append("}"); } else { - LOG.warning("Unexpected t+- wikifunction: " + line + ", title=" + title); + // Unindexed! + otherText.append(wikiTokenizer.token()); } } else if (wikiTokenizer.isNewline()) { @@ -379,11 +379,21 @@ public class EnWiktionaryXmlParser { } } - - static final Pattern whitespace = Pattern.compile("\\s+"); - static String trim(final String s) { - return whitespace.matcher(s).replaceAll(" ").trim(); + + private void appendGender(final StringBuilder otherText, + final String functionName, final List args) { + otherText.append("{"); + otherText.append(functionName); + for (int i = 0; i < args.size(); ++i) { + otherText.append("|").append(args.get(i)); + } + otherText.append("}"); + } + + + private boolean isGender(final String functionName) { + return functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p"); } Set pairsAdded = new LinkedHashSet(); @@ -400,20 +410,44 @@ public class EnWiktionaryXmlParser { } else if (headingName.equals("Pronunciation")) { //doPronunciation(wikiLineReader); } else if (partOfSpeechHeader.matcher(headingName).matches()) { - doPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); + doForeignPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); } } else { } } } + + static final class ListSection { + final String firstPrefix; + final String firstLine; + final List nextPrefixes = new ArrayList(); + final List nextLines = new ArrayList(); + + public ListSection(String firstPrefix, String firstLine) { + this.firstPrefix = firstPrefix; + this.firstLine = firstLine; + } + @Override + public String toString() { + return firstPrefix + firstLine + "{ " + nextPrefixes + "}"; + } + } - private void doPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { + + private void doForeignPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { LOG.info("***" + title + ", pos=" + posHeading); - //final StringBuilder foreignBuilder = new StringBuilder(); + if (title.equals("moro")) { + System.out.println(); + } + + final StringBuilder foreignBuilder = new StringBuilder(); + Collection wordForms = Collections.emptyList(); + final List listSections = new ArrayList(); + + try { - String side = null; - Collection forms = Collections.emptyList(); + ListSection lastListSection = null; int currentHeadingDepth = posDepth; while (wikiTokenizer.nextToken() != null) { @@ -448,18 +482,30 @@ public class EnWiktionaryXmlParser { // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!) // for the conjugation table from "fa". // Would like to be able to link to a lang#token. - if (name.equals("it-noun")) { - assert forms.isEmpty(); + if (isGender(name)) { + appendGender(foreignBuilder, name, args); + } else if (name.equals("wikipedia")) { + namedArgs.remove("lang"); + if (args.size() > 1 || !namedArgs.isEmpty()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + } else if (args.size() == 1) { + foreignBuilder.append(wikiTokenizer.token()); + } else { + //foreignBuilder.append(title); + } + } else if (name.equals("it-noun")) { + assert wordForms.isEmpty(); final String base = get(args, 0); final String gender = get(args, 1); final String singular = base + get(args, 2); final String plural = base + get(args, 3); - side = String.format("%s {%s}, %s {pl}", singular, gender, plural, plural); - forms = Arrays.asList(singular, plural); + foreignBuilder.append(String.format("%s {%s}, %s {pl}", singular, gender, plural, plural)); + wordForms = Arrays.asList(singular, plural); } else if (name.equals("it-proper noun")) { - // TODO + foreignBuilder.append(wikiTokenizer.token()); } else if (name.equals("it-adj")) { - // TODO + foreignBuilder.append(wikiTokenizer.token()); } else if (name.startsWith("it-conj")) { if (name.equals("it-conj-are")) { itConjAre(args, namedArgs); @@ -468,22 +514,132 @@ public class EnWiktionaryXmlParser { } else { LOG.warning("Unknown conjugation: " + wikiTokenizer.token()); } - } else { - LOG.warning("Unknown function: " + wikiTokenizer.token()); + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + // LOG.warning("Unknown function: " + wikiTokenizer.token()); } } else if (wikiTokenizer.isListItem()) { - handleForeignListItem(side != null ? side : title, title, forms, wikiTokenizer); - + final String prefix = wikiTokenizer.listItemPrefix(); + if (lastListSection != null && + prefix.startsWith(lastListSection.firstPrefix) && + prefix.length() > lastListSection.firstPrefix.length()) { + lastListSection.nextPrefixes.add(prefix); + lastListSection.nextLines.add(wikiTokenizer.listItemWikiText()); + } else { + lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText()); + listSections.add(lastListSection); + } + } else if (lastListSection != null) { + // Don't append anything after the lists, because there's crap. } else if (wikiTokenizer.isWikiLink()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.wikiLinkText()); + + } else if (wikiTokenizer.isPlainText()) { + // Unindexed! + foreignBuilder.append(wikiTokenizer.token()); + + } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) { + // Do nothing. + } else { + LOG.warning("Unexpected token: " + wikiTokenizer.token()); + } + } + + } finally { + // Here's where we exit. + // TODO: Should we make an entry even if there are no foreign list items? + if (foreignBuilder.indexOf(title) == -1) { + foreignBuilder.insert(0, title + " "); + } + for (final ListSection listSection : listSections) { + doForeignListItem(foreignBuilder.toString(), title, wordForms, listSection); + } + } + } + + + static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile( + "(first|second|third)-person (singular|plural)|" + + "present tense|" + + "imperative" + ); + + private void doForeignListItem(final String foreignText, String title, final Collection forms, final ListSection listSection) { + + final String prefix = listSection.firstPrefix; + if (prefix.length() > 1) { + LOG.warning("Prefix too long: " + listSection); + return; + } + + final PairEntry pairEntry = new PairEntry(); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + + final StringBuilder englishBuilder = new StringBuilder(); + + final String mainLine = listSection.firstLine; + + final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); + while (englishTokenizer.nextToken() != null) { + // TODO handle form of.... + if (englishTokenizer.isPlainText()) { + englishBuilder.append(englishTokenizer.token()); + enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF); + } else if (englishTokenizer.isWikiLink()) { + final String text = englishTokenizer.wikiLinkText(); + final String link = englishTokenizer.wikiLinkDest(); + if (link != null) { + if (link.contains("#English")) { + englishBuilder.append(text); + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } else if (link.contains("#") && this.langPattern.matcher(link).find()) { + englishBuilder.append(text); + otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else { + LOG.warning("Special link: " + englishTokenizer.token()); + // TODO: something here... + } + } else { + // link == null + englishBuilder.append(text); + if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) { + enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); + } + } + } else if (englishTokenizer.isFunction()) { + final String name = englishTokenizer.functionName(); + if (name.contains(" conjugation of ") || + name.contains(" form of ") || + name.contains(" feminine of ") || + name.contains(" plural of ")) { + // Ignore these in the index, they're really annoying.... + englishBuilder.append(englishTokenizer.token()); + } else { + LOG.warning("Unexpected function: " + englishTokenizer.token()); + } } else { + if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { + } else { + LOG.warning("Unexpected definition text: " + englishTokenizer.token()); + } + } + } + final String english = trim(englishBuilder.toString()); + if (english.length() > 0) { + final Pair pair = new Pair(english, trim(foreignText), this.swap); + pairEntry.pairs.add(pair); + otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + for (final String form : forms) { + otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI); } - } } + private void itConjAre(List args, Map namedArgs) { final String base = args.get(0); final String aux = args.get(1); @@ -579,107 +735,9 @@ public class EnWiktionaryXmlParser { } } - final List listPrefixes = new ArrayList(); - final List listLines = new ArrayList(); - -static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile( - "(first|second|third)-person (singular|plural)|" + - "present tense|" + - "imperative" - ); - - private void handleForeignListItem(final String foreignText, String title, final Collection forms, final WikiTokenizer wikiTokenizer) { - - final String prefix = wikiTokenizer.listItemPrefix(); - if (prefix.length() > 1) { - LOG.warning("Prefix too long: " + wikiTokenizer.token()); - return; - } - - listPrefixes.clear(); - listLines.clear(); - listPrefixes.add(prefix); - listLines.add(wikiTokenizer.listItemWikiText()); - while(wikiTokenizer.nextToken() != null && - wikiTokenizer.isNewline() || - wikiTokenizer.isComment() || - (wikiTokenizer.isListItem() && - wikiTokenizer.listItemPrefix().length() > prefix.length() && - wikiTokenizer.listItemPrefix().startsWith(prefix))) { - if (wikiTokenizer.isListItem()) { - listPrefixes.add(wikiTokenizer.listItemPrefix()); - listLines.add(wikiTokenizer.listItemWikiText()); - } - } - if (wikiTokenizer.nextToken() != null) { - wikiTokenizer.returnToLineStart(); - } - LOG.info("list lines: " + listLines); - LOG.info("list prefixes: " + listPrefixes); - - final PairEntry pairEntry = new PairEntry(); - final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); - - final String foreign = trim(title); - - final StringBuilder englishBuilder = new StringBuilder(); - - final String mainLine = listLines.get(0); - - final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false); - while (englishTokenizer.nextToken() != null) { - // TODO handle form of.... - if (englishTokenizer.isPlainText()) { - englishBuilder.append(englishTokenizer.token()); - enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF); - } else if (englishTokenizer.isWikiLink()) { - final String text = englishTokenizer.wikiLinkText(); - final String link = englishTokenizer.wikiLinkDest(); - if (link != null) { - if (link.contains("#English")) { - englishBuilder.append(text); - enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); - } else if (link.contains("#") && this.langPattern.matcher(link).find()) { - englishBuilder.append(text); - otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); - } else { - LOG.warning("Special link: " + englishTokenizer.token()); - // TODO: something here... - } - } else { - // link == null - englishBuilder.append(text); - if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) { - enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK); - } - } - } else if (englishTokenizer.isFunction()) { - final String name = englishTokenizer.functionName(); - if (name.contains(" conjugation of ") || - name.contains(" form of ") || - name.contains(" feminine of ") || - name.contains(" plural of ")) { - // Ignore these in the index, they're really annoying.... - englishBuilder.append(englishTokenizer.token()); - } else { - LOG.warning("Unexpected function: " + englishTokenizer.token()); - } - } else { - if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { - } else { - LOG.warning("Unexpected definition text: " + englishTokenizer.token()); - } - } - } - final String english = trim(englishBuilder.toString()); - if (english.length() > 0) { - final Pair pair = new Pair(english, trim(foreignText), this.swap); - pairEntry.pairs.add(pair); - otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); - for (final String form : forms) { - otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI); - } - } + static final Pattern whitespace = Pattern.compile("\\s+"); + static String trim(final String s) { + return whitespace.matcher(s).replaceAll(" ").trim(); }