From 5153e690290dad796114968254d16e8cb542f7af Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Wed, 14 Dec 2011 12:04:30 -0800 Subject: [PATCH] Fixing. --- .../engine/DictionaryBuilderMain.java | 69 ++++++++++--------- .../parser/EnWiktionaryXmlParser.java | 30 ++++---- 2 files changed, 55 insertions(+), 44 deletions(-) diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index cf5a5d1..2369915 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -25,12 +25,17 @@ import junit.framework.TestCase; public class DictionaryBuilderMain extends TestCase { + static final String INPUTS = "../DictionaryData/inputs/"; + static final String OUTPUTS = "../DictionaryData/outputs/"; + static class Lang { final String nameRegex; - final String code; - public Lang(String nameRegex, String code) { + final String isoCode; + final String wikiSplit; + public Lang(String nameRegex, String code, final String wikiSplit) { this.nameRegex = nameRegex; - this.code = code; + this.isoCode = code; + this.wikiSplit = wikiSplit; } } @@ -38,13 +43,12 @@ public class DictionaryBuilderMain extends TestCase { public static void main(final String[] args) throws Exception { Lang[] langs1 = new Lang[] { - new Lang("^English$", "EN"), - //new Lang("^German$", "DE"), + new Lang("^English$", "EN", null), }; Lang[] langs2 = new Lang[] { - new Lang("^.*Italian.*$", "IT"), - new Lang("^.*Greek.*$", "EL"), - new Lang("^.*Spanish.*$", "ES"), + new Lang("^.*Italian.*$", "IT", "italian.data"), + new Lang("^.*Greek.*$", "EL", "greek.data"), + new Lang("^.*Spanish.*$", "ES", "spanish.data"), /* new Lang("^German$", "DE"), new Lang("^Afrikaans$", "AF"), @@ -97,29 +101,36 @@ public class DictionaryBuilderMain extends TestCase { int enIndex = -1; Lang nonEnglish = null; - if (lang2.code.equals("EN")) { + if (lang2.isoCode.equals("EN")) { enIndex = 2; nonEnglish = lang1; } - if (lang1.code.equals("EN")) { + if (lang1.isoCode.equals("EN")) { enIndex = 1; nonEnglish = lang2; } assert nonEnglish != null; - final String dictFile = String.format("dictOutputs/%s-%s_enwiktionary.quickdic", lang1.code, lang2.code); + final String dictFile = String.format(OUTPUTS + "/%s-%s_enwiktionary.quickdic", lang1.isoCode, lang2.isoCode); System.out.println("building dictFile: " + dictFile); DictionaryBuilder.main(new String[] { String.format("--dictOut=%s", dictFile), - String.format("--lang1=%s", lang1.code), - String.format("--lang2=%s", lang2.code), - String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.code, lang2.code), - - "--input3=wikiSplit/english.data", + String.format("--lang1=%s", lang1.isoCode), + String.format("--lang2=%s", lang2.isoCode), + String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.isoCode, lang2.isoCode), + + "--input2=" + INPUTS + "wikiSplit/" + nonEnglish.wikiSplit, + "--input2Name=enwiktionary." + nonEnglish.wikiSplit, + "--input2Format=enwiktionary", + "--input2LangPattern=" + nonEnglish.nameRegex, + "--input2LangCodePattern=" + nonEnglish.isoCode.toLowerCase(), + "--input2EnIndex=" + enIndex, + + "--input3=" + INPUTS + "wikiSplit/english.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", "--input3LangPattern=" + nonEnglish.nameRegex, - "--input3LangCodePattern=" + (enIndex == 1 ? lang2.code : lang1.code).toLowerCase(), + "--input3LangCodePattern=" + (enIndex == 1 ? lang2.isoCode : lang1.isoCode).toLowerCase(), "--input3EnIndex=" + enIndex, }); @@ -127,7 +138,7 @@ public class DictionaryBuilderMain extends TestCase { // Print the entries for diffing. final RandomAccessFile raf = new RandomAccessFile(new File(dictFile), "r"); final Dictionary dict = new Dictionary(raf); - final PrintWriter textOut = new PrintWriter(new File(dictFile + ".txt")); + final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text")); final List sorted = new ArrayList(dict.pairEntries); Collections.sort(sorted); for (final PairEntry pairEntry : sorted) { @@ -140,40 +151,34 @@ public class DictionaryBuilderMain extends TestCase { } // langs1 DictionaryBuilder.main(new String[] { - "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic", + "--dictOut=" + OUTPUTS + "DE-EN_chemnitz.quickdic", "--lang1=DE", "--lang2=EN", - "--dictInfo=@dictInputs/de-en_chemnitz.info", + "--dictInfo=@" + INPUTS + "de-en_chemnitz.info", - "--input1=dictInputs/de-en_chemnitz.txt", + "--input1=" + INPUTS + "de-en_chemnitz.txt", "--input1Name=chemnitz", "--input1Charset=UTF8", "--input1Format=chemnitz", }); DictionaryBuilder.main(new String[] { - "--dictOut=dictOutputs/de-en_all.quickdic", + "--dictOut=" + OUTPUTS + "de-en_all.quickdic", "--lang1=DE", "--lang2=EN", - "--dictInfo=@dictInputs/de-en_all.info", + "--dictInfo=@" + INPUTS + "de-en_all.info", - "--input2=dictInputs/de-en_chemnitz.txt", + "--input2=" + INPUTS + "de-en_chemnitz.txt", "--input2Name=dictcc", "--input2Charset=UTF8", "--input2Format=chemnitz", - "--input3=dictInputs/de-en_dictcc.txt", + "--input3=" + INPUTS + "/copyrighted/de-en_dictcc.txt", "--input3Name=dictcc", "--input3Charset=UTF8", "--input3Format=dictcc", - "--input1=dictInputs/enwiktionary-20101015-pages-articles", - "--input1Name=enwiktionary", - "--input1Format=enwiktionary", - "--input1TranslationPattern1=^German$", - "--input1TranslationPattern2=^English$", - "--input1EnIndex=2", - + // TODO: wiktionary }); } diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 54a11e3..6eac29d 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -435,14 +435,17 @@ public class EnWiktionaryXmlParser { } + int foreignCount = 0; private void doForeignPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { - LOG.info("***" + title + ", pos=" + posHeading); + if (++foreignCount % 1000 == 0) { + LOG.info("***" + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount); + } if (title.equals("moro")) { System.out.println(); } final StringBuilder foreignBuilder = new StringBuilder(); - Collection wordForms = Collections.emptyList(); + final Collection wordForms = new ArrayList(); final List listSections = new ArrayList(); try { @@ -495,13 +498,13 @@ public class EnWiktionaryXmlParser { //foreignBuilder.append(title); } } else if (name.equals("it-noun")) { - assert wordForms.isEmpty(); final String base = get(args, 0); final String gender = get(args, 1); final String singular = base + get(args, 2); final String plural = base + get(args, 3); - foreignBuilder.append(String.format("%s {%s}, %s {pl}", singular, gender, plural, plural)); - wordForms = Arrays.asList(singular, plural); + foreignBuilder.append(String.format(" %s {%s}, %s {pl}", singular, gender, plural, plural)); + wordForms.add(singular); + wordForms.add(plural); } else if (name.equals("it-proper noun")) { foreignBuilder.append(wikiTokenizer.token()); } else if (name.equals("it-adj")) { @@ -599,9 +602,11 @@ public class EnWiktionaryXmlParser { } else if (link.contains("#") && this.langPattern.matcher(link).find()) { englishBuilder.append(text); otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG); + } else if (link.equals("plural")) { + englishBuilder.append(englishTokenizer.wikiLinkText()); } else { - LOG.warning("Special link: " + englishTokenizer.token()); - // TODO: something here... + //LOG.warning("Special link: " + englishTokenizer.token()); + englishBuilder.append(englishTokenizer.wikiLinkText()); } } else { // link == null @@ -612,14 +617,15 @@ public class EnWiktionaryXmlParser { } } else if (englishTokenizer.isFunction()) { final String name = englishTokenizer.functionName(); - if (name.contains(" conjugation of ") || - name.contains(" form of ") || - name.contains(" feminine of ") || - name.contains(" plural of ")) { + if (name.contains("conjugation of ") || + name.contains("form of ") || + name.contains("feminine of ") || + name.contains("plural of ")) { // Ignore these in the index, they're really annoying.... englishBuilder.append(englishTokenizer.token()); } else { - LOG.warning("Unexpected function: " + englishTokenizer.token()); + englishBuilder.append(englishTokenizer.token()); +// LOG.warning("Unexpected function: " + englishTokenizer.token()); } } else { if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) { -- 2.43.0