From 297e7670b0c1487cdddb82dd2259f902d4ed80ae Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Sun, 15 Jan 2012 16:08:07 -0800 Subject: [PATCH] Changing the way dictionaries are indexed (listed), new type of TokenRow (to distinguish major from minor entries). --- .../engine/CheckDictionariesMain.java | 73 ++++++++++++++++++ .../dictionary/engine/DictionaryBuilder.java | 4 +- .../engine/DictionaryBuilderMain.java | 47 ++++-------- .../dictionary/engine/IndexBuilder.java | 20 +++-- .../dictionary/engine/LanguageTest.java | 8 +- .../dictionary/engine/WiktionarySplitter.java | 4 +- .../enwiktionary/EnWiktionaryLangs.java | 74 +++++++++++++++++++ todo.txt | 2 + 8 files changed, 184 insertions(+), 48 deletions(-) create mode 100644 src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java create mode 100644 src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java diff --git a/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java new file mode 100644 index 0000000..97cfeef --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/CheckDictionariesMain.java @@ -0,0 +1,73 @@ +package com.hughes.android.dictionary.engine; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.RandomAccessFile; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import com.hughes.android.dictionary.DictionaryInfo; +import com.hughes.android.dictionary.engine.Index.IndexEntry; + + +public class CheckDictionariesMain { + + public static void main(String[] args) throws IOException { + final File dictDir = new File(DictionaryBuilderMain.OUTPUTS); + + final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt")); + dictionaryInfoOut.println("# 
LANG_1\t%LANG_2\tFILENAME\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2"); + + final File[] files = dictDir.listFiles(); + Arrays.sort(files); + for (final File dictFile : files) { + if (!dictFile.getName().endsWith("quickdic")) { + continue; + } + System.out.println(dictFile.getPath()); + + final DictionaryInfo dictionaryInfo = new DictionaryInfo(); + + final RandomAccessFile raf = new RandomAccessFile(dictFile, "r"); + final Dictionary dict = new Dictionary(raf); + + dictionaryInfo.uncompressedFilename = dictFile.getName(); + dictionaryInfo.uncompressedSize = dictFile.length(); + + // Print it. + final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text")); + final List sorted = new ArrayList(dict.pairEntries); + Collections.sort(sorted); + for (final PairEntry pairEntry : sorted) { + textOut.println(pairEntry.getRawText(false)); + } + textOut.close(); + + // Find the stats. + System.out.println("Stats..."); + for (int i = 0; i < 2; ++i) { + dictionaryInfo.langIsos[i] = dict.indices.get(i).sortLanguage.getIsoCode(); + final Index index = dict.indices.get(i); + for (final IndexEntry indexEntry : index.sortedIndexEntries) { + final TokenRow tokenRow = (TokenRow) index.rows.get(indexEntry.startRow); + dictionaryInfo.allTokenCounts[i]++; + if (tokenRow.hasMainEntry) { + dictionaryInfo.mainTokenCounts[i]++; + } + } + } + + raf.close(); + + dictionaryInfoOut.println(dictionaryInfo.toTabSeparatedString()); + dictionaryInfoOut.flush(); + System.out.println(dictionaryInfo.toTabSeparatedString() + "\n"); + } + + dictionaryInfoOut.close(); + } + +} diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index a3cc7c0..2db5721 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -43,8 +43,8 @@ public class DictionaryBuilder { public 
DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set lang1Stoplist, final Set lang2Stoplist) { dictionary = new Dictionary(dictInfo); - indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, lang1Stoplist, false)); - indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, lang2Stoplist, true)); + indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false)); + indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true)); } void build() { diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 175b7a2..72ea6af 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -14,35 +14,33 @@ package com.hughes.android.dictionary.engine; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.RandomAccessFile; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; import java.util.LinkedHashMap; -import java.util.List; import java.util.Map; import junit.framework.TestCase; +import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs; + public class DictionaryBuilderMain extends TestCase { static final String INPUTS = "data/inputs/"; static final String STOPLISTS = "data/inputs/stoplists/"; - static final String OUTPUTS = "data/outputs/"; - + static final String OUTPUTS = "data/outputs/"; + + static final String 
VERSION_SUFFIX = "v002"; + + public static void main(final String[] args) throws Exception { - final Map isoToWikiName = new LinkedHashMap(Language.isoCodeToWikiName); + // Builds all the dictionaries it can, outputs list to a text file. + + final Map isoToWikiName = new LinkedHashMap(EnWiktionaryLangs.isoCodeToWikiName); isoToWikiName.remove("EN"); isoToWikiName.remove("DE"); final Map isoToDedication = new LinkedHashMap(); isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn."); - isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja."); + isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja."); isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau."); // German handled in file. isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge."); @@ -77,7 +75,7 @@ public class DictionaryBuilderMain extends TestCase { continue; } - final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso); + final String dictFile = String.format("%s/EN-%s_enwiktionary.%s.quickdic", OUTPUTS, foreignIso, VERSION_SUFFIX); System.out.println("building dictFile: " + dictFile); if (!isoToStoplist.containsKey(foreignIso)) { @@ -114,12 +112,9 @@ public class DictionaryBuilderMain extends TestCase { }); - // Print the entries for diffing. 
- printToText(dictFile); - } // foreignIso - final String dictFile = OUTPUTS + "DE-EN_chemnitz_enwiktionary.quickdic"; + final String dictFile = String.format("%s/DE-EN_chemnitz_enwiktionary.%s.quickdic", OUTPUTS, VERSION_SUFFIX); DictionaryBuilder.main(new String[] { "--dictOut=" + dictFile, "--lang1=DE", @@ -147,21 +142,7 @@ public class DictionaryBuilderMain extends TestCase { "--input3LangCodePattern=de", "--input3EnIndex=2", }); - printToText(dictFile); } - - static void printToText(final String dictFile) throws IOException { - final RandomAccessFile raf = new RandomAccessFile(new File(dictFile), "r"); - final Dictionary dict = new Dictionary(raf); - final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text")); - final List sorted = new ArrayList(dict.pairEntries); - Collections.sort(sorted); - for (final PairEntry pairEntry : sorted) { - textOut.println(pairEntry.getRawText(false)); - } - textOut.close(); - raf.close(); - } - + } diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 32a087f..6f28d30 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -50,7 +50,8 @@ public class IndexBuilder { tokenEntryDatas.clear(); final int indexIndex = index.sortedIndexEntries.size(); final int startRow = rows.size(); - rows.add(new TokenRow(indexIndex, rows.size(), index)); + + rows.add(new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry)); // System.out.println("Added TokenRow: " + rows.get(rows.size() - 1)); int numRows = 0; // System.out.println("TOKEN: " + tokenData.token); @@ -74,15 +75,15 @@ public class IndexBuilder { .normalizer().transliterate(tokenData.token), startRow, numRows)); } - final List entriesSortedByRows = new ArrayList(index.sortedIndexEntries); - Collections.sort(entriesSortedByRows, new Comparator() { + final List entriesSortedByNumRows = new 
ArrayList(index.sortedIndexEntries); + Collections.sort(entriesSortedByNumRows, new Comparator() { @Override public int compare(IndexEntry object1, IndexEntry object2) { return object2.numRows - object1.numRows; }}); System.out.println("Most common tokens:"); - for (int i = 0; i < 50 && i < entriesSortedByRows.size(); ++i) { - System.out.println(" " + entriesSortedByRows.get(i)); + for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) { + System.out.println(" " + entriesSortedByNumRows.get(i)); } } @@ -90,6 +91,7 @@ public class IndexBuilder { final String token; final Map> typeToEntries = new EnumMap>(EntryTypeName.class); + boolean hasMainEntry = false; TokenData(final String token) { assert token.equals(token.trim()); @@ -110,6 +112,9 @@ public class IndexBuilder { private List getOrCreateEntries(final String token, final EntryTypeName entryTypeName) { final TokenData tokenData = getOrCreateTokenData(token); List entries = tokenData.typeToEntries.get(entryTypeName); + if (entryTypeName.overridesStopList) { + tokenData.hasMainEntry = true; + } if (entries == null) { entries = new ArrayList(); tokenData.typeToEntries.put(entryTypeName, entries); @@ -124,8 +129,9 @@ public class IndexBuilder { } assert indexedEntry != null; for (final String token : tokens) { - if (entryTypeName.overridesStopList || !stoplist.contains(token)) - getOrCreateEntries(token, entryTypeName).add(indexedEntry); + if (entryTypeName.overridesStopList || !stoplist.contains(token)) { + getOrCreateEntries(token, entryTypeName).add(indexedEntry); + } } } diff --git a/src/com/hughes/android/dictionary/engine/LanguageTest.java b/src/com/hughes/android/dictionary/engine/LanguageTest.java index 2d9b6a0..0b7b041 100644 --- a/src/com/hughes/android/dictionary/engine/LanguageTest.java +++ b/src/com/hughes/android/dictionary/engine/LanguageTest.java @@ -26,8 +26,6 @@ import com.ibm.icu.text.Transliterator; public class LanguageTest extends TestCase { public void testGermanSort() { - 
System.out.println(Language.isoCodeToWikiName.values()); - final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD); assertEquals("aüääss", normalizer.transform("aueAeAEß")); final List words = Arrays.asList( @@ -108,7 +106,7 @@ public class LanguageTest extends TestCase { public void testLanguage() { assertEquals(Language.de, Language.lookup("de")); assertEquals(Language.en, Language.lookup("en")); - assertEquals("es", Language.lookup("es").getSymbol()); + assertEquals("es", Language.lookup("es").getIsoCode()); } public void testTextNorm() { @@ -160,8 +158,8 @@ public class LanguageTest extends TestCase { // These don't seem quite right.... assertEquals("haswb", transliterator.transliterate("حاسوب")); assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر")); - } - + assertEquals("{\u200eكمبيوتر\u200e}", Language.fixBidiText("{كمبيوتر}")); + } } diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 2e732f0..c05cbb0 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -32,6 +32,8 @@ import javax.xml.parsers.SAXParserFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; +import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs; + public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-20111224-pages-articles.xml"; @@ -80,7 +82,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } if (selectors.isEmpty()) { - for (final Map.Entry entry : Language.isoCodeToWikiName.entrySet()) { + for (final Map.Entry entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) { selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", 
entry.getKey()), entry.getValue())); } } diff --git a/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java new file mode 100644 index 0000000..80f47ed --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/enwiktionary/EnWiktionaryLangs.java @@ -0,0 +1,74 @@ +package com.hughes.android.dictionary.parser.enwiktionary; + +import java.util.LinkedHashMap; +import java.util.Map; + +public class EnWiktionaryLangs { + + public static final Map isoCodeToWikiName = new LinkedHashMap(); + static { + isoCodeToWikiName.put("AF", "Afrikaans"); + isoCodeToWikiName.put("SQ", "Albanian"); + isoCodeToWikiName.put("AR", "Arabic"); + isoCodeToWikiName.put("HY", "Armenian"); + isoCodeToWikiName.put("BE", "Belarusian"); + isoCodeToWikiName.put("BN", "Bengali"); + isoCodeToWikiName.put("BS", "Bosnian"); + isoCodeToWikiName.put("BG", "Bulgarian"); + isoCodeToWikiName.put("CA", "Catalan"); + isoCodeToWikiName.put("HR", "Croatian"); + isoCodeToWikiName.put("CS", "Czech"); + isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese"); + isoCodeToWikiName.put("DA", "Danish"); + isoCodeToWikiName.put("NL", "Dutch"); + isoCodeToWikiName.put("EN", "English"); + isoCodeToWikiName.put("EO", "Esperanto"); + isoCodeToWikiName.put("ET", "Estonian"); + isoCodeToWikiName.put("FI", "Finnish"); + isoCodeToWikiName.put("FR", "French"); + isoCodeToWikiName.put("DE", "German"); + isoCodeToWikiName.put("EL", "Greek"); + isoCodeToWikiName.put("haw", "Hawaiian"); + isoCodeToWikiName.put("HE", "Hebrew"); + isoCodeToWikiName.put("HI", "Hindi"); + isoCodeToWikiName.put("HU", "Hungarian"); + isoCodeToWikiName.put("IS", "Icelandic"); + isoCodeToWikiName.put("ID", "Indonesian"); + isoCodeToWikiName.put("GA", "Irish"); + isoCodeToWikiName.put("IT", "Italian"); + isoCodeToWikiName.put("LA", "Latin"); + isoCodeToWikiName.put("LV", "Latvian"); + isoCodeToWikiName.put("LT", "Lithuanian"); + 
isoCodeToWikiName.put("JA", "Japanese"); + isoCodeToWikiName.put("KO", "Korean"); + isoCodeToWikiName.put("KU", "Kurdish"); + isoCodeToWikiName.put("MS", "Malay"); + isoCodeToWikiName.put("MI", "Maori"); + isoCodeToWikiName.put("MN", "Mongolian"); + isoCodeToWikiName.put("NE", "Nepali"); + isoCodeToWikiName.put("NO", "Norwegian"); + isoCodeToWikiName.put("FA", "Persian"); + isoCodeToWikiName.put("PL", "Polish"); + isoCodeToWikiName.put("PT", "Portuguese"); + isoCodeToWikiName.put("PA", "Punjabi"); + isoCodeToWikiName.put("RO", "Romanian"); + isoCodeToWikiName.put("RU", "Russian"); + isoCodeToWikiName.put("SA", "Sanskrit"); + isoCodeToWikiName.put("SR", "Serbian"); + isoCodeToWikiName.put("SK", "Slovak"); + isoCodeToWikiName.put("SO", "Somali"); + isoCodeToWikiName.put("ES", "Spanish"); + isoCodeToWikiName.put("SW", "Swahili"); + isoCodeToWikiName.put("SV", "Swedish"); + isoCodeToWikiName.put("TG", "Tajik"); + isoCodeToWikiName.put("TH", "Thai"); + isoCodeToWikiName.put("BO", "Tibetan"); + isoCodeToWikiName.put("TR", "Turkish"); + isoCodeToWikiName.put("UK", "Ukrainian"); + isoCodeToWikiName.put("VI", "Vietnamese"); + isoCodeToWikiName.put("CY", "Welsh"); + isoCodeToWikiName.put("YI", "Yiddish"); + isoCodeToWikiName.put("ZU", "Zulu"); + } + +} diff --git a/todo.txt b/todo.txt index 2016c55..5f1667c 100644 --- a/todo.txt +++ b/todo.txt @@ -1,10 +1,12 @@ For next release: +arabic UI fix "form of" to bottom handle examples like "asdf (asdf)" random word jump multiword find. dictionary update. ???italian verbs +dictionary builder generates text file with list of dictionaries built, sizes, timestamps, token counts, etc. pronunciation synonyms -- 2.43.0