(to distinguish major from minor entries).
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.RandomAccessFile;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import com.hughes.android.dictionary.DictionaryInfo;
+import com.hughes.android.dictionary.engine.Index.IndexEntry;
+
+
+/**
+ * Developer tool that walks the dictionary output directory and, for every
+ * file whose name ends in "quickdic", writes a sorted plain-text dump of its
+ * pair entries next to it (for diffing) and appends one tab-separated line of
+ * statistics to dictionary_info.txt.
+ */
+public class CheckDictionariesMain {
+
+  /** Charset used for every text file written here, so output does not depend on the platform default. */
+  private static final String OUTPUT_CHARSET = "UTF-8";
+
+  /**
+   * Processes all dictionaries found in {@link DictionaryBuilderMain#OUTPUTS}.
+   *
+   * @param args ignored
+   * @throws IOException if the output directory cannot be listed, or if any
+   *         dictionary or output file cannot be read or written
+   */
+  public static void main(String[] args) throws IOException {
+    final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
+
+    // listFiles() returns null (it does not throw) when the path is not a
+    // readable directory; fail with a clear message instead of an NPE in sort().
+    final File[] files = dictDir.listFiles();
+    if (files == null) {
+      throw new IOException("Cannot list dictionary directory: " + dictDir.getPath());
+    }
+    Arrays.sort(files);
+
+    final PrintWriter dictionaryInfoOut = new PrintWriter(new File("../Dictionary/res/raw/dictionary_info.txt"), OUTPUT_CHARSET);
+    try {
+      // NOTE(review): the '%' before LANG_2 looks like a typo, but it is part of
+      // the emitted header line, so it is kept as-is -- confirm against the reader.
+      dictionaryInfoOut.println("# LANG_1\t%LANG_2\tFILENAME\tFILESIZE\tNUM_MAIN_WORDS_1\tNUM_MAIN_WORDS_2\tNUM_ALL_WORDS_1\tNUM_ALL_WORDS_2");
+
+      for (final File dictFile : files) {
+        if (!dictFile.getName().endsWith("quickdic")) {
+          continue;
+        }
+        System.out.println(dictFile.getPath());
+
+        final String infoLine = describe(dictFile).toTabSeparatedString();
+        dictionaryInfoOut.println(infoLine);
+        // Flush per dictionary so finished lines survive a failure on a later file.
+        dictionaryInfoOut.flush();
+        System.out.println(infoLine + "\n");
+      }
+    } finally {
+      dictionaryInfoOut.close();
+    }
+  }
+
+  /**
+   * Opens one dictionary file, writes its text dump to "&lt;dictFile&gt;.text" and
+   * returns the filled-in info record. The file handle is closed even on failure.
+   */
+  private static DictionaryInfo describe(final File dictFile) throws IOException {
+    final DictionaryInfo dictionaryInfo = new DictionaryInfo();
+    dictionaryInfo.uncompressedFilename = dictFile.getName();
+    dictionaryInfo.uncompressedSize = dictFile.length();
+
+    final RandomAccessFile raf = new RandomAccessFile(dictFile, "r");
+    try {
+      final Dictionary dict = new Dictionary(raf);
+
+      // Print it.
+      printToText(dict, new File(dictFile + ".text"));
+
+      // Find the stats.
+      System.out.println("Stats...");
+      countTokens(dict, dictionaryInfo);
+    } finally {
+      raf.close();
+    }
+    return dictionaryInfo;
+  }
+
+  /** Writes the raw text of every pair entry, in sorted order, one per line. */
+  private static void printToText(final Dictionary dict, final File textFile) throws IOException {
+    // Sort a copy so the dictionary's own entry list is left untouched.
+    final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
+    Collections.sort(sorted);
+
+    final PrintWriter textOut = new PrintWriter(textFile, OUTPUT_CHARSET);
+    try {
+      for (final PairEntry pairEntry : sorted) {
+        textOut.println(pairEntry.getRawText(false));
+      }
+    } finally {
+      textOut.close();
+    }
+  }
+
+  /**
+   * Records the ISO code and token counts of the first two indices: every index
+   * entry counts as a token, and those whose starting TokenRow has
+   * hasMainEntry set also count as main tokens.
+   */
+  private static void countTokens(final Dictionary dict, final DictionaryInfo dictionaryInfo) {
+    for (int i = 0; i < 2; ++i) {
+      final Index index = dict.indices.get(i);
+      dictionaryInfo.langIsos[i] = index.sortLanguage.getIsoCode();
+      for (final IndexEntry indexEntry : index.sortedIndexEntries) {
+        // The row at an entry's startRow is cast to TokenRow, as in the original code.
+        final TokenRow tokenRow = (TokenRow) index.rows.get(indexEntry.startRow);
+        dictionaryInfo.allTokenCounts[i]++;
+        if (tokenRow.hasMainEntry) {
+          dictionaryInfo.mainTokenCounts[i]++;
+        }
+      }
+    }
+  }
+
+}
public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2, final Set<String> lang1Stoplist, final Set<String> lang2Stoplist) {
dictionary = new Dictionary(dictInfo);
- indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, lang1Stoplist, false));
- indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, lang2Stoplist, true));
+ indexBuilders.add(new IndexBuilder(this, lang0.getIsoCode(), lang0.getIsoCode() + "->" + lang1.getIsoCode(), lang0, normalizerRules1, lang1Stoplist, false));
+ indexBuilders.add(new IndexBuilder(this, lang1.getIsoCode(), lang1.getIsoCode() + "->" + lang0.getIsoCode(), lang1, normalizerRules2, lang2Stoplist, true));
}
void build() {
package com.hughes.android.dictionary.engine;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.RandomAccessFile;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
import java.util.LinkedHashMap;
-import java.util.List;
import java.util.Map;
import junit.framework.TestCase;
+import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+
public class DictionaryBuilderMain extends TestCase {
static final String INPUTS = "data/inputs/";
static final String STOPLISTS = "data/inputs/stoplists/";
- static final String OUTPUTS = "data/outputs/";
-
+ static final String OUTPUTS = "data/outputs/";
+
+ static final String VERSION_SUFFIX = "v002";
+
+
public static void main(final String[] args) throws Exception {
- final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(Language.isoCodeToWikiName);
+ // Builds all the dictionaries it can, outputs list to a text file.
+
+ final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(EnWiktionaryLangs.isoCodeToWikiName);
isoToWikiName.remove("EN");
isoToWikiName.remove("DE");
final Map<String,String> isoToDedication = new LinkedHashMap<String, String>();
isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
- isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja.");
+ isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
// German handled in file.
isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
continue;
}
- final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso);
+ final String dictFile = String.format("%s/EN-%s_enwiktionary.%s.quickdic", OUTPUTS, foreignIso, VERSION_SUFFIX);
System.out.println("building dictFile: " + dictFile);
if (!isoToStoplist.containsKey(foreignIso)) {
});
- // Print the entries for diffing.
- printToText(dictFile);
-
} // foreignIso
- final String dictFile = OUTPUTS + "DE-EN_chemnitz_enwiktionary.quickdic";
+ final String dictFile = String.format("%s/DE-EN_chemnitz_enwiktionary.%s.quickdic", OUTPUTS, VERSION_SUFFIX);
DictionaryBuilder.main(new String[] {
"--dictOut=" + dictFile,
"--lang1=DE",
"--input3LangCodePattern=de",
"--input3EnIndex=2",
});
- printToText(dictFile);
}
-
- static void printToText(final String dictFile) throws IOException {
- final RandomAccessFile raf = new RandomAccessFile(new File(dictFile), "r");
- final Dictionary dict = new Dictionary(raf);
- final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text"));
- final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
- Collections.sort(sorted);
- for (final PairEntry pairEntry : sorted) {
- textOut.println(pairEntry.getRawText(false));
- }
- textOut.close();
- raf.close();
- }
-
+
}
tokenEntryDatas.clear();
final int indexIndex = index.sortedIndexEntries.size();
final int startRow = rows.size();
- rows.add(new TokenRow(indexIndex, rows.size(), index));
+
+ rows.add(new TokenRow(indexIndex, rows.size(), index, tokenData.hasMainEntry));
// System.out.println("Added TokenRow: " + rows.get(rows.size() - 1));
int numRows = 0;
// System.out.println("TOKEN: " + tokenData.token);
.normalizer().transliterate(tokenData.token), startRow, numRows));
}
- final List<IndexEntry> entriesSortedByRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
- Collections.sort(entriesSortedByRows, new Comparator<IndexEntry>() {
+ final List<IndexEntry> entriesSortedByNumRows = new ArrayList<IndexEntry>(index.sortedIndexEntries);
+ Collections.sort(entriesSortedByNumRows, new Comparator<IndexEntry>() {
@Override
public int compare(IndexEntry object1, IndexEntry object2) {
return object2.numRows - object1.numRows;
}});
System.out.println("Most common tokens:");
- for (int i = 0; i < 50 && i < entriesSortedByRows.size(); ++i) {
- System.out.println(" " + entriesSortedByRows.get(i));
+ for (int i = 0; i < 50 && i < entriesSortedByNumRows.size(); ++i) {
+ System.out.println(" " + entriesSortedByNumRows.get(i));
}
}
final String token;
final Map<EntryTypeName, List<IndexedEntry>> typeToEntries = new EnumMap<EntryTypeName, List<IndexedEntry>>(EntryTypeName.class);
+ boolean hasMainEntry = false;
TokenData(final String token) {
assert token.equals(token.trim());
private List<IndexedEntry> getOrCreateEntries(final String token, final EntryTypeName entryTypeName) {
final TokenData tokenData = getOrCreateTokenData(token);
List<IndexedEntry> entries = tokenData.typeToEntries.get(entryTypeName);
+ if (entryTypeName.overridesStopList) {
+ tokenData.hasMainEntry = true;
+ }
if (entries == null) {
entries = new ArrayList<IndexedEntry>();
tokenData.typeToEntries.put(entryTypeName, entries);
}
assert indexedEntry != null;
for (final String token : tokens) {
- if (entryTypeName.overridesStopList || !stoplist.contains(token))
- getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+ if (entryTypeName.overridesStopList || !stoplist.contains(token)) {
+ getOrCreateEntries(token, entryTypeName).add(indexedEntry);
+ }
}
}
public class LanguageTest extends TestCase {
public void testGermanSort() {
- System.out.println(Language.isoCodeToWikiName.values());
-
final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
assertEquals("aüääss", normalizer.transform("aueAeAEß"));
final List<String> words = Arrays.asList(
public void testLanguage() {
assertEquals(Language.de, Language.lookup("de"));
assertEquals(Language.en, Language.lookup("en"));
- assertEquals("es", Language.lookup("es").getSymbol());
+ assertEquals("es", Language.lookup("es").getIsoCode());
}
public void testTextNorm() {
// These don't seem quite right....
assertEquals("haswb", transliterator.transliterate("حاسوب"));
assertEquals("kmbywtr", transliterator.transliterate("كمبيوتر"));
- }
-
+ assertEquals("{\u200eكمبيوتر\u200e}", Language.fixBidiText("{كمبيوتر}"));
+ }
}
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
+import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs;
+
public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-20111224-pages-articles.xml";
}
if (selectors.isEmpty()) {
- for (final Map.Entry<String, String> entry : Language.isoCodeToWikiName.entrySet()) {
+ for (final Map.Entry<String, String> entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) {
selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue()));
}
}
--- /dev/null
+package com.hughes.android.dictionary.parser.enwiktionary;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Languages extracted from the English Wiktionary, keyed by language code.
+ */
+public class EnWiktionaryLangs {
+
+  /**
+   * Maps a language code to the language name used on the English Wiktionary.
+   * A LinkedHashMap is used, so iteration follows the insertion order below.
+   * A value may list several names separated by '|' (see "ZH").
+   *
+   * NOTE(review): "haw" is the only lower-case, three-letter key (Hawaiian has
+   * no two-letter ISO 639-1 code); confirm that lookups use this exact case.
+   */
+  public static final Map<String,String> isoCodeToWikiName = new LinkedHashMap<String,String>();
+  static {
+    isoCodeToWikiName.put("AF", "Afrikaans");
+    isoCodeToWikiName.put("SQ", "Albanian");
+    isoCodeToWikiName.put("AR", "Arabic");
+    isoCodeToWikiName.put("HY", "Armenian");
+    isoCodeToWikiName.put("BE", "Belarusian");
+    isoCodeToWikiName.put("BN", "Bengali");
+    isoCodeToWikiName.put("BS", "Bosnian");
+    isoCodeToWikiName.put("BG", "Bulgarian");
+    isoCodeToWikiName.put("CA", "Catalan");
+    isoCodeToWikiName.put("HR", "Croatian");
+    isoCodeToWikiName.put("CS", "Czech");
+    isoCodeToWikiName.put("ZH", "Chinese|Mandarin|Cantonese");
+    isoCodeToWikiName.put("DA", "Danish");
+    isoCodeToWikiName.put("NL", "Dutch");
+    isoCodeToWikiName.put("EN", "English");
+    isoCodeToWikiName.put("EO", "Esperanto");
+    isoCodeToWikiName.put("ET", "Estonian");
+    isoCodeToWikiName.put("FI", "Finnish");
+    isoCodeToWikiName.put("FR", "French");
+    isoCodeToWikiName.put("DE", "German");
+    isoCodeToWikiName.put("EL", "Greek");
+    isoCodeToWikiName.put("haw", "Hawaiian");
+    isoCodeToWikiName.put("HE", "Hebrew");
+    isoCodeToWikiName.put("HI", "Hindi");
+    isoCodeToWikiName.put("HU", "Hungarian");
+    isoCodeToWikiName.put("IS", "Icelandic");
+    isoCodeToWikiName.put("ID", "Indonesian");
+    isoCodeToWikiName.put("GA", "Irish");
+    isoCodeToWikiName.put("IT", "Italian");
+    isoCodeToWikiName.put("LA", "Latin");
+    isoCodeToWikiName.put("LV", "Latvian");
+    isoCodeToWikiName.put("LT", "Lithuanian");
+    isoCodeToWikiName.put("JA", "Japanese");
+    isoCodeToWikiName.put("KO", "Korean");
+    isoCodeToWikiName.put("KU", "Kurdish");
+    isoCodeToWikiName.put("MS", "Malay");
+    isoCodeToWikiName.put("MI", "Maori");
+    isoCodeToWikiName.put("MN", "Mongolian");
+    isoCodeToWikiName.put("NE", "Nepali");
+    isoCodeToWikiName.put("NO", "Norwegian");
+    isoCodeToWikiName.put("FA", "Persian");
+    isoCodeToWikiName.put("PL", "Polish");
+    isoCodeToWikiName.put("PT", "Portuguese");
+    isoCodeToWikiName.put("PA", "Punjabi");
+    isoCodeToWikiName.put("RO", "Romanian");
+    isoCodeToWikiName.put("RU", "Russian");
+    isoCodeToWikiName.put("SA", "Sanskrit");
+    isoCodeToWikiName.put("SR", "Serbian");
+    isoCodeToWikiName.put("SK", "Slovak");
+    isoCodeToWikiName.put("SO", "Somali");
+    isoCodeToWikiName.put("ES", "Spanish");
+    isoCodeToWikiName.put("SW", "Swahili");
+    isoCodeToWikiName.put("SV", "Swedish");
+    isoCodeToWikiName.put("TG", "Tajik");
+    isoCodeToWikiName.put("TH", "Thai");
+    isoCodeToWikiName.put("BO", "Tibetan");
+    isoCodeToWikiName.put("TR", "Turkish");
+    isoCodeToWikiName.put("UK", "Ukrainian");
+    isoCodeToWikiName.put("VI", "Vietnamese");
+    // ISO 639-1 assigns "cy" (Cymraeg) to Welsh; the previous key "CI" is not Welsh's code.
+    isoCodeToWikiName.put("CY", "Welsh");
+    isoCodeToWikiName.put("YI", "Yiddish");
+    isoCodeToWikiName.put("ZU", "Zulu");
+  }
+
+}
For next release:
+Arabic UI fix
"form of" to bottom
handle examples like "asdf (asdf)"
random word jump
multiword find.
dictionary update.
???Italian verbs
+dictionary builder generates text file with list of dictionaries built, sizes, timestamps, token counts, etc.
pronunciation
synonyms