package com.hughes.android.dictionary.engine;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.RandomAccessFile;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import junit.framework.TestCase;
+import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
+import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
+
public class DictionaryBuilderMain extends TestCase {
- static final String INPUTS = "../DictionaryData/inputs/";
- static final String STOPLISTS = "../DictionaryData/inputs/stoplists/";
- static final String OUTPUTS = "../DictionaryData/outputs/";
+ static final String INPUTS = "data/inputs/";
+ static final String STOPLISTS = "data/inputs/stoplists/";
+ static final String OUTPUTS = "data/outputs/";
public static void main(final String[] args) throws Exception {
- final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(Language.isoCodeToWikiName);
+ // Builds all the dictionaries it can and outputs the list to a text file.
+
+ final Map<String,String> isoToWikiName = new LinkedHashMap<String, String>(WiktionaryLangs.isoCodeToWikiName);
isoToWikiName.remove("EN");
isoToWikiName.remove("DE");
final Map<String,String> isoToDedication = new LinkedHashMap<String, String>();
isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariƫtte Horn.");
- isoToDedication.put("HR", "Croation dictionary dedicated to Ines Viskic and Miro Kresonja.");
+ isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
// German handled in file.
isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
// HACK: The missing "e" prevents a full match, causing "Cantonese" to be appended to the entries.
isoToRegex.put("ZH", "Chinese|Mandarin|Cantones");
+
+ // Build the non-EN language-pair dictionaries first.
+
+ final String[][] nonEnPairs = new String[][] {
+
+ /*
+ {"AR", "DE" },
+ {"AR", "ES" },
+ {"AR", "FR" },
+ {"AR", "HE" },
+ {"AR", "IT" },
+ {"AR", "JA" },
+ {"AR", "RU" },
+ {"AR", "TR" }, // Turkish
+ {"AR", "ZH" },
+
+ {"DE", "AR" },
+ {"DE", "FR" },
+ {"DE", "CA" }, // Catalan
+ {"DE", "CS" }, // Czech
+ {"DE", "EO" }, // Esperanto
+ {"DE", "ES" },
+ {"DE", "FR" },
+ {"DE", "HE" },
+ {"DE", "HU" }, // Hungarian
+ {"DE", "IT" },
+ {"DE", "JA" },
+ {"DE", "LA" }, // Latin
+ {"DE", "PL" }, // Polish
+ {"DE", "RU" },
+ {"DE", "SV" }, // Swedish
+ {"DE", "TR" }, // Turkish
+ {"DE", "ZH" },
+
+
+ {"FR", "BG" }, // Bulgarian
+ {"FR", "CS" }, // Czech
+ {"FR", "DE" },
+ {"FR", "ES" },
+ {"FR", "IT" },
+ {"FR", "JA" },
+ {"FR", "LA" },
+ {"FR", "NL" }, // Dutch
+ {"FR", "RU" },
+ {"FR", "TR" }, // Turkish
+ {"FR", "ZH" },
+
+ {"IT", "DE" },
+ {"IT", "EL" }, // Greek
+ {"IT", "ES" },
+ {"IT", "FR" },
+ {"IT", "HU" },
+ {"IT", "JA" },
+ {"IT", "LA" }, // Latin
+ {"IT", "LV" }, // Latvian
+ {"IT", "NL" },
+ {"IT", "PL" },
+ {"IT", "RU" },
+ {"IT", "SV" },
+ {"IT", "TR" }, // Turkish
+ {"IT", "ZH" },
+
+ {"JA", "ZH" },
+ {"JA", "AR" },
+ {"JA", "KO" },
+
+ {"ZH", "AR" },
+ {"ZH", "DE" },
+ {"ZH", "ES" },
+ {"ZH", "FR" },
+ {"ZH", "IT" },
+ {"ZH", "KO" },
+
+
+ {"NO", "SV" },
+ {"NO", "FI" },
+ {"FI", "SV" },
+
+ {"PL", "FR" }, // Polish
+ {"PL", "RU" }, // Polish
+ {"PL", "HU" }, // Polish
+ {"PL", "ES" }, // Polish
+
+ */
+
+
+ };
+
+ final Set<List<String>> done = new LinkedHashSet<List<String>>();
+ for (final String[] pair : nonEnPairs) {
+ Arrays.sort(pair);
+ final List<String> pairList = Arrays.asList(pair);
+ if (done.contains(pairList)) {
+ continue;
+ }
+ done.add(pairList);
+
+ final String lang1 = pair[0];
+ final String lang2 = pair[1];
+
+ final String dictFile = String.format("%s/%s-%s_enwiktionary_BETA.quickdic",
+ OUTPUTS, lang1, lang2);
+ System.out.println("building dictFile: " + dictFile);
+
+ if (!isoToStoplist.containsKey(lang1)) {
+ isoToStoplist.put(lang1, "empty.txt");
+ }
+ if (!isoToStoplist.containsKey(lang2)) {
+ isoToStoplist.put(lang2, "empty.txt");
+ }
+
+ DictionaryBuilder.main(new String[] {
+ String.format("--dictOut=%s", dictFile),
+ String.format("--lang1=%s", lang1),
+ String.format("--lang2=%s", lang2),
+ String.format("--lang1Stoplist=%s", STOPLISTS + isoToStoplist.get(lang1)),
+ String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(lang2)),
+ String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary.", lang1, lang2),
+
+ String.format("--input2=%swikiSplit/en/EN.data", INPUTS),
+ String.format("--input2Name=BETA!enwiktionary.%s-%s", lang1, lang2),
+ String.format("--input2Format=%s", EnTranslationToTranslationParser.NAME),
+ String.format("--input2LangPattern1=%s", lang1),
+ String.format("--input2LangPattern2=%s", lang2),
+ });
+ }
+ if (1==1) {
+ //return;
+ }
+
+
+ // Now build the EN-to-foreign-language dictionaries.
+
// isoToWikiName.keySet().retainAll(Arrays.asList("UK", "HR", "FI"));
//isoToWikiName.clear();
- boolean go = true;
+ boolean go = false;
for (final String foreignIso : isoToWikiName.keySet()) {
- if (foreignIso.equals("SV")) {
+ if (foreignIso.equals("SL")) {
go = true;
}
if (!go) {
continue;
}
- final String dictFile = String.format(OUTPUTS + "/EN-%s_enwiktionary.quickdic", foreignIso);
+ final String dictFile = String.format("%s/EN-%s_enwiktionary.quickdic", OUTPUTS, foreignIso);
System.out.println("building dictFile: " + dictFile);
if (!isoToStoplist.containsKey(foreignIso)) {
String.format("--lang2=%s", foreignIso),
String.format("--lang1Stoplist=%s", STOPLISTS + isoToStoplist.get("EN")),
String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)),
- String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary. %s", foreignIso, isoToDedication.get(foreignIso)),
+ String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.\n\n%s", foreignIso, isoToDedication.get(foreignIso)),
- "--input2=" + INPUTS + "enWikiSplit/" + foreignIso + ".data",
+ "--input2=" + INPUTS + "wikiSplit/en/" + foreignIso + ".data",
"--input2Name=enwiktionary." + foreignIso,
"--input2Format=enwiktionary",
+ "--input2WiktionaryType=EnForeign",
"--input2LangPattern=" + isoToRegex.get(foreignIso),
"--input2LangCodePattern=" + foreignIso.toLowerCase(),
"--input2EnIndex=1",
- "--input3=" + INPUTS + "enWikiSplit/EN.data",
+ "--input3=" + INPUTS + "wikiSplit/en/EN.data",
"--input3Name=enwiktionary.english",
"--input3Format=enwiktionary",
+ "--input3WiktionaryType=EnToTranslation",
"--input3LangPattern=" + isoToRegex.get(foreignIso),
"--input3LangCodePattern=" + foreignIso.toLowerCase(),
"--input3EnIndex=1",
});
- // Print the entries for diffing.
- printToText(dictFile);
-
} // foreignIso
- final String dictFile = OUTPUTS + "DE-EN_chemnitz_enwiktionary.quickdic";
+ // Now handle the German-English special case.
+
+ final String dictFile = String.format("%s/DE-EN_chemnitz_enwiktionary.quickdic", OUTPUTS);
DictionaryBuilder.main(new String[] {
"--dictOut=" + dictFile,
"--lang1=DE",
"--input4Charset=UTF8",
"--input4Format=chemnitz",
- "--input2=" + INPUTS + "enWikiSplit/DE.data",
+ "--input2=" + INPUTS + "wikiSplit/en/DE.data",
"--input2Name=enwiktionary.DE",
"--input2Format=enwiktionary",
+ "--input2WiktionaryType=EnForeign",
"--input2LangPattern=German",
"--input2LangCodePattern=de",
"--input2EnIndex=2",
- "--input3=" + INPUTS + "enWikiSplit/EN.data",
+ "--input3=" + INPUTS + "wikiSplit/en/EN.data",
"--input3Name=enwiktionary.english",
"--input3Format=enwiktionary",
+ "--input3WiktionaryType=EnToTranslation",
"--input3LangPattern=German",
"--input3LangCodePattern=de",
"--input3EnIndex=2",
});
- printToText(dictFile);
}
-
- static void printToText(final String dictFile) throws IOException {
- final RandomAccessFile raf = new RandomAccessFile(new File(dictFile), "r");
- final Dictionary dict = new Dictionary(raf);
- final PrintWriter textOut = new PrintWriter(new File(dictFile + ".text"));
- final List<PairEntry> sorted = new ArrayList<PairEntry>(dict.pairEntries);
- Collections.sort(sorted);
- for (final PairEntry pairEntry : sorted) {
- textOut.println(pairEntry.getRawText(false));
- }
- textOut.close();
- raf.close();
- }
-
+
}