Source: http://dict.tu-chemnitz.de/
Thanks to Frank Richter.
And from:
-(EN)Wiktionary
+Wiktionary
import com.hughes.android.dictionary.DictionaryInfo;
import com.hughes.android.dictionary.DictionaryInfo.IndexInfo;
-import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
import com.hughes.util.CollectionUtil;
-import com.hughes.util.StringUtil;
import java.io.File;
import java.io.IOException;
public class CheckDictionariesMain {
static final String BASE_URL = "http://quickdic-dictionary.googlecode.com/files/";
- static final String VERSION_CODE = "v005";
+ static final String VERSION_CODE = "v006";
public static void main(String[] args) throws IOException {
final File dictDir = new File(DictionaryBuilderMain.OUTPUTS);
// Build the non EN ones.
static final String[][] nonEnPairs = new String[][] {
- /*
{"EN"},
{"DE"},
- {"IT"}, */
- // This one takes a really long time:
- // {"FR"},
+ {"IT"},
+ // This one takes a really long time, and the result is too big for code.google.com
+ //{"FR"},
// The 3 I use most:
{"IT", "EN" },
{"FA", "HY" }, // Persian, Armenian, by request.
{"FA", "SV" }, // Persian, Swedish, by request.
{"NL", "PL" }, // Dutch, Polish, by request.
+
};
static final Map<String,String> isoToDedication = new LinkedHashMap<String, String>();
static {
- isoToDedication.put("AF", "Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
- isoToDedication.put("HR", "Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
- isoToDedication.put("NL", "Dutch dictionary dedicated to Mike LeBeau.");
- // German handled in file.
- isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge.");
- isoToDedication.put("IT", "Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!");
- isoToDedication.put("KO", "Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!");
- isoToDedication.put("PT", "Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder.");
- isoToDedication.put("RO", "Romanian dictionary dedicated to Radu Teodorescu.");
- isoToDedication.put("RU", "Russian dictionary dedicated to Maxim Aronin--best friend always!.");
- isoToDedication.put("SR", "Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey.");
- isoToDedication.put("ES", "Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!");
- isoToDedication.put("SV", "Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!");
+ isoToDedication.put("AF", "Wiktionary-based Afrikaans dictionary dedicated to Heiko and Mariëtte Horn.");
+ isoToDedication.put("HR", "Wiktionary-based Croatian dictionary dedicated to Ines Viskic and Miro Kresonja.");
+ isoToDedication.put("NL", "Wiktionary-based Dutch dictionary dedicated to Mike LeBeau.");
+ isoToDedication.put("DE", "@data/inputs/de-en_dedication.txt");
+ isoToDedication.put("EL", "Wiktionary-based Greek dictionary dedicated to Noah Egge.");
+ isoToDedication.put("IT", "Wiktionary-based Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!");
+ isoToDedication.put("KO", "Wiktionary-based Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!");
+ isoToDedication.put("PT", "Wiktionary-based Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder.");
+ isoToDedication.put("RO", "Wiktionary-based Romanian dictionary dedicated to Radu Teodorescu.");
+ isoToDedication.put("RU", "Wiktionary-based Russian dictionary dedicated to Maxim Aronin--best friend always!.");
+ isoToDedication.put("SR", "Wiktionary-based Serbian dictionary dedicated to Filip Crnogorac--thanks for the honey.");
+ isoToDedication.put("ES", "Wiktionary-based Spanish dictionary made especially for Carolina Tropini! <3 XoXoXXXXX!");
+ isoToDedication.put("SV", "Wiktionary-based Swedish dictionary dedicated to Kajsa Palmblad--björn kramar!");
}
- private static String getDedication(String iso) {
- return isoToDedication.containsKey(iso) ? "\n\n" + isoToDedication.get(iso) : "";
+ private static String getEnDictionaryInfo(String iso) {
+ return isoToDedication.containsKey(iso) ? isoToDedication.get(iso) : String.format("Wiktionary-based %s dictionary.", iso);
}
static final Map<String,String> isoToStoplist = new LinkedHashMap<String, String>();
result.add(String.format("--lang1=%s", lang1));
result.add(String.format("--lang2=%s", lang2));
- result.add(String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.%s", foreignIso, getDedication(foreignIso)));
+ result.add(String.format("--dictInfo=%s", getEnDictionaryInfo(foreignIso)));
// Foreign section.
result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso));
final Set<List<String>> done = new LinkedHashSet<List<String>>();
+ boolean go = true;
for (final String[] pair : allPairs) {
Arrays.sort(pair);
final List<String> pairList = Arrays.asList(pair);
}
done.add(pairList);
- if (!pairList.contains("EN") && !pairList.contains("EL")) {
- //continue;
+// if (pairList.contains("EN") && pairList.contains("DE")) {
+// go = true;
+// } else {
+// go = false;
+// }
+
+ if (!go) {
+ continue;
}
DictionaryBuilder.main(getMainArgs(pair).toArray(new String[0]));
callback.onPlainText(tokenizer.token());
} else if (tokenizer.isMarkup()) {
callback.onMarkup(tokenizer);
- } else if (tokenizer.isWikiLink) {
+ } else if (tokenizer.isWikiLink()) {
callback.onWikiLink(tokenizer);
} else if (tokenizer.isNewline()) {
callback.onNewline(tokenizer);
if (firstUnescapedPipePos != -1) {
return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim());
}
- return trimNewlines(wikiText.substring(start + 2, end - 2).trim());
+ final int safeEnd = Math.max(start + 2, end - 2);
+ return trimNewlines(wikiText.substring(start + 2, safeEnd).trim());
}
public List<String> functionPositionArgs() {
import junit.framework.TestCase;
public class WikiTokenizerTest extends TestCase {
-
+
public void testWikiLink() {
String wikiText;
public void testFunction() {
String wikiText;
-
+
+ {
+ WikiTokenizer wt = new WikiTokenizer("'''Προστατευόμενη Ονομασία Προέλευσης''', \"Protected Designation of Origin\" {{");
+ while (wt.nextToken() != null) {
+ if (wt.isFunction()) {
+ assertEquals("", wt.functionName());
+ }
+ }
+ }
+
wikiText = "{{abc}}";
assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
} else if (wikiTokenizer.isPlainText()) {
// Unindexed!
foreignBuilder.append(wikiTokenizer.token());
-
- } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) {
+ } else if (wikiTokenizer.isHtml()) {
+ if (!wikiTokenizer.token().startsWith("<ref>")) {
+ foreignBuilder.append(wikiTokenizer.token());
+ }
+ } else if (wikiTokenizer.isMarkup() ||
+ wikiTokenizer.isNewline() ||
+ wikiTokenizer.isComment()) {
// Do nothing.
} else {
LOG.warning("Unexpected token: " + wikiTokenizer.token());
final Map<String, String> namedArgs,
final T parser,
final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
+ namedArgs.remove("lang");
if (!namedArgs.isEmpty()) {
EnParser.LOG.warning("weird qualifier: " + wikiTokenizer.token());
return false;
isoCodeToEnWikiName.put("BE", "Belarusian");
isoCodeToEnWikiName.put("BN", "Bengali");
isoCodeToEnWikiName.put("BG", "Bulgarian");
- isoCodeToEnWikiName.put("MY", "Burmese");
isoCodeToEnWikiName.put("CA", "Catalan");
isoCodeToEnWikiName.put("SH", "Serbo-Croatian");
isoCodeToEnWikiName.put("HR", "Croatian");
// No longer exists in EN:
// isoCodeToEnWikiName.put("BS", "Bosnian");
// isoCodeToEnWikiName.put("SR", "Serbian");
+
+ // Font doesn't work:
+ //isoCodeToEnWikiName.put("MY", "Burmese");
+
{
Set<String> missing = new LinkedHashSet<String>(isoCodeToEnWikiName.keySet());
+Handle wiki tables {| .. |-| .. |}
de-conj
why does presso not show up?
-Afferrare in it, italics don't end.
{{L
-start new intent for web link.
-
-
{{term
{{etyl
{{l
{{de-conj
-Spaces in links are done wrong: "perche mai",click "why on earth", see "why%20..."
Delete Italian (it) "conjugation of" entries.
-
Compression for PairEntries!
delete these entries:
# {{conjugation of|abalienare||2|p|pres|ind|lang=it}}
# {{conjugation of|abalienare||2|p|imp|lang=it}}
# {{form of|[[feminine|Feminine]] plural|abalienato}}
-
-HtmlEntry
- - text inside functions doesn't get escaped properly.
- - Skips Uebersetzung section (likewise in other langs), except maybe for the other language of interest.
- - Build single EN/DE/IT/FR dictionaries based on HtmlEntry.
- - Parse Italian verb forms from enwiktionary into something useful.
- - "See also" link entries for cross-referencing ("form of"--strong, links to token, "mentioned in"--weaker, links to HtmlEntry).
- Nice:
- - Add links into the HtmlEntry based on wikilinks.
- - Link to them from the appropriate places: IndexEntry (first), and individual rows (trickier, built at different times).
-
History of lookups.
make sure word is sticky when you change dictionaries.