-handle examples.
+icons
handle word-info in English.
new Lang("^English$", "EN", null, "en.txt"),
};
Lang[] langs2 = new Lang[] {
- //new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
- new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
- new Lang("^.*Spanish.*$", "ES", "spanish.data", "empty.txt"),
- new Lang("^.*Greek.*$", "EL", "greek.data", "empty.txt"),
- new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"),
- new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"),
+// new Lang("^.*Italian.*$", "IT", "italian.data", "it.txt"),
+// new Lang("^.*French.*$", "FR", "french.data", "empty.txt"),
+// new Lang("^.*Spanish.*$", "ES", "spanish.data", "es.txt"),
+// new Lang("^.*Greek.*$", "EL", "greek.data", "el.txt"),
+// new Lang("^.*Japanese.*$", "JA", "japanese.data", "empty.txt"),
+// new Lang("^.*Chinese.*$|^.*Mandarin.*$", "ZH", "mandarin.data", "empty.txt"),
+ new Lang("^.*Afrikaans.*$", "AF", "afrikaans.data", "empty.txt"),
+ new Lang("^.*Arabic.*$", "AR", "".data, "empty.txt"),
+ new Lang("^.*Hebrew.*$", "HE"),
+ new Lang("^.*Hindi.*$", "HI"),
+ new Lang("^.*Icelandic.*$", "IS"),
+ new Lang("^.*Irish.*$", "GA"),
+ new Lang("^.*Korean.*$", "KO"),
+ new Lang("^.*Maori.*$", "MI"),
+ new Lang("^.*Norwegian.*$", "NO"),
+ new Lang("^.*Persian.*$", "FA"),
+ new Lang("^.*Portuguese.*$", "PT"),
+ new Lang("^.*Romanian.*$", "RO"),
+ new Lang("^.*Russian.*$", "RU"),
+ new Lang("^.*Sanskrit.*$", "SA"),
+ new Lang("^.*Serbian.*$", "SR"),
+ new Lang("^.*Swedish.*$", "SV"),
+ new Lang("^.*Tajik.*$", "TG"),
+ new Lang("^.*Thai.*$", "TH"),
+ new Lang("^.*Tibetan.*$", "BO"),
+ new Lang("^.*Turkish.*$", "TR"),
+ new Lang("^.*Ukranian.*$", "UK"),
+ new Lang("^.*Vietnamese.*$", "VI"),
+ new Lang("^.*Welsh.*$", "CY"),
+ new Lang("^.*Zulu.*$", "ZU"),
+ new Lang("^.*Croation.*$", "HR"),
+ new Lang("^.*Czech.*$", "CS"),
+ new Lang("^.*Dutch.*$", "NL"),
+ new Lang("^.*Finnish.*$", "FI"),
/*
new Lang("^German$", "DE"),
- new Lang("^Afrikaans$", "AF"),
new Lang("^Armenian$", "HY"),
- new Lang("^Arabic$", "AR"),
- new Lang("^Croation$", "HR"),
- new Lang("^Czech$", "CS"),
- new Lang("^Dutch$", "NL"),
new Lang("^English$", "EN"),
- new Lang("^Finnish$", "FI"),
- new Lang("^Hebrew$", "HE"),
- new Lang("^Hindi$", "HI"),
- new Lang("^Icelandic$", "IS"),
- new Lang("^Irish$", "GA"),
- new Lang("^Korean$", "KO"),
new Lang("^Kurdish$", "KU"),
new Lang("^Lithuanian$", "LT"),
new Lang("^Malay$", "MS"),
- new Lang("^Maori$", "MI"),
new Lang("^Mongolian$", "MN"),
- new Lang("^Norwegian$", "NO"),
- new Lang("^Persian$", "FA"),
- new Lang("^Portuguese$", "PT"),
- new Lang("^Romanian$", "RO"),
- new Lang("^Russian$", "RU"),
- new Lang("^Sanskrit$", "SA"),
- new Lang("^Serbian$", "SR"),
new Lang("^Somali$", "SO"),
new Lang("^Sudanese$", "SU"),
- new Lang("^Swedish$", "SV"),
- new Lang("^Tajik$", "TG"),
- new Lang("^Thai$", "TH"),
- new Lang("^Tibetan$", "BO"),
- new Lang("^Turkish$", "TR"),
- new Lang("^Ukranian$", "UK"),
- new Lang("^Vietnamese$", "VI"),
- new Lang("^Welsh$", "CY"),
new Lang("^Yiddish$", "YI"),
- new Lang("^Zulu$", "ZU"),*/
+ */
};
for (final Lang lang1 : langs1) {
} // langs1
DictionaryBuilder.main(new String[] {
- "--dictOut=" + OUTPUTS + "DE-EN_chemnitz.quickdic",
+ "--dictOut=" + OUTPUTS + "DE-EN_all_free.quickdic",
"--lang1=DE",
"--lang2=EN",
- "--dictInfo=@" + INPUTS + "de-en_chemnitz.info",
+ "--dictInfo=@" + INPUTS + "de-en_all_free.info",
"--input1=" + INPUTS + "de-en_chemnitz.txt",
"--input1Name=chemnitz",
"--input3Name=dictcc",
"--input3Charset=UTF8",
"--input3Format=dictcc",
-
- // TODO: wiktionary
});
}
if (selectors.isEmpty()) {
selectors.addAll(Arrays.asList(
- new Selector("../DictionaryData/inputs/enWikiSplit/arabic.data", ".*[Ar]rabic.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/croation.data", ".*[Cc]roation.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/czech.data", ".*[Cc]zech.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/dutch.data", ".*[Du]utch.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/english.data", ".*[Ee]nglish.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/french.data", ".*[Ff]rench.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/german.data", ".*[Gg]erman.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/greek.data", ".*[Gg]reek.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/hindi.data", ".*[Hh]indi.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/italian.data", ".*[Ii]talian.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/japanese.data", ".*[Jj]apanese.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/korean.data", ".*[Kk]orean.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/persian.data", ".*[Pp]ersian.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/portuguese.data", ".*[Pp]ortuguese.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/romanian.data", ".*[Rr]omanian.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/russian.data", ".*[Rr]ussian.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/spanish.data", ".*[Ss]panish.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/swedish.data", ".*[Ss]wedish.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/thai.data", ".*[Tt]hai.*"),
- new Selector("../DictionaryData/inputs/enWikiSplit/vietnamese.data", ".*[Vv]ietnamese.*")
+ new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roation.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Mm]andarin|[Cc]hinese.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Du]utch.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]inish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ewbrew.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/SU.data", ".*[Ss]udanese.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]kranian.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"),
+ new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*")
));
}
static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName());
- // TODO: look for {{ and [[ and <adf> <!-- in output.
// TODO: process {{ttbc}} lines
static final Pattern partOfSpeechHeader = Pattern.compile(
sense = null;
} else if (functionName.equals("trans-mid")) {
} else if (functionName.equals("trans-see")) {
- // TODO
+ // TODO: would also be nice to handle trans-see.
} else if (functionName.startsWith("picdic")) {
} else if (functionName.startsWith("checktrans")) {
} else if (functionName.startsWith("ttbc")) {
wikiTokenizer.nextLine();
- // TODO: would be great to handle
- //TODO: Check this: done = true;
+ // TODO: would be great to handle ttbc
+ // TODO: Check this: done = true;
} else {
LOG.warning("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
}
if (rest.length() > 0) {
doTranslationLine(line, title, pos, sense, rest);
} else {
- // TODO: do lines that are like Greek:
+ // TODO: handle lines that are just a language name with an empty rest, like "Greek:"
}
} else if (wikiTokenizer.remainderStartsWith("''See''")) {
}
if (currentHeadingDepth > posDepth) {
- // TODO
+ // TODO: deal with other neat info sections
continue;
}
} finally {
// Here's where we exit.
- // TODO: Should we make an entry even if there are no foreign list items?
+ // Should we make an entry even if there are no foreign list items?
String foreign = foreignBuilder.toString().trim();
if (!foreign.toLowerCase().startsWith(title.toLowerCase())) {
foreign = title + " " + foreign;
mdashLen = 3;
}
- // TODO: index and clean these!!!
- if (nextPrefix.equals("#:") && dash != -1) {
+ if ((nextPrefix.equals("#:") || nextPrefix.equals("##:")) && dash != -1) {
final String foreignEx = nextLine.substring(0, dash);
final String englishEx = nextLine.substring(dash + mdashLen);
- final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder), formatAndIndexExampleString(foreignEx, otherIndexBuilder), swap);
- pairEntry.pairs.add(pair);
+ final Pair pair = new Pair(formatAndIndexExampleString(englishEx, enIndexBuilder, indexedEntry), formatAndIndexExampleString(foreignEx, otherIndexBuilder, indexedEntry), swap);
+ if (pair.lang1 != "--" && pair.lang1 != "--") {
+ pairEntry.pairs.add(pair);
+ }
lastForeign = null;
- } else if (nextPrefix.equals("#:")){
- final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap);
+ } else if (nextPrefix.equals("#:") || nextPrefix.equals("##:")){
+ final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
lastForeign = nextLine;
- pairEntry.pairs.add(pair);
+ if (pair.lang1 != "--" && pair.lang1 != "--") {
+ pairEntry.pairs.add(pair);
+ }
} else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) {
if (lastForeign != null) {
pairEntry.pairs.remove(pairEntry.pairs.size() - 1);
- final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder), formatAndIndexExampleString(lastForeign, otherIndexBuilder), swap);
- pairEntry.pairs.add(pair);
+ final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap);
+ if (pair.lang1 != "--" && pair.lang1 != "--") {
+ pairEntry.pairs.add(pair);
+ }
} else {
LOG.warning("English example with no foreign: " + title + ", " + nextLine);
}
} else if (nextPrefix.equals("#*")) {
// Can't really index these.
- final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap);
+ final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
lastForeign = nextLine;
- pairEntry.pairs.add(pair);
- } else if (nextPrefix.equals("#::*")) {
- final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null), swap);
- pairEntry.pairs.add(pair);
- } else {
- assert false;
+ if (pair.lang1 != "--" && pair.lang1 != "--") {
+ pairEntry.pairs.add(pair);
+ }
+ } else if (nextPrefix.equals("#::*") || nextPrefix.equals("##") || nextPrefix.equals("#*:") || nextPrefix.equals("#:*") || true) {
+ final Pair pair = new Pair("--", formatAndIndexExampleString(nextLine, null, indexedEntry), swap);
+ if (pair.lang1 != "--" && pair.lang1 != "--") {
+ pairEntry.pairs.add(pair);
+ }
+// } else {
+// assert false;
}
}
}
- private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder) {
+ private String formatAndIndexExampleString(final String example, final IndexBuilder indexBuilder, final IndexedEntry indexedEntry) {
final WikiTokenizer wikiTokenizer = new WikiTokenizer(example, false);
final StringBuilder builder = new StringBuilder();
boolean insideTripleQuotes = false;
while (wikiTokenizer.nextToken() != null) {
if (wikiTokenizer.isPlainText()) {
builder.append(wikiTokenizer.token());
-
+ if (indexBuilder != null) {
+ indexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.token(), EntryTypeName.WIKTIONARY_EXAMPLE);
+ }
} else if (wikiTokenizer.isWikiLink()) {
- builder.append(wikiTokenizer.wikiLinkText());
-
+ final String text = wikiTokenizer.wikiLinkText().replaceAll("'", "");
+ builder.append(text);
+ if (indexBuilder != null) {
+ indexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_EXAMPLE);
+ }
} else if (wikiTokenizer.isFunction()) {
builder.append(wikiTokenizer.token());
} else if (wikiTokenizer.isMarkup()) {
LOG.warning("unexpected token: " + wikiTokenizer.token());
}
}
- return trim(builder.toString());
+ final String result = trim(builder.toString());
+ return result.length() > 0 ? result : "--";
}