//new Lang("^German$", "DE"),
};
Lang[] langs2 = new Lang[] {
- new Lang("^Italian$", "IT"),
+// new Lang("^.*Greek.*$", "EL"),
+ new Lang("^.*Spanish.*$", "ES"),
+ new Lang("^.*Italian.*$", "IT"),
+ /*
new Lang("^German$", "DE"),
new Lang("^Afrikaans$", "AF"),
new Lang("^Armenian$", "HY"),
new Lang("^English$", "EN"),
new Lang("^Finnish$", "FI"),
new Lang("^French$", "FR"),
- new Lang("^Greek$", "EL"),
new Lang("^Hebrew$", "HE"),
new Lang("^Hindi$", "HI"),
new Lang("^Icelandic$", "IS"),
new Lang("^Sanskrit$", "SA"),
new Lang("^Serbian$", "SR"),
new Lang("^Somali$", "SO"),
- new Lang("^Spanish$", "ES"),
new Lang("^Sudanese$", "SU"),
new Lang("^Swedish$", "SV"),
new Lang("^Tajik$", "TG"),
new Lang("^Vietnamese$", "VI"),
new Lang("^Welsh$", "CY"),
new Lang("^Yiddish$", "YI"),
- new Lang("^Zulu$", "ZU"),
+ new Lang("^Zulu$", "ZU"),*/
};
for (final Lang lang1 : langs1) {
}
int enIndex = -1;
+ Lang nonEnglish = null;
if (lang2.code.equals("EN")) {
enIndex = 2;
+ nonEnglish = lang1;
}
if (lang1.code.equals("EN")) {
enIndex = 1;
+ nonEnglish = lang2;
}
+ assert nonEnglish != null;
final String dictFile = String.format("dictOutputs/%s-%s_enwiktionary.quickdic", lang1.code, lang2.code);
System.out.println("building dictFile: " + dictFile);
String.format("--lang2=%s", lang2.code),
String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.code, lang2.code),
- "--input1=dictInputs/enwiktionary-20110205-pages-articles.xml",
- "--input1Name=enwiktionary",
- "--input1Format=enwiktionary",
- String.format("--input1TranslationPattern1=%s", lang1.nameRegex),
- String.format("--input1TranslationPattern2=%s", lang2.nameRegex),
- String.format("--input1EnIndex=%d", enIndex),
+ "--input3=wikiSplit/english.data",
+ "--input3Name=enwiktionary.english",
+ "--input3Format=enwiktionary",
+ "--input3LangPattern=" + nonEnglish.nameRegex,
+ "--input3LangCodePattern=" + (enIndex == 1 ? lang2.code : lang1.code).toLowerCase(),
+ "--input3EnIndex=" + enIndex,
+
});
// Print the entries for diffing.
sense = null;
} else if (functionName.equals("trans-mid")) {
} else if (functionName.equals("trans-see")) {
+ // TODO: handle the "trans-see" wikifunction; this branch currently does nothing with it.
+ } else if (functionName.startsWith("picdic")) {
} else if (functionName.startsWith("checktrans")) {
+ } else if (functionName.startsWith("ttbc")) {
+ wikiTokenizer.nextLine();
+ // TODO: would be great to handle "ttbc" entries here; for now we only call nextLine() on them.
//TODO: Check this: done = true;
} else {
System.err.println("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
}
String rest = line.substring(colonIndex + 1).trim();
- doTranslationLine(line, title, sense, rest);
+ if (rest.length() > 0) {
+ doTranslationLine(line, title, sense, rest);
+ } else {
+ // TODO: handle lines that have nothing after the colon (only a language label, e.g. "Greek:").
+ }
} else if (wikiTokenizer.remainderStartsWith("''See''")) {
wikiTokenizer.nextLine();
// Good chance we'll actually file this one...
final PairEntry pairEntry = new PairEntry();
final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-
+
final StringBuilder otherText = new StringBuilder();
- final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest);
+ final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest, false);
while (wikiTokenizer.nextToken() != null) {
if (wikiTokenizer.isPlainText()) {
final List<String> args = wikiTokenizer.functionPositionArgs();
final Map<String,String> namedArgs = wikiTokenizer.functionNamedArgs();
- if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø")) {
+ if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø") || functionName.equals("apdx-t")) {
if (args.size() < 2) {
System.err.println("{{t}} with too few args: " + line + ", title=" + title);
continue;
}
final String langCode = get(args, 0);
- if (this.langCodePattern.matcher(langCode).matches()) {
+ //if (this.langCodePattern.matcher(langCode).matches()) {
final String word = get(args, 1);
final String gender = get(args, 2);
final String transliteration = namedArgs.get("tr");
otherText.append(String.format(" (tr. %s)", transliteration));
otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
}
- }
+ //}
} else if (functionName.equals("qualifier")) {
String qualifier = args.get(0);
if (!namedArgs.isEmpty() || args.size() > 1) {
} else {
System.err.println("Bad translation token: " + wikiTokenizer.token());
}
-
+ }
+ if (otherText.length() == 0) {
+ System.err.println("Empty otherText: " + line);
+ return;
}
StringBuilder englishText = new StringBuilder();
final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap);
pairEntry.pairs.add(pair);
- assert (pairsAdded.add(pair.toString()));
+ if (!pairsAdded.add(pair.toString())) {
+ System.err.println("Duplicate pair: " + pair.toString());
+ }
if (pair.toString().equals("libero {m} :: free (adjective)")) {
System.out.println();
}
final String mainLine = listLines.get(0);
- final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine);
+ final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false);
while (englishTokenizer.nextToken() != null) {
// TODO handle form of....
if (englishTokenizer.isPlainText()) {