package com.hughes.android.dictionary.engine;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
public static final String GOLDENS = "../DictionaryData/testdata/goldens/";
public static final String TEST_OUTPUTS = "../DictionaryData/testdata/outputs/";
- public static final String OUTPUTS = "../DictionaryData/outputs/";
- public void testWiktionaryItalian() throws Exception {
- final File result = new File(TEST_OUTPUTS + "wiktionary.it.quickdic");
+ public void testWiktionaryItalianFromItalian() throws Exception {
+ final String name = "wiktionary.it_it.quickdic";
+ final File result = new File(TEST_OUTPUTS + name);
System.out.println("Writing to: " + result);
DictionaryBuilder.main(new String[] {
"--dictOut=" + result.getAbsolutePath(),
"--lang2=EN",
"--dictInfo=SomeWikiData",
- /*
- "--input3=" + WIKISPLIT + "english.data",
- "--input3Name=enwiktionary.english",
- "--input3Format=enwiktionary",
- "--input3LangPattern=Italian",
- "--input3LangCodePattern=it",
- "--input3EnIndex=2",
- "--input3PageLimit=1000",
-*/
"--input4=" + WIKISPLIT + "italian.data",
"--input4Name=enwiktionary.italian",
"--input4Format=enwiktionary",
"--input4EnIndex=2",
"--input4PageLimit=1000",
- "--print=" + result.getName() + ".text",
+ "--print=" + result.getPath() + ".text",
});
- // Check it once:
- assertFilesEqual(GOLDENS + "wiktionary.it_it.quickdic.text", result.getName() + ".text");
-
- // Check it again.
- final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r"));
- final PrintStream out = new PrintStream(new File(result.getName() + ".text"));
- dict.print(out);
- out.close();
+ checkGolden(name, result);
+ }
+
+ public void testWiktionaryItalianFromEnglish() throws Exception {
+ final String name = "wiktionary.it_en.quickdic";
+ final File result = new File(TEST_OUTPUTS + name);
+ System.out.println("Writing to: " + result);
+ DictionaryBuilder.main(new String[] {
+ "--dictOut=" + result.getAbsolutePath(),
+ "--lang1=IT",
+ "--lang2=EN",
+ "--dictInfo=SomeWikiData",
+
+ "--input3=" + WIKISPLIT + "english.data",
+ "--input3Name=enwiktionary.english",
+ "--input3Format=enwiktionary",
+ "--input3LangPattern=Italian",
+ "--input3LangCodePattern=it",
+ "--input3EnIndex=2",
+ "--input3PageLimit=1000",
+
+ "--print=" + result.getPath() + ".text",
+ });
- assertFilesEqual(GOLDENS + "wiktionary.it_it.quickdic.text", result.getName() + ".text");
+ checkGolden(name, result);
}
public void testGermanCombined() throws Exception {
- final File result = new File(TEST_OUTPUTS + "de-en.quickdic");
+ final String name = "de-en.quickdic";
+ final File result = new File(TEST_OUTPUTS + name);
System.out.println("Writing to: " + result);
DictionaryBuilder.main(new String[] {
"--dictOut=" + result.getAbsolutePath(),
"--input2Charset=UTF8",
"--input2Format=dictcc",
- "--print=" + result.getName() + ".text",
+ "--print=" + result.getPath() + ".text",
});
+ checkGolden(name, result);
+ }
+
+ private void checkGolden(final String dictName, final File dictFile)
+ throws IOException, FileNotFoundException {
// Check it once:
- assertFilesEqual(GOLDENS + "de-en.quickdic.text", result.getName() + ".text");
-
+ assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
+
// Check it again.
- final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r"));
- final PrintStream out = new PrintStream(result.getName() + ".text");
+ final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
+ final PrintStream out = new PrintStream(new File(dictFile.getName() + ".text"));
dict.print(out);
out.close();
-
- assertFilesEqual(GOLDENS + "de-en.quickdic.text", result.getName() + ".text");
+ assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
}
-
void assertFilesEqual(final String expected, final String actual) throws IOException {
final String expectedString = FileUtil.readToString(new File(expected));
final String actualString = FileUtil.readToString(new File(actual));
"yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
private void doTranslations(final String title, final WikiTokenizer wikiTokenizer) {
+ if (title.equals("absolutely")) {
+ System.out.println();
+ }
+
String sense = null;
boolean done = false;
while (wikiTokenizer.nextToken() != null) {
if (!namedArgs.isEmpty() || args.size() > 1) {
LOG.warning("weird qualifier: " + line);
}
+ // Unindexed!
otherText.append("(").append(qualifier).append(")");
} else if (encodings.contains(functionName)) {
otherText.append("").append(args.get(0));
otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
- } else if (functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p")) {
- otherText.append("{");
- otherText.append(functionName);
- for (int i = 0; i < args.size(); ++i) {
- otherText.append("|").append(args.get(i));
- }
- otherText.append("}");
+ } else if (isGender(functionName)) {
+ appendGender(otherText, functionName, args);
} else if (functionName.equals("g")) {
otherText.append("{g}");
} else if (functionName.equals("l")) {
otherText.append("[").append(args.get(0)).append("]");
otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
} else if (functionName.equals("ttbc")) {
+ LOG.warning("Unexpected {{ttbc}}");
} else if (functionName.equals("trreq")) {
} else if (functionName.equals("not used")) {
otherText.append("(not used)");
} else if (functionName.equals("t-image")) {
// American sign language
- } else if (args.isEmpty() && namedArgs.isEmpty()) {
- otherText.append("{UNK. FUNC.: ").append(functionName).append("}");
} else {
- LOG.warning("Unexpected t+- wikifunction: " + line + ", title=" + title);
+ // Unindexed!
+ otherText.append(wikiTokenizer.token());
}
} else if (wikiTokenizer.isNewline()) {
}
}
-
- static final Pattern whitespace = Pattern.compile("\\s+");
- static String trim(final String s) {
- return whitespace.matcher(s).replaceAll(" ").trim();
+
+ private void appendGender(final StringBuilder otherText,
+ final String functionName, final List<String> args) {
+ otherText.append("{");
+ otherText.append(functionName);
+ for (int i = 0; i < args.size(); ++i) {
+ otherText.append("|").append(args.get(i));
+ }
+ otherText.append("}");
+ }
+
+
+ private boolean isGender(final String functionName) {
+ return functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p");
}
Set<String> pairsAdded = new LinkedHashSet<String>();
} else if (headingName.equals("Pronunciation")) {
//doPronunciation(wikiLineReader);
} else if (partOfSpeechHeader.matcher(headingName).matches()) {
- doPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer);
+ doForeignPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer);
}
} else {
}
}
}
+
+ static final class ListSection {
+ final String firstPrefix;
+ final String firstLine;
+ final List<String> nextPrefixes = new ArrayList<String>();
+ final List<String> nextLines = new ArrayList<String>();
+
+ public ListSection(String firstPrefix, String firstLine) {
+ this.firstPrefix = firstPrefix;
+ this.firstLine = firstLine;
+ }
+ @Override
+ public String toString() {
+ return firstPrefix + firstLine + "{ " + nextPrefixes + "}";
+ }
+ }
- private void doPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) {
+
+ private void doForeignPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) {
LOG.info("***" + title + ", pos=" + posHeading);
- //final StringBuilder foreignBuilder = new StringBuilder();
+ if (title.equals("moro")) {
+ System.out.println();
+ }
+
+ final StringBuilder foreignBuilder = new StringBuilder();
+ Collection<String> wordForms = Collections.emptyList();
+ final List<ListSection> listSections = new ArrayList<ListSection>();
+
+ try {
- String side = null;
- Collection<String> forms = Collections.emptyList();
+ ListSection lastListSection = null;
int currentHeadingDepth = posDepth;
while (wikiTokenizer.nextToken() != null) {
// I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!)
// for the conjugation table from "fa".
// Would like to be able to link to a lang#token.
- if (name.equals("it-noun")) {
- assert forms.isEmpty();
+ if (isGender(name)) {
+ appendGender(foreignBuilder, name, args);
+ } else if (name.equals("wikipedia")) {
+ namedArgs.remove("lang");
+ if (args.size() > 1 || !namedArgs.isEmpty()) {
+ // Unindexed!
+ foreignBuilder.append(wikiTokenizer.token());
+ } else if (args.size() == 1) {
+ foreignBuilder.append(wikiTokenizer.token());
+ } else {
+ //foreignBuilder.append(title);
+ }
+ } else if (name.equals("it-noun")) {
+ assert wordForms.isEmpty();
final String base = get(args, 0);
final String gender = get(args, 1);
final String singular = base + get(args, 2);
final String plural = base + get(args, 3);
- side = String.format("%s {%s}, %s {pl}", singular, gender, plural, plural);
- forms = Arrays.asList(singular, plural);
+ foreignBuilder.append(String.format("%s {%s}, %s {pl}", singular, gender, plural, plural));
+ wordForms = Arrays.asList(singular, plural);
} else if (name.equals("it-proper noun")) {
- // TODO
+ foreignBuilder.append(wikiTokenizer.token());
} else if (name.equals("it-adj")) {
- // TODO
+ foreignBuilder.append(wikiTokenizer.token());
} else if (name.startsWith("it-conj")) {
if (name.equals("it-conj-are")) {
itConjAre(args, namedArgs);
} else {
LOG.warning("Unknown conjugation: " + wikiTokenizer.token());
}
-
} else {
- LOG.warning("Unknown function: " + wikiTokenizer.token());
+ // Unindexed!
+ foreignBuilder.append(wikiTokenizer.token());
+ // LOG.warning("Unknown function: " + wikiTokenizer.token());
}
} else if (wikiTokenizer.isListItem()) {
- handleForeignListItem(side != null ? side : title, title, forms, wikiTokenizer);
-
+ final String prefix = wikiTokenizer.listItemPrefix();
+ if (lastListSection != null &&
+ prefix.startsWith(lastListSection.firstPrefix) &&
+ prefix.length() > lastListSection.firstPrefix.length()) {
+ lastListSection.nextPrefixes.add(prefix);
+ lastListSection.nextLines.add(wikiTokenizer.listItemWikiText());
+ } else {
+ lastListSection = new ListSection(prefix, wikiTokenizer.listItemWikiText());
+ listSections.add(lastListSection);
+ }
+ } else if (lastListSection != null) {
+ // Don't append anything after the lists, because there's crap.
} else if (wikiTokenizer.isWikiLink()) {
+ // Unindexed!
+ foreignBuilder.append(wikiTokenizer.wikiLinkText());
+
+ } else if (wikiTokenizer.isPlainText()) {
+ // Unindexed!
+ foreignBuilder.append(wikiTokenizer.token());
+
+ } else if (wikiTokenizer.isMarkup() || wikiTokenizer.isNewline() || wikiTokenizer.isComment()) {
+ // Do nothing.
+ } else {
+ LOG.warning("Unexpected token: " + wikiTokenizer.token());
+ }
+ }
+
+ } finally {
+ // Here's where we exit.
+ // TODO: Should we make an entry even if there are no foreign list items?
+ if (foreignBuilder.indexOf(title) == -1) {
+ foreignBuilder.insert(0, title + " ");
+ }
+ for (final ListSection listSection : listSections) {
+ doForeignListItem(foreignBuilder.toString(), title, wordForms, listSection);
+ }
+ }
+ }
+
+
+ static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile(
+ "(first|second|third)-person (singular|plural)|" +
+ "present tense|" +
+ "imperative"
+ );
+
+ private void doForeignListItem(final String foreignText, String title, final Collection<String> forms, final ListSection listSection) {
+
+ final String prefix = listSection.firstPrefix;
+ if (prefix.length() > 1) {
+ LOG.warning("Prefix too long: " + listSection);
+ return;
+ }
+
+ final PairEntry pairEntry = new PairEntry();
+ final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+
+ final StringBuilder englishBuilder = new StringBuilder();
+
+ final String mainLine = listSection.firstLine;
+
+ final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false);
+ while (englishTokenizer.nextToken() != null) {
+ // TODO handle form of....
+ if (englishTokenizer.isPlainText()) {
+ englishBuilder.append(englishTokenizer.token());
+ enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF);
+ } else if (englishTokenizer.isWikiLink()) {
+ final String text = englishTokenizer.wikiLinkText();
+ final String link = englishTokenizer.wikiLinkDest();
+ if (link != null) {
+ if (link.contains("#English")) {
+ englishBuilder.append(text);
+ enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+ } else if (link.contains("#") && this.langPattern.matcher(link).find()) {
+ englishBuilder.append(text);
+ otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
+ } else {
+ LOG.warning("Special link: " + englishTokenizer.token());
+ // TODO: something here...
+ }
+ } else {
+ // link == null
+ englishBuilder.append(text);
+ if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) {
+ enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+ }
+ }
+ } else if (englishTokenizer.isFunction()) {
+ final String name = englishTokenizer.functionName();
+ if (name.contains(" conjugation of ") ||
+ name.contains(" form of ") ||
+ name.contains(" feminine of ") ||
+ name.contains(" plural of ")) {
+ // Ignore these in the index, they're really annoying....
+ englishBuilder.append(englishTokenizer.token());
+ } else {
+ LOG.warning("Unexpected function: " + englishTokenizer.token());
+ }
} else {
+ if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) {
+ } else {
+ LOG.warning("Unexpected definition text: " + englishTokenizer.token());
+ }
+ }
+ }
+ final String english = trim(englishBuilder.toString());
+ if (english.length() > 0) {
+ final Pair pair = new Pair(english, trim(foreignText), this.swap);
+ pairEntry.pairs.add(pair);
+ otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+ for (final String form : forms) {
+ otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI);
}
-
}
}
+
private void itConjAre(List<String> args, Map<String, String> namedArgs) {
final String base = args.get(0);
final String aux = args.get(1);
}
}
- final List<String> listPrefixes = new ArrayList<String>();
- final List<String> listLines = new ArrayList<String>();
-
-static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile(
- "(first|second|third)-person (singular|plural)|" +
- "present tense|" +
- "imperative"
- );
-
- private void handleForeignListItem(final String foreignText, String title, final Collection<String> forms, final WikiTokenizer wikiTokenizer) {
-
- final String prefix = wikiTokenizer.listItemPrefix();
- if (prefix.length() > 1) {
- LOG.warning("Prefix too long: " + wikiTokenizer.token());
- return;
- }
-
- listPrefixes.clear();
- listLines.clear();
- listPrefixes.add(prefix);
- listLines.add(wikiTokenizer.listItemWikiText());
- while(wikiTokenizer.nextToken() != null &&
- wikiTokenizer.isNewline() ||
- wikiTokenizer.isComment() ||
- (wikiTokenizer.isListItem() &&
- wikiTokenizer.listItemPrefix().length() > prefix.length() &&
- wikiTokenizer.listItemPrefix().startsWith(prefix))) {
- if (wikiTokenizer.isListItem()) {
- listPrefixes.add(wikiTokenizer.listItemPrefix());
- listLines.add(wikiTokenizer.listItemWikiText());
- }
- }
- if (wikiTokenizer.nextToken() != null) {
- wikiTokenizer.returnToLineStart();
- }
- LOG.info("list lines: " + listLines);
- LOG.info("list prefixes: " + listPrefixes);
-
- final PairEntry pairEntry = new PairEntry();
- final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-
- final String foreign = trim(title);
-
- final StringBuilder englishBuilder = new StringBuilder();
-
- final String mainLine = listLines.get(0);
-
- final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine, false);
- while (englishTokenizer.nextToken() != null) {
- // TODO handle form of....
- if (englishTokenizer.isPlainText()) {
- englishBuilder.append(englishTokenizer.token());
- enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF);
- } else if (englishTokenizer.isWikiLink()) {
- final String text = englishTokenizer.wikiLinkText();
- final String link = englishTokenizer.wikiLinkDest();
- if (link != null) {
- if (link.contains("#English")) {
- englishBuilder.append(text);
- enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
- } else if (link.contains("#") && this.langPattern.matcher(link).find()) {
- englishBuilder.append(text);
- otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
- } else {
- LOG.warning("Special link: " + englishTokenizer.token());
- // TODO: something here...
- }
- } else {
- // link == null
- englishBuilder.append(text);
- if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) {
- enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
- }
- }
- } else if (englishTokenizer.isFunction()) {
- final String name = englishTokenizer.functionName();
- if (name.contains(" conjugation of ") ||
- name.contains(" form of ") ||
- name.contains(" feminine of ") ||
- name.contains(" plural of ")) {
- // Ignore these in the index, they're really annoying....
- englishBuilder.append(englishTokenizer.token());
- } else {
- LOG.warning("Unexpected function: " + englishTokenizer.token());
- }
- } else {
- if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) {
- } else {
- LOG.warning("Unexpected definition text: " + englishTokenizer.token());
- }
- }
- }
- final String english = trim(englishBuilder.toString());
- if (english.length() > 0) {
- final Pair pair = new Pair(english, trim(foreignText), this.swap);
- pairEntry.pairs.add(pair);
- otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
- for (final String form : forms) {
- otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI);
- }
- }
+ static final Pattern whitespace = Pattern.compile("\\s+");
+ static String trim(final String s) {
+ return whitespace.matcher(s).replaceAll(" ").trim();
}