+++ /dev/null
-package com.hughes.android.dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.regex.Pattern;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-import com.hughes.android.dictionary.engine.Dictionary;
-import com.hughes.util.MapUtil;
-import com.hughes.util.StringUtil;
-
-public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler {
-
- final Dictionary dict;
-
- StringBuilder titleBuilder;
- StringBuilder textBuilder;
- StringBuilder currentBuilder = null;
-
- public WiktionaryXmlParser(final Dictionary dict) {
- this.dict = dict;
- }
-
- @Override
- public void startElement(String uri, String localName, String qName,
- Attributes attributes) {
- currentBuilder = null;
- if ("page".equals(qName)) {
- titleBuilder = new StringBuilder();
- textBuilder = new StringBuilder();
- } else if ("title".equals(qName)) {
- currentBuilder = titleBuilder;
- } else if ("text".equals(qName)) {
- currentBuilder = textBuilder;
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- if (currentBuilder != null) {
- currentBuilder.append(ch, start, length);
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
- currentBuilder = null;
- if ("page".equals(qName)) {
- endPage();
- }
- }
-
- private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL);
-
- // MULTILINE for ^
- private static final Pattern SECTION_HEADER = Pattern
- .compile("=== *\\{\\{Wortart\\|");
-
- private static final Pattern WORTART_DELIM = Pattern.compile("===",
- Pattern.LITERAL);
- private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}");
-
- private static final Pattern WIKI_QUOTE = Pattern.compile("''",
- Pattern.LITERAL);
- private static final Pattern WIKI_DOUBLE_BRACE = Pattern
- .compile("\\{\\{([^}]+)\\}\\}");
- private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
- .compile("\\[\\[([^\\]]+)\\]\\]");
- private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE);
-
- enum Field {
- Wortart("Wortart", null),
-
- Aussprache("Aussprache", null),
-
- Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")),
-
- Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")),
-
- Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")),
-
- Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")),
-
- Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")),
-
- Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")),
-
- Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")),
-
- Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")),
-
- CharakteristischeWortkombinationen("Charakteristische Wortkombinationen",
- Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")),
-
- AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern
- .compile("\\{\\{Abgeleitete Begriffe\\}\\}")),
-
- Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")),
-
- Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")),
-
- ;
-
- final String name;
- final Pattern listPattern;
-
- Field(final String name, final Pattern listPattern) {
- this.name = name;
- this.listPattern = listPattern;
- }
- }
-
- private static final Pattern WORTART = Pattern
- .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}");
- private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:",
- Pattern.LITERAL);
-
- private final Map<String, AtomicInteger> errorCounts = new TreeMap<String, AtomicInteger>();
-
- private void endPage() {
-
- StringBuilder text = textBuilder;
- text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\""));
- text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1"));
-
- // Remove comments.
- StringUtil.removeAll(text, Pattern.compile("<!--", Pattern.LITERAL),
- Pattern.compile("-->", Pattern.LITERAL));
-
- String sectionString;
- while ((sectionString = StringUtil.remove(text, SECTION_HEADER,
- SECTION_HEADER, false)) != null) {
- final StringBuilder section = new StringBuilder(sectionString);
-
- String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM,
- true);
- if (wortart.contains("\n") || !wortart.contains("eutsch")) {
- MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart,
- AtomicInteger.class).incrementAndGet();
- continue;
- }
-
- final LinkedHashMap<Field, List<String>> fieldToValue = new LinkedHashMap<Field, List<String>>();
-
- wortart = wortart.replaceAll("===", "");
- wortart = WORTART.matcher(wortart).replaceAll("$1");
- wortart = GENDER.matcher(wortart).replaceAll("{$1}");
- wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1");
- wortart = wortart.replaceAll("Wortart\\|", "");
- wortart = wortart.trim();
- fieldToValue.put(Field.Wortart, Collections.singletonList(wortart));
-
- String aussprache = StringUtil
- .remove(section, AUSSPRACHE, NEWLINE, false);
- if (aussprache != null) {
- aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
- aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
- aussprache = aussprache.replaceAll("Lautschrift\\|ˈ?", "");
- aussprache = aussprache.trim();
- fieldToValue.put(Field.Aussprache, Collections
- .singletonList(aussprache));
- }
-
- for (final Field field : Field.values()) {
- if (field.listPattern != null) {
- fieldToValue.put(field, extractList(section, field.listPattern));
- }
- }
-
- System.out.println(titleBuilder);
- for (final Field field : Field.values()) {
- if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) {
- fieldToValue.remove(field);
- } else {
- if (field.name != null) {
-// System.out.println(field.name);
-// for (final String line : fieldToValue.get(field)) {
-// System.out.println(" " + line);
-// }
- }
- }
- }
-// System.out.println("WHAT'S LEFT:");
-// System.out.println(section);
-// System.out.println("------------------------------------------------");
-
- }
-
- }
-
- private List<String> extractList(final StringBuilder section,
- final Pattern start) {
- final List<String> result = new ArrayList<String>();
- final String linesString = StringUtil.remove(section, start,
- WIKI_NEW_SECTION, false);
- if (linesString != null) {
- String[] lines = linesString.split("\n");
- for (int i = 1; i < lines.length; ++i) {
- String bedeutung = lines[i];
- bedeutung = bedeutung.replaceFirst("^:+", "");
- bedeutung = bedeutung.trim();
- if (bedeutung.length() > 0) {
- result.add(bedeutung);
- }
- }
- }
- return result;
- }
-
- void parse(final File file) throws ParserConfigurationException,
- SAXException, IOException {
- final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
- parser.parse(file, this);
- System.out.println(errorCounts);
- }
-
-}
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
+import java.util.regex.Pattern;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+import com.hughes.android.dictionary.parser.DictFileParser;
+import com.hughes.android.dictionary.parser.EnWiktionaryXmlParser;
import com.hughes.util.Args;
import com.hughes.util.FileUtil;
public class DictionaryBuilder {
- final Dictionary dictionary;
-
- final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
+ public final Dictionary dictionary;
+ public final List<IndexBuilder> indexBuilders = new ArrayList<IndexBuilder>();
- public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) {
+ public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2) {
dictionary = new Dictionary(dictInfo);
- indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, false));
- indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, true));
+ indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, false));
+ indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, true));
}
void build() {
}
}
- public static void main(final String[] args) throws IOException {
+ public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException {
final Map<String,String> keyValueArgs = Args.keyValueArgs(args);
final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));
fatalError("--lang1= and --lang2= must both be specified.");
}
+ String normalizerRules1 = keyValueArgs.remove("normalizerRules1");
+ String normalizerRules2 = keyValueArgs.remove("normalizerRules2");
+ if (normalizerRules1 == null) {
+ normalizerRules1 = lang1.getDefaultNormalizerRules();
+ }
+ if (normalizerRules2 == null) {
+ normalizerRules2 = lang2.getDefaultNormalizerRules();
+ }
+
final String dictOutFilename = keyValueArgs.remove("dictOut");
if (dictOutFilename == null) {
fatalError("--dictOut= must be specified.");
System.out.println("lang1=" + lang1);
System.out.println("lang2=" + lang2);
+ System.out.println("normalizerRules1=" + normalizerRules1);
+ System.out.println("normalizerRules2=" + normalizerRules2);
System.out.println("dictInfo=" + dictInfo);
System.out.println("dictOut=" + dictOutFilename);
- final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2);
+ final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2);
for (int i = 0; i < 100; ++i) {
final String prefix = "input" + i;
new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
} else if ("chemnitz".equals(inputFormat)) {
new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file);
- } else if ("wiktionary".equals(inputFormat)) {
- throw new RuntimeException();
-// new WiktionaryXmlParser(dict).parse(file);
+ } else if ("enwiktionary".equals(inputFormat)) {
+ final Pattern[] translationPatterns = new Pattern[2];
+ translationPatterns[0] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern1"));
+ translationPatterns[1] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern2"));
+ final int enIndex = Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1;
+ if (enIndex < 0 || enIndex >= 2) {
+ fatalError("Must be 1 or 2: " + prefix + "EnIndex");
+ }
+ new EnWiktionaryXmlParser(dictionaryBuilder, translationPatterns, enIndex).parse(file);
} else {
fatalError("Invalid or missing input format: " + inputFormat);
}
public class DictionaryBuilderTest extends TestCase {
- public void testGermanCombined() throws IOException {
- final File result = new File("testdata/de_en.dict");
+ public void testGermanCombined() throws Exception {
+ final File result = new File("testdata/de-en.quickdic");
System.out.println("Writing to: " + result);
DictionaryBuilder.main(new String[] {
"--dictOut=" + result.getAbsolutePath(),
"--lang2=EN",
"--dictInfo=@testdata/de-en_dictInfo.txt",
- "--input1=testdata/de-en_chemnitz_100",
- "--input1Name=dictcc",
- "--input1Charset=UTF8",
- "--input1Format=chemnitz",
+// "--input1=testdata/de-en_chemnitz_100",
+// "--input1Name=dictcc",
+// "--input1Charset=UTF8",
+// "--input1Format=chemnitz",
+//
+// "--input2=testdata/de-en_dictcc_100",
+// "--input2Name=dictcc",
+// "--input2Charset=UTF8",
+// "--input2Format=dictcc",
+
+ "--input3=testdata/enwiktionary_small.xml",
+ "--input3Name=enwiktionary",
+ "--input3Format=enwiktionary",
+ "--input3TranslationPattern1=[Gg]erman",
+          "--input3TranslationPattern2=[Ee]nglish",
+ "--input3EnIndex=2",
- "--input2=testdata/de-en_dictcc_100",
- "--input2Name=dictcc",
- "--input2Charset=UTF8",
- "--input2Format=dictcc",
-
"--print=testdata/de-en.test",
});
// Check it once:
- assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+ assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test");
// Check it again.
final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r"));
- final PrintStream out = new PrintStream(new File("testdata/de_en.test"));
+ final PrintStream out = new PrintStream(new File("testdata/de-en.test"));
dict.print(out);
out.close();
- assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test");
+ assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test");
}
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import junit.framework.TestCase;
+
+public class DictionaryBuilder_DE extends TestCase {
+
+ public static void main(final String[] args) throws Exception {
+
+ DictionaryBuilder.main(new String[] {
+ "--dictOut=dictOutputs/de-en_chemnitz.quickdic",
+ "--lang1=DE",
+ "--lang2=EN",
+ "--dictInfo=@dictInputs/de-en_chemnitz.info",
+
+ "--input1=dictInputs/de-en_chemnitz.txt",
+ "--input1Name=dictcc",
+ "--input1Charset=UTF8",
+ "--input1Format=chemnitz",
+ });
+
+ DictionaryBuilder.main(new String[] {
+ "--dictOut=dictOutputs/de-en_all.quickdic",
+ "--lang1=DE",
+ "--lang2=EN",
+ "--dictInfo=@dictInputs/de-en_all.info",
+
+ "--input1=dictInputs/de-en_chemnitz.txt",
+ "--input1Name=dictcc",
+ "--input1Charset=UTF8",
+ "--input1Format=chemnitz",
+
+ "--input2=dictInputs/de-en_dictcc.txt",
+ "--input2Name=dictcc",
+ "--input2Charset=UTF8",
+ "--input2Format=dictcc",
+ });
+
+ }
+
+}
import junit.framework.TestCase;
-import com.hughes.android.dictionary.engine.Index.SearchResult;
+import com.hughes.android.dictionary.engine.Index.IndexEntry;
+import com.ibm.icu.text.Transliterator;
public class DictionaryTest extends TestCase {
public void testGermanMetadata() throws IOException {
- final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+ final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r");
final Dictionary dict = new Dictionary(raf);
final Index deIndex = dict.indices.get(0);
for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) {
System.out.println("testing: " + indexEntry.token);
- final Index.SearchResult searchResult = deIndex.findLongestSubstring(indexEntry.token, new AtomicBoolean(
+ final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean(
false));
- assertEquals(indexEntry.token.toLowerCase(), searchResult.insertionPoint.token.toLowerCase());
- assertEquals(indexEntry.token.toLowerCase(), searchResult.longestPrefix.token.toLowerCase());
+ assertEquals(indexEntry.token.toLowerCase(), searchResult.token.toLowerCase());
}
// TODO: maybe if user types capitalization, use it.
- assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aaac", new AtomicBoolean(false)));
- assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("AAAC", new AtomicBoolean(false)));
- assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("AAAc", new AtomicBoolean(false)));
- assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aAac", new AtomicBoolean(false)));
+ assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false)));
+ assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false)));
+ assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false)));
+ assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false)));
// Before the beginning.
- assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("", new AtomicBoolean(false)));
- assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("__", new AtomicBoolean(false)));
+ assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false)));
+ assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false)));
// After the end.
- assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findLongestSubstring("ZZZZZ", new AtomicBoolean(false)));
-
- assertSearchResult("ab", "aaac", deIndex.findLongestSubstring("aaaca", new AtomicBoolean(false)));
- assertSearchResult("machen", "machen", deIndex.findLongestSubstring("m", new AtomicBoolean(false)));
+ assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false)));
- assertFalse(deIndex.findLongestSubstring("macdddd", new AtomicBoolean(false)).success);
+ assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false)));
+ assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false)));
+ assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false)));
- assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberprüfe", new AtomicBoolean(false)));
- assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpruefe", new AtomicBoolean(false)));
+ assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false)));
+ assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false)));
- assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpBLEH", new AtomicBoolean(false)));
- assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("überprBLEH", new AtomicBoolean(false)));
+ assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false)));
+ assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false)));
- assertSearchResult("überprüfen", "überprüfe", deIndex.findLongestSubstring("überprüfeBLEH", new AtomicBoolean(false)));
+ assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new AtomicBoolean(false)));
// Check that search in lowercase works.
- assertSearchResult("Alibi", "Alibi", deIndex.findLongestSubstring("alib", new AtomicBoolean(false)));
- assertTrue(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).success);
- System.out.println(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).toString());
+ assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false)));
+ System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString());
raf.close();
}
private void assertSearchResult(final String insertionPoint, final String longestPrefix,
- final SearchResult actual) {
- assertEquals(insertionPoint, actual.insertionPoint.token);
- assertEquals(longestPrefix, actual.longestPrefix.token);
+ final IndexEntry actual) {
+ assertEquals(insertionPoint, actual.token);
}
public void testGermanTokenRows() throws IOException {
- final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r");
+ final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r");
final Dictionary dict = new Dictionary(raf);
final Index deIndex = dict.indices.get(0);
}
public void testGermanSort() {
- assertEquals("aüÄÄ", Language.de.textNorm("aueAeAE", false));
+ final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD);
+ assertEquals("aüääss", normalizer.transform("aueAeAEß"));
final List<String> words = Arrays.asList(
"er-ben",
"erben",
"Großformats",
"Großpoo",
"Großpoos",
+ "Hörvermögen",
"Hörweite",
"hos",
"Höschen",
"Hostel",
"hulle",
"Hulle",
- "hülle",
"huelle",
- "Hülle",
"Huelle",
+ "hülle",
+ "Hülle",
+ "Huellen",
+ "Hüllen",
"Hum"
);
- assertEquals(0, Language.de.sortComparator.compare("hülle", "huelle"));
- assertEquals(0, Language.de.sortComparator.compare("huelle", "hülle"));
+ final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator());
+ assertEquals(1, comparator.compare("hülle", "huelle"));
+ assertEquals(-1, comparator.compare("huelle", "hülle"));
- assertEquals(-1, Language.de.sortComparator.compare("hülle", "Hülle"));
- assertEquals(0, Language.de.findComparator.compare("hülle", "Hülle"));
- assertEquals(-1, Language.de.findComparator.compare("hulle", "Hülle"));
+ assertEquals(-1, comparator.compare("hülle", "Hülle"));
+
+ assertEquals("hülle", normalizer.transform("Hülle"));
+ assertEquals("hulle", normalizer.transform("Hulle"));
- for (final String s : words) {
- System.out.println(s + "\t" + Language.de.textNorm(s, false));
- }
final List<String> sorted = new ArrayList<String>(words);
// Collections.shuffle(shuffled, new Random(0));
- Collections.sort(sorted, Language.de.sortComparator);
+ Collections.sort(sorted, comparator);
System.out.println(sorted.toString());
for (int i = 0; i < words.size(); ++i) {
System.out.println(words.get(i) + "\t" + sorted.get(i));
}
}
- @SuppressWarnings("unchecked")
public void testEnglishSort() {
+ final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD);
final List<String> words = Arrays.asList(
"pre-print",
"preprocess");
final List<String> sorted = new ArrayList<String>(words);
- Collections.sort(sorted, Language.en.getSortCollator());
+ final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator());
+ Collections.sort(sorted, comparator);
for (int i = 0; i < words.size(); ++i) {
if (i > 0) {
- assertTrue(Language.en.getSortCollator().compare(words.get(i-1), words.get(i)) < 0);
+ assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0);
}
System.out.println(words.get(i) + "\t" + sorted.get(i));
assertEquals(words.get(i), sorted.get(i));
}
- assertTrue(Language.en.getSortCollator().compare("pre-print", "preppy") < 0);
+ assertTrue(comparator.compare("pre-print", "preppy") < 0);
}
}
public void testTextNorm() {
- assertEquals("hoschen", "Höschen".toLowerCase(Language.de.locale));
+ //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD);
+ final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD);
+ assertEquals("hoschen", transliterator.transliterate("Höschen"));
+ assertEquals("hoschen", transliterator.transliterate("Hoeschen"));
+ assertEquals("grosspoo", transliterator.transliterate("Großpoo"));
+
+ assertEquals("kyanpasu", transliterator.transliterate("キャンパス"));
+ assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος"));
+ assertEquals("biologiceskom", transliterator.transliterate("биологическом"));
}
public void testChemnitz() throws IOException {
- final RandomAccessFile raf = new RandomAccessFile("testdata/de-en_chemnitz.dict", "r");
+ final RandomAccessFile raf = new RandomAccessFile("dictOutputs/de-en_chemnitz.quickdic", "r");
final Dictionary dict = new Dictionary(raf);
final Index deIndex = dict.indices.get(0);
- //assertSearchResult("Höschen", "Hos", deIndex.findLongestSubstring("Hos", new AtomicBoolean(false)));
- //assertSearchResult("Höschen", "hos", deIndex.findLongestSubstring("hos", new AtomicBoolean(false)));
-
+ assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false)));
+ assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false)));
raf.close();
}
import com.hughes.util.IndexedObject;
-class EntryData extends IndexedObject {
- EntryData(final int index, final Entry entry) {
+public class EntryData extends IndexedObject {
+ public EntryData(final int index, final Entry entry) {
super(index);
this.entry = entry;
}
public class IndexBuilder {
final DictionaryBuilder dictionaryBuilder;
- final Index index;
+ public final Index index;
final SortedMap<String, TokenData> tokenToData;
- @SuppressWarnings("unchecked")
- IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final boolean swapPairEntries) {
+ IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) {
this.dictionaryBuilder = dictionaryBuilder;
- index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, swapPairEntries);
- tokenToData = new TreeMap<String, TokenData>(language.getSortCollator());
+ index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries);
+ tokenToData = new TreeMap<String, TokenData>(new NormalizeComparator(index.normalizer, language.collator));
}
public void build() {
--- /dev/null
+package com.hughes.android.dictionary.engine;
+
+import java.util.Comparator;
+
+import com.ibm.icu.text.Transliterator;
+
+public class NormalizeComparator implements Comparator<String> {
+
+ final Transliterator normalizer;
+ final Comparator<Object> comparator;
+
+ public NormalizeComparator(final Transliterator normalizer,
+ final Comparator<Object> comparator) {
+ this.normalizer = normalizer;
+ this.comparator = comparator;
+ }
+
+ @Override
+ public int compare(final String s1, final String s2) {
+ final String n1 = normalizer.transform(s1);
+ final String n2 = normalizer.transform(s2);
+ final int cn = comparator.compare(n1, n2);
+ if (cn != 0) {
+ return cn;
+ }
+ return comparator.compare(s1, s2);
+ }
+
+}
-package com.hughes.android.dictionary.engine;
+package com.hughes.android.dictionary.parser;
import java.io.BufferedReader;
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.EntryData;
+import com.hughes.android.dictionary.engine.EntryTypeName;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.Language;
+import com.hughes.android.dictionary.engine.PairEntry;
import com.hughes.android.dictionary.engine.PairEntry.Pair;
public class DictFileParser {
static final Logger logger = Logger.getLogger(DictFileParser.class.getName());
// Dictcc
- static final Pattern TAB = Pattern.compile("\\t");
+ public static final Pattern TAB = Pattern.compile("\\t");
// Chemnitz
- static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
- static final Pattern PIPE = Pattern.compile("\\|");
+ public static final Pattern DOUBLE_COLON = Pattern.compile(" :: ");
+ public static final Pattern PIPE = Pattern.compile("\\|");
static final Pattern SPACES = Pattern.compile("\\s+");
static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}");
--- /dev/null
+package com.hughes.android.dictionary.parser;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+
+public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+
+ final DictionaryBuilder dict;
+
+ final IndexBuilder[] indexBuilders;
+ final Pattern[] langPatterns;
+
+ StringBuilder titleBuilder;
+ StringBuilder textBuilder;
+ StringBuilder currentBuilder = null;
+
+ public EnWiktionaryXmlParser(final DictionaryBuilder builder, final Pattern[] langPatterns, final int enIndexBuilder) {
+ assert langPatterns.length == 2;
+ this.dict = builder;
+ this.indexBuilders = dict.indexBuilders.toArray(new IndexBuilder[0]);
+ this.langPatterns = langPatterns;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes attributes) {
+ currentBuilder = null;
+ if ("page".equals(qName)) {
+ titleBuilder = new StringBuilder();
+
+ // Start with "\n" to better match certain strings.
+ textBuilder = new StringBuilder("\n");
+ } else if ("title".equals(qName)) {
+ currentBuilder = titleBuilder;
+ } else if ("text".equals(qName)) {
+ currentBuilder = textBuilder;
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (currentBuilder != null) {
+ currentBuilder.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ currentBuilder = null;
+ if ("page".equals(qName)) {
+ endPage();
+ }
+ }
+
+
+ public void parse(final File file) throws ParserConfigurationException,
+ SAXException, IOException {
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+ parser.parse(file, this);
+ }
+
+ private void endPage() {
+ title = titleBuilder.toString();
+ currentDepth = 0;
+ words.clear();
+ WikiParser.parse(textBuilder.toString(), this);
+ }
+
+ /**
+ * Two things can happen:
+ *
+ * We can be in a ==German== section. There we will see English definitions.
+ * Each POS should get its own QuickDic entry. Pretty much everything goes
+ * in.
+ *
+ * Or we can be in an ==English== section with English definitions
+ * and maybe see translations for languages we care about.
+ *
+ * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
+ * into separate QuickDic entries, but that's tricky--how do we know when we
+ * found a subsection? Just ignore anything containing pronunciation and
+ * etymology?
+ *
+ * How do we decide when to seal the deal on an entry?
+ *
+ * Would be nice if the parser told us about leaving sections....
+ *
+ *
+ */
+
+ String title;
+ int currentDepth;
+ final List<WikiWord> words = new ArrayList<WikiWord>();
+ WikiWord currentWord;
+ WikiWord.PartOfSpeech currentPartOfSpeech;
+ WikiWord.TranslationSection currentTranslationSection;
+
+ StringBuilder wikiBuilder = null;
+
+ // ------------------------------------------------------------------------
+
+ @Override
+ public void onWikiLink(String[] args) {
+ if (wikiBuilder != null) {
+ wikiBuilder.append(args[args.length - 1]);
+ }
+ }
+
+ @Override
+ public void onTemplate(String[][] args) {
+ final String name = args[0][1];
+ if (name == "") {
+
+ } else {
+ //System.out.println("Unhandled template: " + name);
+ }
+ }
+
+ @Override
+ public void onText(String text) {
+ if (wikiBuilder != null) {
+ wikiBuilder.append(text);
+ return;
+ }
+ }
+
+ @Override
+ public void onHeadingStart(int depth) {
+ wikiBuilder = new StringBuilder();
+ currentDepth = depth;
+ if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
+ currentPartOfSpeech = null;
+ }
+ if (currentWord != null && depth <= currentWord.depth) {
+ currentWord = null;
+ }
+ }
+
+ final Pattern partOfSpeechHeader = Pattern.compile(
+ "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+ "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+ "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+ "Ligature|Idiom|Phrase|" +
+ // These are @deprecated:
+ "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+ "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
+
+ @Override
+ public void onHeadingEnd(int depth) {
+ final String name = wikiBuilder.toString().trim();
+ wikiBuilder = null;
+
+ final boolean lang1 = langPatterns[0].matcher(name).matches();
+ final boolean lang2 = langPatterns[1].matcher(name).matches();
+ if (name.equalsIgnoreCase("English") || lang1 || lang2) {
+ currentWord = new WikiWord(depth);
+ currentWord.language = name;
+ currentWord.isLang1 = lang1;
+ currentWord.isLang2 = lang2;
+ words.add(currentWord);
+ return;
+ }
+
+ if (currentWord == null) {
+ return;
+ }
+
+ if (partOfSpeechHeader.matcher(name).matches()) {
+ currentPartOfSpeech = new WikiWord.PartOfSpeech(depth);
+ currentWord.partsOfSpeech.add(currentPartOfSpeech);
+ return;
+ }
+
+ if (name.equals("Translations")) {
+ if (currentWord == null ||
+ !currentWord.language.equals("English") ||
+ currentPartOfSpeech == null) {
+ System.out.println("Unexpected Translations section: " + title);
+ return;
+ }
+ currentTranslationSection = new WikiWord.TranslationSection();
+ currentPartOfSpeech.translationSections.add(currentTranslationSection);
+ } else {
+ currentTranslationSection = null;
+ }
+ }
+
+ @Override
+ public void onListItemStart(String header, int[] section) {
+ wikiBuilder = new StringBuilder();
+ }
+
+
+ @Override
+ public void onListItemEnd(String header, int[] section) {
+ final String item = wikiBuilder.toString();
+ wikiBuilder = null;
+
+ if (currentTranslationSection != null) {
+ final int colonPos = item.indexOf(':');
+ if (colonPos == -1) {
+ System.out.println("Invalid translation: " + item);
+ return;
+ }
+ final String lang = item.substring(0, colonPos);
+ final String trans = item.substring(colonPos + 1);
+ for (int i = 0; i < 2; ++i) {
+ if (langPatterns[i].matcher(lang).find()) {
+ currentTranslationSection.translations.get(i).add(trans);
+ }
+ }
+ }
+ }
+
+ @Override
+ public void onNewLine() {
+ }
+
+ @Override
+ public void onNewParagraph() {
+ }
+
+ // ----------------------------------------------------------------------
+
+ public void onTransTrop(final String[][] args) {
+ currentTranslationSection = new WikiWord.TranslationSection();
+ currentPartOfSpeech.translationSections.add(currentTranslationSection);
+
+ if (args.length > 1) {
+ currentTranslationSection.sense = args[1][1];
+ }
+ }
+
+
+ // ----------------------------------------------------------------------
+
+ @Override
+ public void onComment(String text) {
+ }
+
+ @Override
+ public void onFormatBold(boolean boldOn) {
+ }
+
+ @Override
+ public void onFormatItalic(boolean italicOn) {
+ }
+
+ @Override
+ public void onUnterminated(String start, String rest) {
+ throw new RuntimeException(rest);
+ }
+ @Override
+ public void onInvalidHeaderEnd(String rest) {
+ throw new RuntimeException(rest);
+ }
+
+}
--- /dev/null
package com.hughes.android.dictionary.parser;


/**
 * Event-sink interface for {@link WikiParser}: the parser walks raw wikitext
 * and invokes one callback per recognized construct.
 */
public interface WikiCallback {

  /** Body of an HTML comment (text between &lt;!-- and --&gt;). */
  void onComment(final String text);

  /** Bold formatting toggled; boldOn is the new state. */
  void onFormatBold(final boolean boldOn);
  /** Italic formatting toggled; italicOn is the new state. */
  void onFormatItalic(final boolean italicOn);

  /** A [[...]] wiki link; args are the pipe-separated pieces. */
  void onWikiLink(final String[] args);

  /** A {{...}} template; each element is a {name, value} pair (name may be null). */
  void onTemplate(final String[][] args);

  // Will never contain a newline unless it's in a <pre>
  void onText(final String text);

  // Only at start of line.
  void onHeadingStart(final int depth);
  void onHeadingEnd(final int depth);


  void onNewLine();
  void onNewParagraph();

  // header is the leading run of *#;: characters.
  // NOTE(review): WikiParser always passes section as null -- confirm intent.
  void onListItemStart(final String header, final int[] section);
  void onListItemEnd(final String header, final int[] section);

  // Errors
  void onUnterminated(final String start, String rest);
  void onInvalidHeaderEnd(String rest);

}
--- /dev/null
+package com.hughes.android.dictionary.parser;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class WikiParser {
+
+ private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|^[*#;:]+|^(==+)\\s*|(==+)\\s*$|<!--|<pre>", Pattern.MULTILINE);
+ private static final Pattern listStart = Pattern.compile("^[*#;:]");
+ private static final Pattern pipeSplit = Pattern.compile("\\s*\\|\\s*");
+ private static final Pattern whitespace = Pattern.compile("\\s+");
+
+ static void parse(final String wikiText, final WikiCallback callback) {
+
+ boolean boldOn = false;
+ boolean italicOn = false;
+ int insideHeaderDepth = -1;
+ String lastListItem = null;
+
+ String rest = wikiText;
+ while (rest.length() > 0) {
+ final Matcher matcher = markup.matcher(rest);
+ if (matcher.find()) {
+ final int nextMarkupPos = matcher.start();
+ if (nextMarkupPos != 0) {
+ String text = rest.substring(0, nextMarkupPos);
+ whitespace.matcher(text).replaceAll(" ");
+ callback.onText(text);
+ }
+ rest = rest.substring(nextMarkupPos);
+
+ if (rest.startsWith("\n")) {
+ if (insideHeaderDepth != -1) {
+ throw new RuntimeException("barf");
+ }
+ if (lastListItem != null) {
+ callback.onListItemEnd(lastListItem, null);
+ }
+ if (!listStart.matcher(rest.substring(1)).matches()) {
+ lastListItem = null;
+ }
+ if (rest.startsWith("\n\n")) {
+ // TODO(thadh): eat all the newlines.
+ callback.onNewParagraph();
+ rest = rest.substring(2);
+ } else {
+ callback.onNewLine();
+ rest = rest.substring(1);
+ }
+ } else if (rest.startsWith("'''")) {
+ boldOn = !boldOn;
+ callback.onFormatBold(boldOn);
+ rest = rest.substring(3);
+ } else if (rest.startsWith("''")) {
+ italicOn = !italicOn;
+ callback.onFormatItalic(italicOn);
+ rest = rest.substring(2);
+ } else if (rest.startsWith("{{")) {
+ int end = rest.indexOf("}}");
+ if (end == -1) {
+ callback.onUnterminated("{{", rest);
+ return;
+ }
+ final String template = rest.substring(2, end).trim();
+ final String[] templateArray = pipeSplit.split(template);
+ final String[][] templateArgs = new String[templateArray.length][];
+ for (int i = 0; i < templateArray.length; ++i) {
+ int equalPos = templateArray[i].indexOf('=');
+ if (equalPos == -1) {
+ templateArgs[i] = new String[] { null, templateArray[i] };
+ } else {
+ templateArgs[i] = new String[] { templateArray[i].substring(0, equalPos), templateArray[i].substring(equalPos + 1) };
+ }
+ }
+ callback.onTemplate(templateArgs);
+ rest = rest.substring(end + 2);
+ } else if (rest.startsWith("[[")) {
+ int end = rest.indexOf("]]");
+ if (end == -1) {
+ callback.onUnterminated("[[", rest);
+ return;
+ }
+ final String wikiLink = rest.substring(2, end);
+ final String[] args = pipeSplit.split(wikiLink);
+ callback.onWikiLink(args);
+ rest = rest.substring(end + 2);
+ } else if (rest.startsWith("=")) {
+ final String match = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
+ if (insideHeaderDepth == -1) {
+ insideHeaderDepth = match.length();
+ callback.onHeadingStart(insideHeaderDepth);
+ } else {
+ if (match.length() != insideHeaderDepth) {
+ callback.onInvalidHeaderEnd(rest);
+ return;
+ }
+ callback.onHeadingEnd(insideHeaderDepth);
+ insideHeaderDepth = -1;
+ }
+ rest = rest.substring(match.length());
+ } else if (rest.startsWith("*") || rest.startsWith("#") || rest.startsWith(";") || rest.startsWith(":")) {
+ lastListItem = matcher.group();
+ callback.onListItemStart(lastListItem, null);
+ rest = rest.substring(lastListItem.length());
+ } else if (rest.startsWith("<!--")) {
+ int end = rest.indexOf("-->");
+ if (end == -1) {
+ callback.onUnterminated("<!--", rest);
+ return;
+ }
+ callback.onComment(rest.substring(4, end));
+ rest = rest.substring(end + 3);
+ } else if (rest.startsWith("<pre>")) {
+ int end = rest.indexOf("</pre>");
+ if (end == -1) {
+ callback.onUnterminated("<pre>", rest);
+ return;
+ }
+ callback.onText(rest.substring(5, end));
+ rest = rest.substring(end + 6);
+ } else {
+ throw new RuntimeException("barf!");
+ }
+ } // matcher.find()
+ }
+ }
+
+}
--- /dev/null
package com.hughes.android.dictionary.parser;

import junit.framework.TestCase;

/**
 * Unit test for {@link WikiParser}: feeds a small wikitext document through
 * the parser and asserts the exact event stream, serialized to a string by
 * {@link PrintWikiCallback}.
 */
public class WikiParserTest extends TestCase {

  public void testSimple() {
    // Input exercises: bold/italic quotes, single- and multi-line comments,
    // nested list items, wiki links, a multi-line template, and headings
    // (including a template inside a heading).
    final String text =
      "Hi" + "\n" +
      "Hello ''thad'' you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
      "hi <!--" + "\n" +
      "multi-line" + "\n" +
      "# comment -->" + "\n" +
      "" + "\n" +
      "# li" + "\n" +
      "# li2" + "\n" +
      "## li2.2" + "\n" +
      "Hi again." + "\n" +
      "here's [[some blah|some]] wikitext." + "\n" +
      "here's a {{template|blah=2|blah2=3|" + "\n" +
      "blah3=3}} and some more text." + "\n" +
      "== Header 2 ==" + "\n" +
      "=== {{header-template}} ===" + "\n";

    // Golden rendering of the callback stream for the input above.
    final String expected = "Hi Hello <i>thad</i> you're \n" +
      "comment: not \n" +
      " <b>pretty</b> cool <b><i>over</b></i> there. hi \n" +
      "comment:\n" +
      "multi-line\n" +
      "# comment \n" +
      "\n" +
      "\n" +
      "# li\n" +
      " # li2\n" +
      " ## li2.2\n" +
      " Hi again. here's [[some]] wikitext. here's a \n" +
      "template:template\n" +
      " and some more text. \n" +
      "HEADER Header 2 \n" +
      " \n" +
      "HEADER \n" +
      "template:header-template\n" +
      " \n" +
      " ";
    final PrintWikiCallback callback = new PrintWikiCallback();
    WikiParser.parse(text, callback);
    assertEquals(expected, callback.builder.toString());

  }


  /**
   * Callback that serializes each parser event into a single string so the
   * full event stream can be compared against a golden value.
   */
  static final class PrintWikiCallback implements WikiCallback {
    final StringBuilder builder = new StringBuilder();

    @Override
    public void onComment(String text) {
      builder.append("\ncomment:").append(text).append("\n");
    }

    @Override
    public void onFormatBold(boolean boldOn) {
      builder.append(boldOn ? "<b>" : "</b>");
    }

    @Override
    public void onFormatItalic(boolean italicOn) {
      builder.append(italicOn ? "<i>" : "</i>");
    }

    @Override
    public void onWikiLink(String[] args) {
      // Renders only the link's display text (last pipe-separated piece).
      builder.append("[[").append(args[args.length - 1]).append("]]");
    }

    @Override
    public void onTemplate(String[][] args) {
      // Renders only the template's name (value of the first argument).
      builder.append("\ntemplate:").append(args[0][0]).append("\n");
    }

    @Override
    public void onText(String text) {
      builder.append(text);
    }

    @Override
    public void onHeadingStart(int depth) {
      // One space per heading level after the HEADER marker.
      builder.append("\nHEADER");
      for (int i = 0; i < depth; ++i) {
        builder.append(" ");
      }
    }

    @Override
    public void onHeadingEnd(int depth) {
      builder.append("\n");
    }

    @Override
    public void onNewLine() {
      builder.append(" ");
    }

    @Override
    public void onNewParagraph() {
      builder.append("\n\n");
    }

    @Override
    public void onListItemStart(String header, int[] section) {
      builder.append(header);
    }

    @Override
    public void onListItemEnd(String header, int[] section) {
      builder.append("\n");
    }

    @Override
    public void onUnterminated(String start, String rest) {
      throw new RuntimeException("bad");
    }

    @Override
    public void onInvalidHeaderEnd(String rest) {
      throw new RuntimeException("bad");
    }

  }



}
--- /dev/null
package com.hughes.android.dictionary.parser;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Mutable model of one Wiktionary headword as accumulated during parsing:
 * a language section containing parts of speech, which in turn contain
 * meanings and translation sections.
 */
public class WikiWord {
  // Heading depth at which this word's language section started.
  final int depth;

  String language;
  String pronunciation;

  // Whether this word's language heading matched langPatterns[0] / [1]
  // (see EnWiktionaryXmlParser.onHeadingEnd).
  boolean isLang1;
  boolean isLang2;

  final List<PartOfSpeech> partsOfSpeech = new ArrayList<WikiWord.PartOfSpeech>();

  // Unrecognized section name -> that section's lines, in encounter order.
  final Map<String, List<String>> otherSections = new LinkedHashMap<String, List<String>>();

  public WikiWord(int depth) {
    this.depth = depth;
  }

  /** One part-of-speech subsection (Noun, Verb, ...). */
  static class PartOfSpeech {
    // Heading depth at which this POS subsection started.
    final int depth;

    final List<Meaning> meaning = new ArrayList<WikiWord.Meaning>();

    final List<TranslationSection> translationSections = new ArrayList<WikiWord.TranslationSection>();

    final Map<String, String> otherSections = new LinkedHashMap<String, String>();

    public PartOfSpeech(final int depth) {
      this.depth = depth;
    }
  }

  /** A "Translations" block: an optional sense plus one list per language. */
  static class TranslationSection {
    String sense;
    // Indices 0 and 1 correspond to the two language patterns; both inner
    // lists are created eagerly by the instance initializer below.
    List<List<String>> translations = new ArrayList<List<String>>();
    {
      translations.add(new ArrayList<String>());
      translations.add(new ArrayList<String>());
    }
  }

  /** A single definition line with an optional usage example. */
  static class Meaning {
    String meaning;
    Example example;
  }

  /** A usage example and its English rendering. */
  static class Example {
    String example;
    String exampleInEnglish;
  }

}