From: Thad Hughes Date: Fri, 5 Nov 2010 19:27:18 +0000 (-0700) Subject: go X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=a91fcd717450fa7cbbc2b260c265d0a66a232dc6 go --- diff --git a/src/com/hughes/android/dictionary/WiktionaryXmlParser.java b/src/com/hughes/android/dictionary/WiktionaryXmlParser.java deleted file mode 100644 index 31d8c92..0000000 --- a/src/com/hughes/android/dictionary/WiktionaryXmlParser.java +++ /dev/null @@ -1,231 +0,0 @@ -package com.hughes.android.dictionary; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Pattern; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -import com.hughes.android.dictionary.engine.Dictionary; -import com.hughes.util.MapUtil; -import com.hughes.util.StringUtil; - -public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler { - - final Dictionary dict; - - StringBuilder titleBuilder; - StringBuilder textBuilder; - StringBuilder currentBuilder = null; - - public WiktionaryXmlParser(final Dictionary dict) { - this.dict = dict; - } - - @Override - public void startElement(String uri, String localName, String qName, - Attributes attributes) { - currentBuilder = null; - if ("page".equals(qName)) { - titleBuilder = new StringBuilder(); - textBuilder = new StringBuilder(); - } else if ("title".equals(qName)) { - currentBuilder = titleBuilder; - } else if ("text".equals(qName)) { - currentBuilder = textBuilder; - } - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - if (currentBuilder != null) { - currentBuilder.append(ch, start, 
length); - } - } - - @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - currentBuilder = null; - if ("page".equals(qName)) { - endPage(); - } - } - - private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL); - - // MULTILINE for ^ - private static final Pattern SECTION_HEADER = Pattern - .compile("=== *\\{\\{Wortart\\|"); - - private static final Pattern WORTART_DELIM = Pattern.compile("===", - Pattern.LITERAL); - private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}"); - - private static final Pattern WIKI_QUOTE = Pattern.compile("''", - Pattern.LITERAL); - private static final Pattern WIKI_DOUBLE_BRACE = Pattern - .compile("\\{\\{([^}]+)\\}\\}"); - private static final Pattern WIKI_DOUBLE_BRACKET = Pattern - .compile("\\[\\[([^\\]]+)\\]\\]"); - private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE); - - enum Field { - Wortart("Wortart", null), - - Aussprache("Aussprache", null), - - Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")), - - Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")), - - Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")), - - Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")), - - Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")), - - Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")), - - Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")), - - Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")), - - CharakteristischeWortkombinationen("Charakteristische Wortkombinationen", - Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")), - - AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern - .compile("\\{\\{Abgeleitete Begriffe\\}\\}")), - - Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")), - 
- Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")), - - ; - - final String name; - final Pattern listPattern; - - Field(final String name, final Pattern listPattern) { - this.name = name; - this.listPattern = listPattern; - } - } - - private static final Pattern WORTART = Pattern - .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}"); - private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:", - Pattern.LITERAL); - - private final Map errorCounts = new TreeMap(); - - private void endPage() { - - StringBuilder text = textBuilder; - text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\"")); - text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1")); - - // Remove comments. - StringUtil.removeAll(text, Pattern.compile("", Pattern.LITERAL)); - - String sectionString; - while ((sectionString = StringUtil.remove(text, SECTION_HEADER, - SECTION_HEADER, false)) != null) { - final StringBuilder section = new StringBuilder(sectionString); - - String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM, - true); - if (wortart.contains("\n") || !wortart.contains("eutsch")) { - MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart, - AtomicInteger.class).incrementAndGet(); - continue; - } - - final LinkedHashMap> fieldToValue = new LinkedHashMap>(); - - wortart = wortart.replaceAll("===", ""); - wortart = WORTART.matcher(wortart).replaceAll("$1"); - wortart = GENDER.matcher(wortart).replaceAll("{$1}"); - wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1"); - wortart = wortart.replaceAll("Wortart\\|", ""); - wortart = wortart.trim(); - fieldToValue.put(Field.Wortart, Collections.singletonList(wortart)); - - String aussprache = StringUtil - .remove(section, AUSSPRACHE, NEWLINE, false); - if (aussprache != null) { - aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst(""); - aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1"); - aussprache = 
aussprache.replaceAll("Lautschrift\\|ˈ?", ""); - aussprache = aussprache.trim(); - fieldToValue.put(Field.Aussprache, Collections - .singletonList(aussprache)); - } - - for (final Field field : Field.values()) { - if (field.listPattern != null) { - fieldToValue.put(field, extractList(section, field.listPattern)); - } - } - - System.out.println(titleBuilder); - for (final Field field : Field.values()) { - if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) { - fieldToValue.remove(field); - } else { - if (field.name != null) { -// System.out.println(field.name); -// for (final String line : fieldToValue.get(field)) { -// System.out.println(" " + line); -// } - } - } - } -// System.out.println("WHAT'S LEFT:"); -// System.out.println(section); -// System.out.println("------------------------------------------------"); - - } - - } - - private List extractList(final StringBuilder section, - final Pattern start) { - final List result = new ArrayList(); - final String linesString = StringUtil.remove(section, start, - WIKI_NEW_SECTION, false); - if (linesString != null) { - String[] lines = linesString.split("\n"); - for (int i = 1; i < lines.length; ++i) { - String bedeutung = lines[i]; - bedeutung = bedeutung.replaceFirst("^:+", ""); - bedeutung = bedeutung.trim(); - if (bedeutung.length() > 0) { - result.add(bedeutung); - } - } - } - return result; - } - - void parse(final File file) throws ParserConfigurationException, - SAXException, IOException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); - parser.parse(file, this); - System.out.println(errorCounts); - } - -} diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index 6bb1115..7ea0d91 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -8,7 +8,14 @@ import java.nio.charset.Charset; 
import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; +import javax.xml.parsers.ParserConfigurationException; + +import org.xml.sax.SAXException; + +import com.hughes.android.dictionary.parser.DictFileParser; +import com.hughes.android.dictionary.parser.EnWiktionaryXmlParser; import com.hughes.util.Args; import com.hughes.util.FileUtil; @@ -37,14 +44,13 @@ import com.hughes.util.FileUtil; public class DictionaryBuilder { - final Dictionary dictionary; - - final List indexBuilders = new ArrayList(); + public final Dictionary dictionary; + public final List indexBuilders = new ArrayList(); - public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1) { + public DictionaryBuilder(final String dictInfo, final Language lang0, final Language lang1, final String normalizerRules1, final String normalizerRules2) { dictionary = new Dictionary(dictInfo); - indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, false)); - indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, true)); + indexBuilders.add(new IndexBuilder(this, lang0.getSymbol(), lang0.getSymbol() + "->" + lang1.getSymbol(), lang0, normalizerRules1, false)); + indexBuilders.add(new IndexBuilder(this, lang1.getSymbol(), lang1.getSymbol() + "->" + lang0.getSymbol(), lang1, normalizerRules2, true)); } void build() { @@ -54,7 +60,7 @@ public class DictionaryBuilder { } } - public static void main(final String[] args) throws IOException { + public static void main(final String[] args) throws IOException, ParserConfigurationException, SAXException { final Map keyValueArgs = Args.keyValueArgs(args); final Language lang1 = Language.lookup(keyValueArgs.remove("lang1")); @@ -63,6 +69,15 @@ public class DictionaryBuilder { fatalError("--lang1= and --lang2= must both be specified."); } + String normalizerRules1 = 
keyValueArgs.remove("normalizerRules1"); + String normalizerRules2 = keyValueArgs.remove("normalizerRules2"); + if (normalizerRules1 == null) { + normalizerRules1 = lang1.getDefaultNormalizerRules(); + } + if (normalizerRules2 == null) { + normalizerRules2 = lang2.getDefaultNormalizerRules(); + } + final String dictOutFilename = keyValueArgs.remove("dictOut"); if (dictOutFilename == null) { fatalError("--dictOut= must be specified."); @@ -80,10 +95,12 @@ public class DictionaryBuilder { System.out.println("lang1=" + lang1); System.out.println("lang2=" + lang2); + System.out.println("normalizerRules1=" + normalizerRules1); + System.out.println("normalizerRules2=" + normalizerRules2); System.out.println("dictInfo=" + dictInfo); System.out.println("dictOut=" + dictOutFilename); - final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2); + final DictionaryBuilder dictionaryBuilder = new DictionaryBuilder(dictInfo, lang1, lang2, normalizerRules1, normalizerRules2); for (int i = 0; i < 100; ++i) { final String prefix = "input" + i; @@ -105,9 +122,15 @@ public class DictionaryBuilder { new DictFileParser(charset, false, DictFileParser.TAB, null, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file); } else if ("chemnitz".equals(inputFormat)) { new DictFileParser(charset, false, DictFileParser.DOUBLE_COLON, DictFileParser.PIPE, dictionaryBuilder, dictionaryBuilder.indexBuilders.toArray(new IndexBuilder[0]), null).parseFile(file); - } else if ("wiktionary".equals(inputFormat)) { - throw new RuntimeException(); -// new WiktionaryXmlParser(dict).parse(file); + } else if ("enwiktionary".equals(inputFormat)) { + final Pattern[] translationPatterns = new Pattern[2]; + translationPatterns[0] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern1")); + translationPatterns[1] = Pattern.compile(keyValueArgs.remove(prefix + "TranslationPattern2")); + final int enIndex = 
Integer.parseInt(keyValueArgs.remove(prefix + "EnIndex")) - 1; + if (enIndex < 0 || enIndex >= 2) { + fatalError("Must be 1 or 2: " + prefix + "EnIndex"); + } + new EnWiktionaryXmlParser(dictionaryBuilder, translationPatterns, enIndex).parse(file); } else { fatalError("Invalid or missing input format: " + inputFormat); } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index e68bf5e..7c8d232 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -11,8 +11,8 @@ import junit.framework.TestCase; public class DictionaryBuilderTest extends TestCase { - public void testGermanCombined() throws IOException { - final File result = new File("testdata/de_en.dict"); + public void testGermanCombined() throws Exception { + final File result = new File("testdata/de-en.quickdic"); System.out.println("Writing to: " + result); DictionaryBuilder.main(new String[] { "--dictOut=" + result.getAbsolutePath(), @@ -20,30 +20,37 @@ public class DictionaryBuilderTest extends TestCase { "--lang2=EN", "--dictInfo=@testdata/de-en_dictInfo.txt", - "--input1=testdata/de-en_chemnitz_100", - "--input1Name=dictcc", - "--input1Charset=UTF8", - "--input1Format=chemnitz", +// "--input1=testdata/de-en_chemnitz_100", +// "--input1Name=dictcc", +// "--input1Charset=UTF8", +// "--input1Format=chemnitz", +// +// "--input2=testdata/de-en_dictcc_100", +// "--input2Name=dictcc", +// "--input2Charset=UTF8", +// "--input2Format=dictcc", + + "--input3=testdata/enwiktionary_small.xml", + "--input3Name=enwiktionary", + "--input3Format=enwiktionary", + "--input3TranslationPattern1=[Gg]erman", + "--input3TranslationPattern2=[Ee]glish", + "--input3EnIndex=2", - "--input2=testdata/de-en_dictcc_100", - "--input2Name=dictcc", - "--input2Charset=UTF8", - "--input2Format=dictcc", - "--print=testdata/de-en.test", }); // 
Check it once: - assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test"); + assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test"); // Check it again. final Dictionary dict = new Dictionary(new RandomAccessFile(result.getAbsolutePath(), "r")); - final PrintStream out = new PrintStream(new File("testdata/de_en.test")); + final PrintStream out = new PrintStream(new File("testdata/de-en.test")); dict.print(out); out.close(); - assertFilesEqual("testdata/de_en.golden", "testdata/de_en.test"); + assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test"); } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder_DE.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder_DE.java new file mode 100644 index 0000000..878a24f --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder_DE.java @@ -0,0 +1,40 @@ +package com.hughes.android.dictionary.engine; + +import junit.framework.TestCase; + +public class DictionaryBuilder_DE extends TestCase { + + public static void main(final String[] args) throws Exception { + + DictionaryBuilder.main(new String[] { + "--dictOut=dictOutputs/de-en_chemnitz.quickdic", + "--lang1=DE", + "--lang2=EN", + "--dictInfo=@dictInputs/de-en_chemnitz.info", + + "--input1=dictInputs/de-en_chemnitz.txt", + "--input1Name=dictcc", + "--input1Charset=UTF8", + "--input1Format=chemnitz", + }); + + DictionaryBuilder.main(new String[] { + "--dictOut=dictOutputs/de-en_all.quickdic", + "--lang1=DE", + "--lang2=EN", + "--dictInfo=@dictInputs/de-en_all.info", + + "--input1=dictInputs/de-en_chemnitz.txt", + "--input1Name=dictcc", + "--input1Charset=UTF8", + "--input1Format=chemnitz", + + "--input2=dictInputs/de-en_dictcc.txt", + "--input2Name=dictcc", + "--input2Charset=UTF8", + "--input2Format=dictcc", + }); + + } + +} diff --git a/src/com/hughes/android/dictionary/engine/DictionaryTest.java b/src/com/hughes/android/dictionary/engine/DictionaryTest.java index 4b45348..388e71d 100644 --- 
a/src/com/hughes/android/dictionary/engine/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryTest.java @@ -10,13 +10,14 @@ import java.util.concurrent.atomic.AtomicBoolean; import junit.framework.TestCase; -import com.hughes.android.dictionary.engine.Index.SearchResult; +import com.hughes.android.dictionary.engine.Index.IndexEntry; +import com.ibm.icu.text.Transliterator; public class DictionaryTest extends TestCase { public void testGermanMetadata() throws IOException { - final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r"); + final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r"); final Dictionary dict = new Dictionary(raf); final Index deIndex = dict.indices.get(0); @@ -33,55 +34,51 @@ public class DictionaryTest extends TestCase { for (final Index.IndexEntry indexEntry : deIndex.sortedIndexEntries) { System.out.println("testing: " + indexEntry.token); - final Index.SearchResult searchResult = deIndex.findLongestSubstring(indexEntry.token, new AtomicBoolean( + final IndexEntry searchResult = deIndex.findInsertionPoint(indexEntry.token, new AtomicBoolean( false)); - assertEquals(indexEntry.token.toLowerCase(), searchResult.insertionPoint.token.toLowerCase()); - assertEquals(indexEntry.token.toLowerCase(), searchResult.longestPrefix.token.toLowerCase()); + assertEquals(indexEntry.token.toLowerCase(), searchResult.token.toLowerCase()); } // TODO: maybe if user types capitalization, use it. 
- assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aaac", new AtomicBoolean(false))); - assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("AAAC", new AtomicBoolean(false))); - assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("AAAc", new AtomicBoolean(false))); - assertSearchResult("aaac", "aaac", deIndex.findLongestSubstring("aAac", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aaac", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAC", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("AAAc", new AtomicBoolean(false))); + assertSearchResult("aaac", "aaac", deIndex.findInsertionPoint("aAac", new AtomicBoolean(false))); // Before the beginning. - assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("", new AtomicBoolean(false))); - assertSearchResult("40", "40" /* special case */, deIndex.findLongestSubstring("__", new AtomicBoolean(false))); + assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("", new AtomicBoolean(false))); + assertSearchResult("40", "40" /* special case */, deIndex.findInsertionPoint("__", new AtomicBoolean(false))); // After the end. 
- assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findLongestSubstring("ZZZZZ", new AtomicBoolean(false))); - - assertSearchResult("ab", "aaac", deIndex.findLongestSubstring("aaaca", new AtomicBoolean(false))); - assertSearchResult("machen", "machen", deIndex.findLongestSubstring("m", new AtomicBoolean(false))); + assertSearchResult("Zweckorientiertheit", "zählen", deIndex.findInsertionPoint("ZZZZZ", new AtomicBoolean(false))); - assertFalse(deIndex.findLongestSubstring("macdddd", new AtomicBoolean(false)).success); + assertSearchResult("ab", "aaac", deIndex.findInsertionPoint("aaaca", new AtomicBoolean(false))); + assertSearchResult("machen", "machen", deIndex.findInsertionPoint("m", new AtomicBoolean(false))); + assertSearchResult("machen", "machen", deIndex.findInsertionPoint("macdddd", new AtomicBoolean(false))); - assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberprüfe", new AtomicBoolean(false))); - assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpruefe", new AtomicBoolean(false))); + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberprüfe", new AtomicBoolean(false))); + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpruefe", new AtomicBoolean(false))); - assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("ueberpBLEH", new AtomicBoolean(false))); - assertSearchResult("überprüfe", "überprüfe", deIndex.findLongestSubstring("überprBLEH", new AtomicBoolean(false))); + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("ueberpBLEH", new AtomicBoolean(false))); + assertSearchResult("überprüfe", "überprüfe", deIndex.findInsertionPoint("überprBLEH", new AtomicBoolean(false))); - assertSearchResult("überprüfen", "überprüfe", deIndex.findLongestSubstring("überprüfeBLEH", new AtomicBoolean(false))); + assertSearchResult("überprüfen", "überprüfe", deIndex.findInsertionPoint("überprüfeBLEH", new 
AtomicBoolean(false))); // Check that search in lowercase works. - assertSearchResult("Alibi", "Alibi", deIndex.findLongestSubstring("alib", new AtomicBoolean(false))); - assertTrue(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).success); - System.out.println(deIndex.findLongestSubstring("alib", new AtomicBoolean(false)).toString()); + assertSearchResult("Alibi", "Alibi", deIndex.findInsertionPoint("alib", new AtomicBoolean(false))); + System.out.println(deIndex.findInsertionPoint("alib", new AtomicBoolean(false)).toString()); raf.close(); } private void assertSearchResult(final String insertionPoint, final String longestPrefix, - final SearchResult actual) { - assertEquals(insertionPoint, actual.insertionPoint.token); - assertEquals(longestPrefix, actual.longestPrefix.token); + final IndexEntry actual) { + assertEquals(insertionPoint, actual.token); } public void testGermanTokenRows() throws IOException { - final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.dict", "r"); + final RandomAccessFile raf = new RandomAccessFile("testdata/de-en.quickdic", "r"); final Dictionary dict = new Dictionary(raf); final Index deIndex = dict.indices.get(0); @@ -112,7 +109,8 @@ public class DictionaryTest extends TestCase { } public void testGermanSort() { - assertEquals("aüÄÄ", Language.de.textNorm("aueAeAE", false)); + final Transliterator normalizer = Transliterator.createFromRules("", Language.de.getDefaultNormalizerRules(), Transliterator.FORWARD); + assertEquals("aüääss", normalizer.transform("aueAeAEß")); final List words = Arrays.asList( "er-ben", "erben", @@ -129,32 +127,34 @@ public class DictionaryTest extends TestCase { "Großformats", "Großpoo", "Großpoos", + "Hörvermögen", "Hörweite", "hos", "Höschen", "Hostel", "hulle", "Hulle", - "hülle", "huelle", - "Hülle", "Huelle", + "hülle", + "Hülle", + "Huellen", + "Hüllen", "Hum" ); - assertEquals(0, Language.de.sortComparator.compare("hülle", "huelle")); - assertEquals(0, 
Language.de.sortComparator.compare("huelle", "hülle")); + final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.de.getCollator()); + assertEquals(1, comparator.compare("hülle", "huelle")); + assertEquals(-1, comparator.compare("huelle", "hülle")); - assertEquals(-1, Language.de.sortComparator.compare("hülle", "Hülle")); - assertEquals(0, Language.de.findComparator.compare("hülle", "Hülle")); - assertEquals(-1, Language.de.findComparator.compare("hulle", "Hülle")); + assertEquals(-1, comparator.compare("hülle", "Hülle")); + + assertEquals("hülle", normalizer.transform("Hülle")); + assertEquals("hulle", normalizer.transform("Hulle")); - for (final String s : words) { - System.out.println(s + "\t" + Language.de.textNorm(s, false)); - } final List sorted = new ArrayList(words); // Collections.shuffle(shuffled, new Random(0)); - Collections.sort(sorted, Language.de.sortComparator); + Collections.sort(sorted, comparator); System.out.println(sorted.toString()); for (int i = 0; i < words.size(); ++i) { System.out.println(words.get(i) + "\t" + sorted.get(i)); @@ -162,8 +162,8 @@ public class DictionaryTest extends TestCase { } } - @SuppressWarnings("unchecked") public void testEnglishSort() { + final Transliterator normalizer = Transliterator.createFromRules("", Language.en.getDefaultNormalizerRules(), Transliterator.FORWARD); final List words = Arrays.asList( "pre-print", @@ -172,16 +172,17 @@ public class DictionaryTest extends TestCase { "preprocess"); final List sorted = new ArrayList(words); - Collections.sort(sorted, Language.en.getSortCollator()); + final NormalizeComparator comparator = new NormalizeComparator(normalizer, Language.en.getCollator()); + Collections.sort(sorted, comparator); for (int i = 0; i < words.size(); ++i) { if (i > 0) { - assertTrue(Language.en.getSortCollator().compare(words.get(i-1), words.get(i)) < 0); + assertTrue(comparator.compare(words.get(i-1), words.get(i)) < 0); } System.out.println(words.get(i) + "\t" + 
sorted.get(i)); assertEquals(words.get(i), sorted.get(i)); } - assertTrue(Language.en.getSortCollator().compare("pre-print", "preppy") < 0); + assertTrue(comparator.compare("pre-print", "preppy") < 0); } @@ -192,17 +193,24 @@ public class DictionaryTest extends TestCase { } public void testTextNorm() { - assertEquals("hoschen", "Höschen".toLowerCase(Language.de.locale)); + //final Transliterator transliterator = Transliterator.getInstance("Any-Latin; Upper; Lower; 'oe' > 'o'; NFD; [:Nonspacing Mark:] Remove; NFC", Transliterator.FORWARD); + final Transliterator transliterator = Transliterator.createFromRules("", ":: Any-Latin; :: Upper; :: Lower; 'oe' > 'o'; :: NFD; :: [:Nonspacing Mark:] Remove; :: NFC ;", Transliterator.FORWARD); + assertEquals("hoschen", transliterator.transliterate("Höschen")); + assertEquals("hoschen", transliterator.transliterate("Hoeschen")); + assertEquals("grosspoo", transliterator.transliterate("Großpoo")); + + assertEquals("kyanpasu", transliterator.transliterate("キャンパス")); + assertEquals("alphabetikos katalogos", transliterator.transliterate("Αλφαβητικός Κατάλογος")); + assertEquals("biologiceskom", transliterator.transliterate("биологическом")); } public void testChemnitz() throws IOException { - final RandomAccessFile raf = new RandomAccessFile("testdata/de-en_chemnitz.dict", "r"); + final RandomAccessFile raf = new RandomAccessFile("dictOutputs/de-en_chemnitz.quickdic", "r"); final Dictionary dict = new Dictionary(raf); final Index deIndex = dict.indices.get(0); - //assertSearchResult("Höschen", "Hos", deIndex.findLongestSubstring("Hos", new AtomicBoolean(false))); - //assertSearchResult("Höschen", "hos", deIndex.findLongestSubstring("hos", new AtomicBoolean(false))); - + assertSearchResult("Höschen", "Hos", deIndex.findInsertionPoint("Hos", new AtomicBoolean(false))); + assertSearchResult("Höschen", "hos", deIndex.findInsertionPoint("hos", new AtomicBoolean(false))); raf.close(); } diff --git 
a/src/com/hughes/android/dictionary/engine/EntryData.java b/src/com/hughes/android/dictionary/engine/EntryData.java index 7f6b9b5..19521f2 100644 --- a/src/com/hughes/android/dictionary/engine/EntryData.java +++ b/src/com/hughes/android/dictionary/engine/EntryData.java @@ -5,8 +5,8 @@ package com.hughes.android.dictionary.engine; import com.hughes.util.IndexedObject; -class EntryData extends IndexedObject { - EntryData(final int index, final Entry entry) { +public class EntryData extends IndexedObject { + public EntryData(final int index, final Entry entry) { super(index); this.entry = entry; } diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 0e25e33..0d6a3d9 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -17,15 +17,14 @@ import com.hughes.android.dictionary.engine.Index.IndexEntry; public class IndexBuilder { final DictionaryBuilder dictionaryBuilder; - final Index index; + public final Index index; final SortedMap tokenToData; - @SuppressWarnings("unchecked") - IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final boolean swapPairEntries) { + IndexBuilder(final DictionaryBuilder dictionaryBuilder, final String shortName, final String longName, final Language language, final String normalizerRules, final boolean swapPairEntries) { this.dictionaryBuilder = dictionaryBuilder; - index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, swapPairEntries); - tokenToData = new TreeMap(language.getSortCollator()); + index = new Index(dictionaryBuilder.dictionary, shortName, longName, language, normalizerRules, swapPairEntries); + tokenToData = new TreeMap(new NormalizeComparator(index.normalizer, language.collator)); } public void build() { diff --git 
a/src/com/hughes/android/dictionary/engine/NormalizeComparator.java b/src/com/hughes/android/dictionary/engine/NormalizeComparator.java new file mode 100644 index 0000000..d25c5a4 --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/NormalizeComparator.java @@ -0,0 +1,29 @@ +package com.hughes.android.dictionary.engine; + +import java.util.Comparator; + +import com.ibm.icu.text.Transliterator; + +public class NormalizeComparator implements Comparator { + + final Transliterator normalizer; + final Comparator comparator; + + public NormalizeComparator(final Transliterator normalizer, + final Comparator comparator) { + this.normalizer = normalizer; + this.comparator = comparator; + } + + @Override + public int compare(final String s1, final String s2) { + final String n1 = normalizer.transform(s1); + final String n2 = normalizer.transform(s2); + final int cn = comparator.compare(n1, n2); + if (cn != 0) { + return cn; + } + return comparator.compare(s1, s2); + } + +} diff --git a/src/com/hughes/android/dictionary/engine/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java similarity index 94% rename from src/com/hughes/android/dictionary/engine/DictFileParser.java rename to src/com/hughes/android/dictionary/parser/DictFileParser.java index ebdbaef..1e01ae2 100644 --- a/src/com/hughes/android/dictionary/engine/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -1,4 +1,4 @@ -package com.hughes.android.dictionary.engine; +package com.hughes.android.dictionary.parser; import java.io.BufferedReader; import java.io.File; @@ -11,6 +11,12 @@ import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.EntryData; +import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.IndexBuilder; +import 
com.hughes.android.dictionary.engine.Language; +import com.hughes.android.dictionary.engine.PairEntry; import com.hughes.android.dictionary.engine.PairEntry.Pair; public class DictFileParser { @@ -18,11 +24,11 @@ public class DictFileParser { static final Logger logger = Logger.getLogger(DictFileParser.class.getName()); // Dictcc - static final Pattern TAB = Pattern.compile("\\t"); + public static final Pattern TAB = Pattern.compile("\\t"); // Chemnitz - static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); - static final Pattern PIPE = Pattern.compile("\\|"); + public static final Pattern DOUBLE_COLON = Pattern.compile(" :: "); + public static final Pattern PIPE = Pattern.compile("\\|"); static final Pattern SPACES = Pattern.compile("\\s+"); static final Pattern DE_NOUN = Pattern.compile("([^ ]+) *\\{(m|f|n|pl)\\}"); diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java new file mode 100644 index 0000000..677b5ee --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -0,0 +1,272 @@ +package com.hughes.android.dictionary.parser; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.IndexBuilder; + +public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback { + + final DictionaryBuilder dict; + + final IndexBuilder[] indexBuilders; + final Pattern[] langPatterns; + + StringBuilder titleBuilder; + StringBuilder textBuilder; + StringBuilder currentBuilder = null; + + public 
EnWiktionaryXmlParser(final DictionaryBuilder builder, final Pattern[] langPatterns, final int enIndexBuilder) { + assert langPatterns.length == 2; + this.dict = builder; + this.indexBuilders = dict.indexBuilders.toArray(new IndexBuilder[0]); + this.langPatterns = langPatterns; + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) { + currentBuilder = null; + if ("page".equals(qName)) { + titleBuilder = new StringBuilder(); + + // Start with "\n" to better match certain strings. + textBuilder = new StringBuilder("\n"); + } else if ("title".equals(qName)) { + currentBuilder = titleBuilder; + } else if ("text".equals(qName)) { + currentBuilder = textBuilder; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (currentBuilder != null) { + currentBuilder.append(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + currentBuilder = null; + if ("page".equals(qName)) { + endPage(); + } + } + + + public void parse(final File file) throws ParserConfigurationException, + SAXException, IOException { + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + parser.parse(file, this); + } + + private void endPage() { + title = titleBuilder.toString(); + currentDepth = 0; + words.clear(); + WikiParser.parse(textBuilder.toString(), this); + } + + /** + * Two things can happen: + * + * We can be in a ==German== section. There we will see English definitions. + * Each POS should get its own QuickDic entry. Pretty much everything goes + * in. + * + * Or we can be in an ==English== section with English definitions + * and maybe see translations for languages we care about. + * + * In either case, we need to differentiate the subsections (Noun, Verb, etc.) + * into separate QuickDic entries, but that's tricky--how do we know when we + * found a subsection? 
Just ignore anything containing pronunciation and + * etymology? + * + * How do we decide when to seal the deal on an entry? + * + * Would be nice if the parser told us about leaving sections.... + * + * + */ + + String title; + int currentDepth; + final List words = new ArrayList(); + WikiWord currentWord; + WikiWord.PartOfSpeech currentPartOfSpeech; + WikiWord.TranslationSection currentTranslationSection; + + StringBuilder wikiBuilder = null; + + // ------------------------------------------------------------------------ + + @Override + public void onWikiLink(String[] args) { + if (wikiBuilder != null) { + wikiBuilder.append(args[args.length - 1]); + } + } + + @Override + public void onTemplate(String[][] args) { + final String name = args[0][1]; + if (name == "") { + + } else { + //System.out.println("Unhandled template: " + name); + } + } + + @Override + public void onText(String text) { + if (wikiBuilder != null) { + wikiBuilder.append(text); + return; + } + } + + @Override + public void onHeadingStart(int depth) { + wikiBuilder = new StringBuilder(); + currentDepth = depth; + if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { + currentPartOfSpeech = null; + } + if (currentWord != null && depth <= currentWord.depth) { + currentWord = null; + } + } + + final Pattern partOfSpeechHeader = Pattern.compile( + "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + + "Ligature|Idiom|Phrase|" + + // These are @deprecated: + "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb"); + + @Override + public void onHeadingEnd(int depth) { + final String name = wikiBuilder.toString().trim(); + wikiBuilder = null; + + final boolean lang1 = langPatterns[0].matcher(name).matches(); + final boolean lang2 = 
langPatterns[1].matcher(name).matches(); + if (name.equalsIgnoreCase("English") || lang1 || lang2) { + currentWord = new WikiWord(depth); + currentWord.language = name; + currentWord.isLang1 = lang1; + currentWord.isLang2 = lang2; + words.add(currentWord); + return; + } + + if (currentWord == null) { + return; + } + + if (partOfSpeechHeader.matcher(name).matches()) { + currentPartOfSpeech = new WikiWord.PartOfSpeech(depth); + currentWord.partsOfSpeech.add(currentPartOfSpeech); + return; + } + + if (name.equals("Translations")) { + if (currentWord == null || + !currentWord.language.equals("English") || + currentPartOfSpeech == null) { + System.out.println("Unexpected Translations section: " + title); + return; + } + currentTranslationSection = new WikiWord.TranslationSection(); + currentPartOfSpeech.translationSections.add(currentTranslationSection); + } else { + currentTranslationSection = null; + } + } + + @Override + public void onListItemStart(String header, int[] section) { + wikiBuilder = new StringBuilder(); + } + + + @Override + public void onListItemEnd(String header, int[] section) { + final String item = wikiBuilder.toString(); + wikiBuilder = null; + + if (currentTranslationSection != null) { + final int colonPos = item.indexOf(':'); + if (colonPos == -1) { + System.out.println("Invalid translation: " + item); + return; + } + final String lang = item.substring(0, colonPos); + final String trans = item.substring(colonPos + 1); + for (int i = 0; i < 2; ++i) { + if (langPatterns[i].matcher(lang).find()) { + currentTranslationSection.translations.get(i).add(trans); + } + } + } + } + + @Override + public void onNewLine() { + } + + @Override + public void onNewParagraph() { + } + + // ---------------------------------------------------------------------- + + public void onTransTrop(final String[][] args) { + currentTranslationSection = new WikiWord.TranslationSection(); + currentPartOfSpeech.translationSections.add(currentTranslationSection); + + if 
(args.length > 1) { + currentTranslationSection.sense = args[1][1]; + } + } + + + // ---------------------------------------------------------------------- + + @Override + public void onComment(String text) { + } + + @Override + public void onFormatBold(boolean boldOn) { + } + + @Override + public void onFormatItalic(boolean italicOn) { + } + + @Override + public void onUnterminated(String start, String rest) { + throw new RuntimeException(rest); + } + @Override + public void onInvalidHeaderEnd(String rest) { + throw new RuntimeException(rest); + } + +} diff --git a/src/com/hughes/android/dictionary/parser/WikiCallback.java b/src/com/hughes/android/dictionary/parser/WikiCallback.java new file mode 100644 index 0000000..44865cc --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/WikiCallback.java @@ -0,0 +1,33 @@ +package com.hughes.android.dictionary.parser; + + +public interface WikiCallback { + + void onComment(final String text); + + void onFormatBold(final boolean boldOn); + void onFormatItalic(final boolean italicOn); + + void onWikiLink(final String[] args); + + void onTemplate(final String[][] args); + + // Will never contain a newline unless it's in a
+  void onText(final String text);
+
+  // Only at start of line.
+  // Heading events are declared as a start/end pair taking the same kind of
+  // depth value.
+  // NOTE(review): depth is presumably the number of '=' characters in the
+  // heading marker -- confirm against WikiParser.
+  void onHeadingStart(final int depth);
+  void onHeadingEnd(final int depth);
+  
+  
+  // Line-structure events; neither carries a payload.
+  // NOTE(review): presumably a single line break vs. a paragraph break --
+  // confirm against WikiParser.
+  void onNewLine();
+  void onNewParagraph();
+
+  // List-item events are declared as a start/end pair sharing one signature.
+  // NOTE(review): header looks like the list-marker prefix (e.g. "#", "##");
+  // the meaning of section is not visible here -- confirm against WikiParser.
+  void onListItemStart(final String header, final int[] section);
+  void onListItemEnd(final String header, final int[] section);
+
+  // Errors
+  // NOTE(review): presumably reported when the construct opened by start is
+  // never closed, with rest being the remaining input -- confirm.
+  void onUnterminated(final String start, String rest);
+  // NOTE(review): presumably reported for a malformed heading terminator --
+  // confirm against WikiParser.
+  void onInvalidHeaderEnd(String rest);
+  
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiParser.java b/src/com/hughes/android/dictionary/parser/WikiParser.java
new file mode 100644
index 0000000..84dc770
--- /dev/null
+++ b/src/com/hughes/android/dictionary/parser/WikiParser.java
@@ -0,0 +1,128 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class WikiParser {
+  
+  private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|^[*#;:]+|^(==+)\\s*|(==+)\\s*$|");
+          if (end == -1) {
+            callback.onUnterminated(" '''pretty''' cool '''''over''''' there." + "\n" +
+      "hi " + "\n" +
+      "" + "\n" +
+      "# li" + "\n" +
+      "# li2" + "\n" +
+      "## li2.2" + "\n" +
+      "Hi again." + "\n" +
+      "here's [[some blah|some]] wikitext." + "\n" +
+      "here's a {{template|blah=2|blah2=3|" + "\n" +
+      "blah3=3}} and some more text." + "\n" +
+      "== Header 2 ==" + "\n" +
+      "=== {{header-template}} ===" + "\n";
+    
+    final String expected = "Hi Hello thad you're \n" +
+        "comment: not \n" +
+        " pretty cool over there. hi \n" +
+        "comment:\n" +
+        "multi-line\n" +
+        "# comment \n" +
+        "\n" +
+        "\n" +
+        "# li\n" +
+        " # li2\n" +
+        " ## li2.2\n" +
+        " Hi again. here's [[some]] wikitext. here's a \n" +
+        "template:template\n" +
+        " and some more text. \n" +
+        "HEADER   Header 2 \n" +
+        " \n" +
+        "HEADER    \n" +
+        "template:header-template\n" +
+        " \n" +
+        " ";
+    final PrintWikiCallback callback = new PrintWikiCallback();
+    WikiParser.parse(text, callback);
+    assertEquals(expected, callback.builder.toString());
+    
+  }
+  
+  
+  static final class PrintWikiCallback implements WikiCallback {
+    final StringBuilder builder = new StringBuilder();
+
+    @Override
+    public void onComment(String text) {
+      builder.append("\ncomment:").append(text).append("\n");
+    }
+
+    @Override
+    public void onFormatBold(boolean boldOn) {
+      builder.append(boldOn ? "" : "");
+    }
+
+    @Override
+    public void onFormatItalic(boolean italicOn) {
+      builder.append(italicOn ? "" : "");
+    }
+
+    @Override
+    public void onWikiLink(String[] args) {
+      builder.append("[[").append(args[args.length - 1]).append("]]");
+    }
+
+    @Override
+    public void onTemplate(String[][] args) {
+      builder.append("\ntemplate:").append(args[0][0]).append("\n");
+    }
+
+    @Override
+    public void onText(String text) {
+      builder.append(text);
+    }
+
+    @Override
+    public void onHeadingStart(int depth) {
+      builder.append("\nHEADER");
+      for (int i = 0; i < depth; ++i) {
+        builder.append(" ");
+      }
+    }
+
+    @Override
+    public void onHeadingEnd(int depth) {
+      builder.append("\n");
+    }
+    
+    @Override
+    public void onNewLine() {
+      builder.append(" ");
+    }
+
+    @Override
+    public void onNewParagraph() {
+      builder.append("\n\n");
+    }
+
+    @Override
+    public void onListItemStart(String header, int[] section) {
+      builder.append(header);
+    }
+
+    @Override
+    public void onListItemEnd(String header, int[] section) {
+      builder.append("\n");
+    }
+
+    @Override
+    public void onUnterminated(String start, String rest) {
+      throw new RuntimeException("bad");
+    }
+
+    @Override
+    public void onInvalidHeaderEnd(String rest) {
+      throw new RuntimeException("bad");
+    }
+    
+  }
+  
+
+
+}
diff --git a/src/com/hughes/android/dictionary/parser/WikiWord.java b/src/com/hughes/android/dictionary/parser/WikiWord.java
new file mode 100644
index 0000000..49806d2
--- /dev/null
+++ b/src/com/hughes/android/dictionary/parser/WikiWord.java
@@ -0,0 +1,58 @@
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Mutable holder for what the parser collects under one language heading of
+ * a wiki page: the language name, its parts of speech, and any further
+ * sections.
+ *
+ * <p>Generic type arguments are spelled out explicitly (no diamond operator),
+ * matching the style used by the rest of this code base.
+ */
+public class WikiWord {
+  /** Depth value supplied by the code that creates this entry. */
+  final int depth;
+
+  /** Name of the language heading this entry was created for. */
+  String language;
+  String pronunciation;
+
+  /** Whether the language heading matched the first / second language pattern. */
+  boolean isLang1;
+  boolean isLang2;
+
+  /** Parts of speech found under this language heading, in encounter order. */
+  final List<PartOfSpeech> partsOfSpeech = new ArrayList<PartOfSpeech>();
+
+  // NOTE(review): key and value types were lost from this declaration; the
+  // surviving text only shows that the value type is itself parameterized.
+  // Map<String, List<String>> is assumed - confirm against the writers of this map.
+  final Map<String, List<String>> otherSections = new LinkedHashMap<String, List<String>>();
+
+  public WikiWord(int depth) {
+    this.depth = depth;
+  }
+
+  /** One part-of-speech section together with its translation sections. */
+  static class PartOfSpeech {
+    /** Depth value supplied by the code that creates this section. */
+    final int depth;
+
+    // NOTE(review): element type assumed to be Meaning - confirm.
+    final List<Meaning> meaning = new ArrayList<Meaning>();
+
+    /** Translation sections opened under this part of speech. */
+    final List<TranslationSection> translationSections = new ArrayList<TranslationSection>();
+
+    // NOTE(review): type arguments assumed to be <String, String> - confirm
+    // against the code that fills this map.
+    final Map<String, String> otherSections = new LinkedHashMap<String, String>();
+
+    public PartOfSpeech(final int depth) {
+      this.depth = depth;
+    }
+  }
+
+  /** Translations collected for one sense, kept separately per language slot. */
+  static class TranslationSection {
+    String sense;
+
+    /** Always holds exactly two lists: index 0 and index 1 are the two language slots. */
+    List<List<String>> translations = new ArrayList<List<String>>();
+    {
+      // Pre-populate both slots so get(0) and get(1) are valid right after construction.
+      translations.add(new ArrayList<String>());
+      translations.add(new ArrayList<String>());
+    }
+  }
+
+  /** A meaning paired with an optional example. */
+  static class Meaning {
+    String meaning;
+    Example example;
+  }
+
+  /** An example sentence together with its English rendering. */
+  static class Example {
+    String example;
+    String exampleInEnglish;
+  }
+
+}