From: Thad Hughes Date: Fri, 23 Jul 2010 21:23:14 +0000 (-0700) Subject: go X-Git-Url: http://gitweb.fperrin.net/?a=commitdiff_plain;h=d6713198e0fa79ef24d8ec29691108cb58f88d2b;p=DictionaryPC.git go --- diff --git a/src/com/hughes/android/dictionary/DictionaryBuilder.java b/src/com/hughes/android/dictionary/DictionaryBuilder.java index 0f6317f..1e76822 100755 --- a/src/com/hughes/android/dictionary/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/DictionaryBuilder.java @@ -97,8 +97,8 @@ public class DictionaryBuilder { System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs); } - createIndex(dict, Entry.LANG1); - createIndex(dict, Entry.LANG2); + createIndex(dict, SimpleEntry.LANG1); + createIndex(dict, SimpleEntry.LANG2); System.out.println("Writing dictionary."); final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw"); @@ -110,7 +110,7 @@ public class DictionaryBuilder { for (byte lang = 0; lang < 2; ++lang) { final LanguageData languageData = dict.languageDatas[lang]; System.out.println("\nRandom words for: " + languageData.language.getSymbol()); - for (int i = 0; i < 10; ++i) { + for (int i = 0; i < 20; ++i) { final int w = random.nextInt(languageData.sortedIndex.size()); final IndexEntry entry = languageData.sortedIndex.get(w); final List rows = languageData.rows; @@ -145,7 +145,7 @@ public class DictionaryBuilder { continue; } - final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries); + final SimpleEntry entry = SimpleEntry.parseFromLine(line, hasMultipleSubentries); if (entry == null) { System.err.println("Invalid entry: " + line); continue; @@ -167,7 +167,7 @@ public class DictionaryBuilder { final Map tokenToData = new TreeMap(dict.languageDatas[lang].language.sortComparator); for (int e = 0; e < dict.entries.size(); ++e) { - final Entry entry = dict.entries.get(e); + final SimpleEntry entry = dict.entries.get(e); final Set tokens = entry.getIndexableTokens(lang); for (final String token : tokens) { TokenData tokenData = tokenToData.get(token); @@ -217,7 +217,7 @@ public class DictionaryBuilder { static final class TokenEntryData implements Comparable { final String token; - final Entry entry; + final SimpleEntry entry; final int entryIndex; private static final int bigNoOverflow = 100000; @@ -226,7 +226,7 @@ public class DictionaryBuilder { int minSubEntryLength = bigNoOverflow; int minSubEntry = bigNoOverflow; - public TokenEntryData(final byte lang, final String token, final Entry entry, final int entryIndex) { + public TokenEntryData(final byte lang, final String token, final SimpleEntry entry, final int entryIndex) { this.token = token; this.entry = entry; this.entryIndex = entryIndex; diff --git a/src/com/hughes/android/dictionary/DictionaryTest.java b/src/com/hughes/android/dictionary/DictionaryTest.java index 6e18a93..af770e1 100755 --- a/src/com/hughes/android/dictionary/DictionaryTest.java +++ b/src/com/hughes/android/dictionary/DictionaryTest.java @@ -24,19 +24,19 @@ public class DictionaryTest extends TestCase { file.deleteOnExit(); // final Dictionary goldenDict; - final List entries = Arrays.asList( - Entry.parseFromLine("der Hund :: the dog", false), - Entry.parseFromLine("Die grosse Katze :: The big cat", false), - Entry.parseFromLine("die Katze :: the cat", false), - Entry.parseFromLine("gross :: big", false), - Entry.parseFromLine("Dieb :: thief", false), - Entry.parseFromLine("rennen :: run", false)); + final List entries = Arrays.asList( + SimpleEntry.parseFromLine("der Hund :: the dog", false), + SimpleEntry.parseFromLine("Die grosse Katze :: The big cat", false), + SimpleEntry.parseFromLine("die Katze :: the cat", false), + SimpleEntry.parseFromLine("gross :: big", false), + SimpleEntry.parseFromLine("Dieb :: thief", false), + SimpleEntry.parseFromLine("rennen :: run", false)); { final Dictionary dict = new Dictionary("test", Language.de, Language.en); dict.entries.addAll(entries); - DictionaryBuilder.createIndex(dict, Entry.LANG1); - DictionaryBuilder.createIndex(dict, Entry.LANG2); + DictionaryBuilder.createIndex(dict, SimpleEntry.LANG1); + DictionaryBuilder.createIndex(dict, SimpleEntry.LANG2); final RandomAccessFile raf = new RandomAccessFile(file, "rw"); dict.write(raf); raf.close(); @@ -98,22 +98,22 @@ public class DictionaryTest extends TestCase { public void testTextNorm() throws IOException { System.out.println("\n\ntestTextNorm"); - final List entries = Arrays.asList( - Entry.parseFromLine("Hund {m} :: dog", true), - Entry.parseFromLine("'CHRISTOS' :: doh", true), - Entry.parseFromLine("\"Pick-up\"-Presse {f} :: baler", true), - Entry.parseFromLine("(Ach was), echt? [auch ironisch] :: No shit! [also ironic]", true), - Entry.parseFromLine("(akuter) Myokardinfarkt {m} :: (acute) myocardial infarction ", true), - Entry.parseFromLine("(reine) Vermutung {f} :: guesswork", true), - Entry.parseFromLine("(mit) 6:1 vorne liegen :: to be 6-1 up [football]", true), - Entry.parseFromLine("(auf) den Knopf drücken [auch fig.: auslösen] :: to push the button [also fig.: initiate]", false), - Entry.parseFromLine("Adjektiv {n} /Adj./; Eigenschaftswort {n} [gramm.] | Adjektive {pl}; Eigenschaftswoerter {pl} :: adjective /adj./ | adjectives", true), - Entry.parseFromLine("Älteste {m,f}; Ältester :: oldest; eldest", true), - Entry.parseFromLine("\"...\", schloss er an. :: '...,' he added.", true), - Entry.parseFromLine("besonderer | besondere | besonderes :: extra", false), - Entry.parseFromLine("| zu Pferde; zu Pferd | reiten :: horseback | on horseback | go on horseback", true), - Entry.parseFromLine("Hauptaugenmerk {m} | sein Hauptaugenmerk richten auf :: | to focus (one's) attention on", true), - Entry.parseFromLine("σ-Algebra {f} :: σ-field", true) + final List entries = Arrays.asList( + SimpleEntry.parseFromLine("Hund {m} :: dog", true), + SimpleEntry.parseFromLine("'CHRISTOS' :: doh", true), + SimpleEntry.parseFromLine("\"Pick-up\"-Presse {f} :: baler", true), + SimpleEntry.parseFromLine("(Ach was), echt? [auch ironisch] :: No shit! [also ironic]", true), + SimpleEntry.parseFromLine("(akuter) Myokardinfarkt {m} :: (acute) myocardial infarction ", true), + SimpleEntry.parseFromLine("(reine) Vermutung {f} :: guesswork", true), + SimpleEntry.parseFromLine("(mit) 6:1 vorne liegen :: to be 6-1 up [football]", true), + SimpleEntry.parseFromLine("(auf) den Knopf drücken [auch fig.: auslösen] :: to push the button [also fig.: initiate]", false), + SimpleEntry.parseFromLine("Adjektiv {n} /Adj./; Eigenschaftswort {n} [gramm.] | Adjektive {pl}; Eigenschaftswoerter {pl} :: adjective /adj./ | adjectives", true), + SimpleEntry.parseFromLine("Älteste {m,f}; Ältester :: oldest; eldest", true), + SimpleEntry.parseFromLine("\"...\", schloss er an. :: '...,' he added.", true), + SimpleEntry.parseFromLine("besonderer | besondere | besonderes :: extra", false), + SimpleEntry.parseFromLine("| zu Pferde; zu Pferd | reiten :: horseback | on horseback | go on horseback", true), + SimpleEntry.parseFromLine("Hauptaugenmerk {m} | sein Hauptaugenmerk richten auf :: | to focus (one's) attention on", true), + SimpleEntry.parseFromLine("σ-Algebra {f} :: σ-field", true) ); assertFalse(entries.contains(null)); @@ -122,8 +122,8 @@ public class DictionaryTest extends TestCase { final Dictionary dict = new Dictionary("test", Language.de, Language.en); dict.entries.addAll(entries); - DictionaryBuilder.createIndex(dict, Entry.LANG1); - DictionaryBuilder.createIndex(dict, Entry.LANG2); + DictionaryBuilder.createIndex(dict, SimpleEntry.LANG1); + DictionaryBuilder.createIndex(dict, SimpleEntry.LANG2); for (int lang = 0; lang <= 1; lang++) { final LanguageData languageData = dict.languageDatas[lang]; diff --git a/src/com/hughes/android/dictionary/WiktionaryXmlParser.java b/src/com/hughes/android/dictionary/WiktionaryXmlParser.java index 9df25c8..3ed4617 100644 --- a/src/com/hughes/android/dictionary/WiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/WiktionaryXmlParser.java @@ -9,7 +9,6 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; @@ -80,22 +79,40 @@ public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler { .compile("\\{\\{([^}]+)\\}\\}"); private static final Pattern WIKI_DOUBLE_BRACKET = Pattern .compile("\\[\\[([^\\]]+)\\]\\]"); - private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^="); + private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE); enum Field { - Wortart("Wortart", null), Aussprache("Aussprache", null), Bedeutungen( - "Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")), Synonome( - "Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")), Gegenworte( - "Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")), Oberbegriffe( - "Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")), Unterbegriffe( - "Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")), Beispiele( - "Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")), Redewendungen( - "Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")), CharakteristischeWortkombinationen( - "Charakteristische Wortkombinationen", Pattern - .compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")), AbgeleiteteBegriffe( - "Abgeleitete Begriffe", Pattern - .compile("\\{\\{Abgeleitete Begriffe\\}\\}")), Herkunft("Herkunft", - Pattern.compile("\\{\\{Herkunft\\}\\}")); + Wortart("Wortart", null), + + Aussprache("Aussprache", null), + + Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")), + + Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")), + + Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")), + + Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")), + + Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")), + + Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")), + + Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")), + + Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")), + + CharakteristischeWortkombinationen("Charakteristische Wortkombinationen", + Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")), + + AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern + .compile("\\{\\{Abgeleitete Begriffe\\}\\}")), + + Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")), + + Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")), + + ; final String name; final Pattern listPattern; @@ -151,7 +168,7 @@ public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler { if (aussprache != null) { aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst(""); aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1"); - aussprache = aussprache.replaceAll("Lautschrift\\|", ""); + aussprache = aussprache.replaceAll("Lautschrift\\|ˈ?", ""); aussprache = aussprache.trim(); fieldToValue.put(Field.Aussprache, Collections .singletonList(aussprache)); @@ -165,51 +182,35 @@ public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler { System.out.println(titleBuilder); for (final Field field : Field.values()) { - if (fieldToValue.get(field).isEmpty()) { + if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) { fieldToValue.remove(field); } else { - System.out.println(field.name); - for (final String line : fieldToValue.get(field)) { - System.out.println(" " + line); + if (field.name != null) { +// System.out.println(field.name); +// for (final String line : fieldToValue.get(field)) { +// System.out.println(" " + line); +// } } } } - System.out.println("WHAT'S LEFT:"); - System.out.println(section); - System.out.println("------------------------------------------------"); +// System.out.println("WHAT'S LEFT:"); +// System.out.println(section); +// System.out.println("------------------------------------------------"); } - // System.out.println(titleBuilder); - /* - * final List pronunciations = new ArrayList(); final - * CharSequence pronunciationSeq = getSection(text, PRONUNCIATION, - * SECTION_START); if (pronunciationSeq != null) { final Matcher - * pronunciationMatcher = PRONUNCIATION_EXAMPLE.matcher(pronunciationSeq); - * while (pronunciationMatcher.find()) { - * pronunciations.add(pronunciationMatcher.group(1)); } - * System.out.println("PRONUNCIATIONS:" + pronunciations); } - * - * String[] meanings = null; final CharSequence meaningsSeq = - * getSection(text, MEANINGS, SECTION_START); if (meaningsSeq != null) { - * meanings = LIST.split(meaningsSeq); meanings[0] = ""; - * System.out.println("MEANINGS:" + Arrays.toString(meanings)); } - * - * System.out.println(text); - */ - } private List extractList(final StringBuilder section, final Pattern start) { final List result = new ArrayList(); final String linesString = StringUtil.remove(section, start, - WIKI_DOUBLE_BRACE, false); + WIKI_NEW_SECTION, false); if (linesString != null) { String[] lines = linesString.split("\n"); for (int i = 1; i < lines.length; ++i) { String bedeutung = lines[i]; - bedeutung = bedeutung.replaceFirst("^:", ""); + bedeutung = bedeutung.replaceFirst("^:+", ""); bedeutung = bedeutung.trim(); if (bedeutung.length() > 0) { result.add(bedeutung); @@ -219,19 +220,6 @@ public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler { return result; } - private static CharSequence getSection(CharSequence input, Pattern start, - Pattern end) { - Matcher startMatcher = start.matcher(input); - if (!startMatcher.find()) { - return null; - } - Matcher endMatcher = end.matcher(input); - if (!endMatcher.find(startMatcher.end())) { - return input.subSequence(startMatcher.start(), input.length()); - } - return input.subSequence(startMatcher.start(), endMatcher.start()); - } - void parse(final File file) throws ParserConfigurationException, SAXException, IOException { final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();