From b0334b661de0671389468c8f13f90b4108cd6f00 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Tue, 20 Dec 2011 09:39:57 -0800 Subject: [PATCH] Initialism, changes in regex matching. --- bugs | 1 + .../dictionary/engine/DictionaryBuilderMain.java | 14 ++++++++------ .../dictionary/engine/DictionaryBuilderTest.java | 1 - .../dictionary/parser/EnWiktionaryXmlParser.java | 11 +++++++---- .../android/dictionary/parser/WikiTokenizer.java | 1 + 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/bugs b/bugs index 6e0af80..42931c4 100644 --- a/bugs +++ b/bugs @@ -8,6 +8,7 @@ PC: sub-levels in translations. handle word-info in English. italian verbs... (show conjugation, pulled from a linked place....) +add unit test for: Errors: [Unmatched {{ error: * {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}} diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 20bf4ba..86c91e8 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -18,6 +18,7 @@ import java.io.File; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; @@ -44,7 +45,6 @@ public class DictionaryBuilderMain extends TestCase { // German handled in file. isoToDedication.put("EL", "Greek dictionary dedicated to Noah Egge."); isoToDedication.put("IT", "Italian dictionary dedicated to Carolina Tropini, my favorite stardust in the whole universe! Ti amo!"); - isoToDedication.put("JA", "Japanese dictionary dedicated to Akane Watanabe."); isoToDedication.put("KO", "Korean dictionary dedicated to Ande Elwood--fall fashion und Fernsehturms!"); isoToDedication.put("PT", "Portuguese dictionary dedicated to Carlos Melo, one Tough Mudder."); isoToDedication.put("RO", "Romanian dictionary dedicated to Radu Teodorescu."); @@ -61,12 +61,14 @@ public class DictionaryBuilderMain extends TestCase { isoToStoplist.put("FR", "fr.txt"); final Map isoToRegex = new LinkedHashMap(); - isoToRegex.put("ZH", ".*Chinese.*|.*Mandarin.*|.*Cantonese.*"); + isoToRegex.put("ZH", "Chinese|Mandarin|Cantonese"); - boolean go = false; - isoToWikiName.clear(); + isoToWikiName.keySet().retainAll(Arrays.asList("UK", "HR", "FI")); + + boolean go = true; +// isoToWikiName.clear(); for (final String foreignIso : isoToWikiName.keySet()) { - if (foreignIso.equals("GA")) { + if (foreignIso.equals("SV")) { go = true; } if (!go) { @@ -83,7 +85,7 @@ public class DictionaryBuilderMain extends TestCase { isoToDedication.put(foreignIso, ""); } if (!isoToRegex.containsKey(foreignIso)) { - isoToRegex.put(foreignIso, ".*" + isoToWikiName.get(foreignIso) + ".*"); + isoToRegex.put(foreignIso, isoToWikiName.get(foreignIso)); } DictionaryBuilder.main(new String[] { diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 99e4e84..8f79c36 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -43,7 +43,6 @@ public class DictionaryBuilderTest extends TestCase { wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt", "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantonese", "zh"); } - // German public void testWiktionary_DE_DE() throws Exception { diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index a7bf170..9c867a0 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -46,7 +46,7 @@ public class EnWiktionaryXmlParser { "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + - "Ligature|Idiom|Phrase|" + + "Ligature|Idiom|Phrase|{{initialism}}|" + // These are @deprecated: "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + @@ -201,6 +201,7 @@ public class EnWiktionaryXmlParser { // TODO: would also be nice... } else if (functionName.startsWith("picdic")) { } else if (functionName.startsWith("checktrans")) { + done = true; } else if (functionName.startsWith("ttbc")) { wikiTokenizer.nextLine(); // TODO: would be great to handle ttbc @@ -213,7 +214,7 @@ public class EnWiktionaryXmlParser { // This line could produce an output... if (line.contains("ich hoan dich gear")) { - System.out.println(); + //System.out.println(); } // First strip the language and check whether it matches. @@ -704,14 +705,16 @@ public class EnWiktionaryXmlParser { pairEntry.pairs.add(pair); } } else if (nextPrefix.equals("#::") || nextPrefix.equals("#**")) { - if (lastForeign != null) { + if (lastForeign != null && pairEntry.pairs.size() > 0) { pairEntry.pairs.remove(pairEntry.pairs.size() - 1); final Pair pair = new Pair(formatAndIndexExampleString(nextLine, enIndexBuilder, indexedEntry), formatAndIndexExampleString(lastForeign, otherIndexBuilder, indexedEntry), swap); if (pair.lang1 != "--" && pair.lang1 != "--") { pairEntry.pairs.add(pair); } + lastForeign = null; } else { - LOG.warning("English example with no foreign: " + title + ", " + nextLine); + LOG.warning("TODO: English example with no foreign: " + title + ", " + nextLine); + // TODO: add something. } } else if (nextPrefix.equals("#*")) { // Can't really index these. diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 403b27c..e12185b 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -173,6 +173,7 @@ public final class WikiTokenizer { if (lastUnescapedPipePos != -1) { return wikiText.substring(lastUnescapedPipePos + 1, end - 2); } + assert start + 2 < wikiText.length() && end >= 2: wikiText; return wikiText.substring(start + 2, end - 2); } -- 2.43.0