From: Thad Hughes Date: Thu, 8 Mar 2012 17:15:50 +0000 (-0800) Subject: Bug-fixes to WikiTokenizer (handle weird line-feed), update to newest X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=ebed9a0fa85aae350c4d2af0b48dda5fa7b23db9 Bug-fixes to WikiTokenizer (handle weird line-feed), update to newest enwiktionary. --- diff --git a/data/downloadInputs.sh b/data/downloadInputs.sh index f0863bf..c78c127 100755 --- a/data/downloadInputs.sh +++ b/data/downloadInputs.sh @@ -6,18 +6,20 @@ cd $DIR echo "Downloading from: http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/" CHEMNITZ=de-en.txt -#curl --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz -#gunzip ${CHEMNITZ}.gz -#mv ${CHEMNITZ} inputs/ +curl --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz +gunzip ${CHEMNITZ}.gz +mv ${CHEMNITZ} inputs/ echo "Note that unzipping is slow." L=en echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" -WIKI=${L}wiktionary-20120109-pages-articles.xml -#curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120109/${WIKI}.bz2 -#bunzip2 ${WIKI}.bz2 -#mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +WIKI=${L}wiktionary-20120220-pages-articles.xml +curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120220/${WIKI}.bz2 +bunzip2 ${WIKI}.bz2 +mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml + +exit L=fr echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 0731ffb..d99b59c 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -65,7 +65,7 @@ public class DictionaryBuilderMain extends TestCase { //isoToWikiName.clear(); boolean go = false; for (final String foreignIso : isoToWikiName.keySet()) { - if (foreignIso.equals("BO")) { + if (foreignIso.equals("GD")) { go = true; } if (!go) { @@ -93,9 +93,10 @@ public class DictionaryBuilderMain extends TestCase { String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)), String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.\n\n%s", foreignIso, isoToDedication.get(foreignIso)), - "--input2=" + INPUTS + "eikiSplit/en/" + foreignIso + ".data", + "--input2=" + INPUTS + "wikiSplit/en/" + foreignIso + ".data", "--input2Name=enwiktionary." + foreignIso, "--input2Format=enwiktionary", + "--input2WiktionaryType=EnForeign", "--input2LangPattern=" + isoToRegex.get(foreignIso), "--input2LangCodePattern=" + foreignIso.toLowerCase(), "--input2EnIndex=1", @@ -103,6 +104,7 @@ public class DictionaryBuilderMain extends TestCase { "--input3=" + INPUTS + "wikiSplit/en/EN.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", + "--input3WiktionaryType=EnToTranslation", "--input3LangPattern=" + isoToRegex.get(foreignIso), "--input3LangCodePattern=" + foreignIso.toLowerCase(), "--input3EnIndex=1", @@ -128,6 +130,7 @@ public class DictionaryBuilderMain extends TestCase { "--input2=" + INPUTS + "wikiSplit/en/DE.data", "--input2Name=enwiktionary.DE", "--input2Format=enwiktionary", + "--input2WiktionaryType=EnForeign", "--input2LangPattern=German", "--input2LangCodePattern=de", "--input2EnIndex=2", @@ -135,6 +138,7 @@ public class DictionaryBuilderMain extends TestCase { "--input3=" + INPUTS + "wikiSplit/en/EN.data", "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", + "--input3WiktionaryType=EnToTranslation", "--input3LangPattern=German", "--input3LangCodePattern=de", "--input3EnIndex=2", diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 991ed8a..edc0ce0 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -55,6 +55,7 @@ public class DictFileParser implements Parser { static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)"); static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); + // http://www.regular-expressions.info/unicode.html static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+"); public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+"); diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 5ac7d45..493abf2 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -82,7 +82,7 @@ public final class WikiTokenizer { } public WikiTokenizer(final String wikiText, final boolean isNewline) { - this.wikiText = wikiText; + this.wikiText = wikiText.replaceAll("\u2028", "\n"); this.matcher = wikiTokenEvent.matcher(wikiText); justReturnedNewline = isNewline; } @@ -150,6 +150,10 @@ public final class WikiTokenizer { } } + public List errors() { + return errors; + } + public boolean isNewline() { return justReturnedNewline; } @@ -419,7 +423,7 @@ public final class WikiTokenizer { assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group(); if (matchText.length() == 0) { - assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n'; + assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart; if (firstNewline == -1) { firstNewline = matcher.end(); } diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java index 89e4c99..8e1d5a8 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java @@ -60,7 +60,14 @@ public class WikiTokenizerTest extends TestCase { assertEquals("* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}", new WikiTokenizer(wikiText).nextToken().token()); assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem()); assertEquals("\n", new WikiTokenizer(wikiText).nextToken().nextToken().token()); + + wikiText = "* [[asdf|\u2028" + + "asdf]]"; + assertEquals("* [[asdf|\n" + + "asdf]]", new WikiTokenizer(wikiText).nextToken().token()); + assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem()); + } public void testFunction() { diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java index 4f5d362..a189e58 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java @@ -268,6 +268,4 @@ public abstract class AbstractWiktionaryParser implements Parser { } } - - } diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java index ec0350e..bee194a 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/EnForeignParser.java @@ -163,7 +163,7 @@ public final class EnForeignParser extends EnParser { // Do nothing. } else { LOG.warning("Unexpected token: " + wikiTokenizer.token()); - assert false; + assert !wikiTokenizer.errors().isEmpty(); } } diff --git a/todo.txt b/todo.txt index 80302ef..5400ff9 100644 --- a/todo.txt +++ b/todo.txt @@ -1,3 +1,12 @@ +rebuild dictionaries with bug fix +make zip files +rebuild index (for comparison), check it in +download latest wiktionaries +rebuild dictionaries. +rebuild Check +publish. + + for i in res/raw*/*.html; do echo $i; tidy --input-encoding utf8 --output-file $i $i; done SpannableText persisted class with a list of spans with span types. (might need its own builder.)