enwiktionary.
echo "Downloading from: http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/"
CHEMNITZ=de-en.txt
-#curl --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz
-#gunzip ${CHEMNITZ}.gz
-#mv ${CHEMNITZ} inputs/
+curl --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz
+gunzip ${CHEMNITZ}.gz
+mv ${CHEMNITZ} inputs/
echo "Note that unzipping is slow."
L=en
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
-WIKI=${L}wiktionary-20120109-pages-articles.xml
-#curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120109/${WIKI}.bz2
-#bunzip2 ${WIKI}.bz2
-#mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+WIKI=${L}wiktionary-20120220-pages-articles.xml
+curl --remote-name http://dumps.wikimedia.org/${L}wiktionary/20120220/${WIKI}.bz2
+bunzip2 ${WIKI}.bz2
+mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+
+exit
L=fr
echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
//isoToWikiName.clear();
boolean go = false;
for (final String foreignIso : isoToWikiName.keySet()) {
- if (foreignIso.equals("BO")) {
+ if (foreignIso.equals("GD")) {
go = true;
}
if (!go) {
String.format("--lang2Stoplist=%s", STOPLISTS + isoToStoplist.get(foreignIso)),
String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.\n\n%s", foreignIso, isoToDedication.get(foreignIso)),
- "--input2=" + INPUTS + "eikiSplit/en/" + foreignIso + ".data",
+ "--input2=" + INPUTS + "wikiSplit/en/" + foreignIso + ".data",
"--input2Name=enwiktionary." + foreignIso,
"--input2Format=enwiktionary",
+ "--input2WiktionaryType=EnForeign",
"--input2LangPattern=" + isoToRegex.get(foreignIso),
"--input2LangCodePattern=" + foreignIso.toLowerCase(),
"--input2EnIndex=1",
"--input3=" + INPUTS + "wikiSplit/en/EN.data",
"--input3Name=enwiktionary.english",
"--input3Format=enwiktionary",
+ "--input3WiktionaryType=EnToTranslation",
"--input3LangPattern=" + isoToRegex.get(foreignIso),
"--input3LangCodePattern=" + foreignIso.toLowerCase(),
"--input3EnIndex=1",
"--input2=" + INPUTS + "wikiSplit/en/DE.data",
"--input2Name=enwiktionary.DE",
"--input2Format=enwiktionary",
+ "--input2WiktionaryType=EnForeign",
"--input2LangPattern=German",
"--input2LangCodePattern=de",
"--input2EnIndex=2",
"--input3=" + INPUTS + "wikiSplit/en/EN.data",
"--input3Name=enwiktionary.english",
"--input3Format=enwiktionary",
+ "--input3WiktionaryType=EnToTranslation",
"--input3LangPattern=German",
"--input3LangCodePattern=de",
"--input3EnIndex=2",
static final Pattern PARENTHESIZED = Pattern.compile("\\(([^)]+)\\)");
static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
+ // http://www.regular-expressions.info/unicode.html
static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}\\p{M}\\p{N}]+");
public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}\\p{M}\\p{N}]+");
}
+ /**
+  * Creates a tokenizer over the given wiki markup.
+  *
+  * @param wikiText  the raw wiki text; U+2028 (LINE SEPARATOR) characters are
+  *                  replaced by '\n' before tokenizing.
+  * @param isNewline initial value of justReturnedNewline.
+  */
public WikiTokenizer(final String wikiText, final boolean isNewline) {
- this.wikiText = wikiText;
- this.matcher = wikiTokenEvent.matcher(wikiText);
+ // Plain char-for-char replacement: no regex needed, and the length is
+ // unchanged so every offset stays valid.
+ this.wikiText = wikiText.replace('\u2028', '\n');
+ // Match against the normalized text so that matcher offsets and the
+ // characters read back from this.wikiText always refer to the same string.
+ this.matcher = wikiTokenEvent.matcher(this.wikiText);
justReturnedNewline = isNewline;
}
}
}
+ public List<String> errors() { // Accessor for the tokenizer's errors list.
+ return errors; // Returns the field itself, not a copy.
+ }
+
/** Returns the current value of justReturnedNewline (seeded by the constructor's isNewline argument). */
public boolean isNewline() {
return justReturnedNewline;
}
assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
if (matchText.length() == 0) {
- assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
+ assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
if (firstNewline == -1) {
firstNewline = matcher.end();
}
assertEquals("* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}", new WikiTokenizer(wikiText).nextToken().token());
assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
assertEquals("\n", new WikiTokenizer(wikiText).nextToken().nextToken().token());
+
+ wikiText = "* [[asdf|\u2028" +
+ "asdf]]";
+ assertEquals("* [[asdf|\n" +
+ "asdf]]", new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
+
}
public void testFunction() {
// Do nothing.
} else {
LOG.warning("Unexpected token: " + wikiTokenizer.token());
- assert false;
+ assert !wikiTokenizer.errors().isEmpty();
}
}
+rebuild dictionaries with bug fix
+make zip files
+rebuild index (for comparison), check it in
+download latest wiktionaries
+rebuild dictionaries.
+rebuild Check
+publish.
+
+
# Run tidy over every raw HTML resource, writing the result back to the same file; expansions are quoted so paths with spaces or glob characters stay intact.
for i in res/raw*/*.html; do echo "$i"; tidy --input-encoding utf8 --output-file "$i" "$i"; done
SpannableText: a persisted class holding a list of spans, each with a span type. (Might need its own builder.)