From: Thad Hughes Date: Thu, 24 Nov 2011 21:27:14 +0000 (-0800) Subject: go X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=50408cbdec646b586a7c54d0d0d86e807d6c9657 go --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..23178b2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +dictInputs +dictOutputs/ +bin diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index 693db6c..928047d 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -41,7 +41,7 @@ public class DictionaryBuilderTest extends TestCase { }); // Check it once: - assertFilesEqual("testdata/wiktionary.it.golden", "testdata/wiktionary.it.test"); + assertFilesEqual("testdata/wiktionary.it.golden2", "testdata/wiktionary.it.test"); // Check it again. diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index 95e910d..6e11e0e 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -6,12 +6,16 @@ import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; import java.util.Set; import java.util.regex.Pattern; -import com.hughes.android.dictionary.engine.DictionaryBuilder; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; @@ -20,6 +24,9 @@ import com.hughes.android.dictionary.engine.PairEntry.Pair; public class EnWiktionaryXmlParser { + // TODO: look for {{ and [[ and ", "\n"); - } else if (matcher.group().equals("
")) {
-        lineEnd = safeIndexOf(wikiText, lineEnd, "
", "\n"); - } else if (matcher.group().equals("")) { - lineEnd = safeIndexOf(wikiText, lineEnd, "", "\n"); - } - } - if (lineStack.size() > 0 && firstNewline != -1) { - lineEnd = firstNewline + 1; - } - final String result = wikiText.substring(lineStart, lineEnd); - lineStart = lineEnd; - return cleanUpLine(result); - } - - - static int safeIndexOf(final String s, final int start, final String target, final String backup) { - int close = s.indexOf(target, start); - if (close != -1) { - return close + target.length(); - } - close = s.indexOf(backup, start); - if (close != -1) { - return close + backup.length(); - } - return s.length(); - } - - public static String cleanUpLine(String line) { - int pos; - while ((pos = line.indexOf(""); - if (end != -1) { - line = line.substring(0, pos) + line.substring(end + 3); - } - } - final Matcher matcher = whitespace.matcher(line); - line = matcher.replaceAll(" "); - line = line.trim(); - return line; - } - - String stuffedLine = null; - public void stuffLine(final String line) { - assert stuffedLine == null; - stuffedLine = line; - } - - - -} diff --git a/src/com/hughes/android/dictionary/parser/WikiLineReaderTest.java b/src/com/hughes/android/dictionary/parser/WikiLineReaderTest.java deleted file mode 100644 index 6d275de..0000000 --- a/src/com/hughes/android/dictionary/parser/WikiLineReaderTest.java +++ /dev/null @@ -1,87 +0,0 @@ -package com.hughes.android.dictionary.parser; - -import java.util.Arrays; - -import junit.framework.TestCase; - -public class WikiLineReaderTest extends TestCase { - - public void testSimple() { - final String wikiText = - "Hi" + "\n" + - "Hello thad you're '''pretty''' cool '''''over''''' there." + "\n" + - "hi " + "\n" + - "" + "\n" + - "asdf\n" + - "# {{template_in_list}}" + "\n" + - "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list - "here's [[some blah|some]] wikitext." + "\n" + - "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" + - "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" + - "== Header 2 ==" + "\n" + - "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" + - "{{unterminated}" + "\n" + - "[[unterminated]" + "\n" + - "=== {{header-template}} ===" + "\n"; - - final String[] expected = new String[] { - "Hi", - "Hello thad you're '''pretty''' cool '''''over''''' there.", - "hi", - "", - "asdf", - "# {{template_in_list}}", - "[[wikitext]]:[[wikitext]]", - "here's [[some blah|some]] wikitext.", - "here's a {{template|this has an = sign|blah=2|blah2=3| blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text.", - "== Header 2 ==", - "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}", - "{{unterminated}", - "[[unterminated]", - "=== {{header-template}} ===", - }; - - final WikiLineReader wikiLineReader = new WikiLineReader(wikiText); - for (int i = 0; i < expected.length; ++i) { - assertEquals(expected[i], wikiLineReader.readLine()); - } - final String end = wikiLineReader.readLine(); - if (end != null) { - System.out.println(end); - } - assertNull(end); - } - - public void testWikiHeading() { - assertNull(WikiHeading.getHeading("")); - assertNull(WikiHeading.getHeading("=")); - assertNull(WikiHeading.getHeading("==")); - assertNull(WikiHeading.getHeading("=a")); - assertNull(WikiHeading.getHeading("=a==")); - assertNull(WikiHeading.getHeading("===a==")); - assertNull(WikiHeading.getHeading("===a====")); - assertNull(WikiHeading.getHeading("a=")); - assertEquals("a", WikiHeading.getHeading("=a=").name); - assertEquals(1, WikiHeading.getHeading("=a=").depth); - assertEquals("aa", WikiHeading.getHeading("==aa==").name); - assertEquals(2, WikiHeading.getHeading("==aa==").depth); - } - - - public void testWikiFunction() { - assertNull(WikiFunction.getFunction("")); - assertNull(WikiFunction.getFunction("[[asdf]]")); - assertNull(WikiFunction.getFunction("asd [[asdf]]asdf ")); - assertEquals("a", WikiFunction.getFunction("{{a}}").name); - assertEquals("a", WikiFunction.getFunction("{{a|b}}").name); - assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name); - assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name); - assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name); - assertEquals(Arrays.asList("a", "b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args); - assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b")); - assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer")); - } - -} diff --git a/src/com/hughes/android/dictionary/parser/WikiParser.java b/src/com/hughes/android/dictionary/parser/WikiParser.java deleted file mode 100644 index 37c7a53..0000000 --- a/src/com/hughes/android/dictionary/parser/WikiParser.java +++ /dev/null @@ -1,261 +0,0 @@ -package com.hughes.android.dictionary.parser; - -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.hughes.util.StringUtil; - -public class WikiParser { - - private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|(==+)\\s*$|"); - if (end == -1) { - callback.onUnterminated(" '''pretty''' cool '''''over''''' there." + "\n" + - "hi " + "\n" + - "" + "\n" + - "asdf\n" + - "# li" + "\n" + - "# li2" + "\n" + - "# {{template_in_list}}" + "\n" + - "## li2.2" + "\n" + - "Hi again." + "\n" + - "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list - "here's [[some blah|some]] wikitext." + "\n" + - "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" + - "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" + - "== Header 2 ==" + "\n" + - "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" + - "{{unterminated}" + "\n" + -// "==== Header 4 ====" + "\n" + -// "===== Header 5 =====" + "\n" + - "=== {{header-template}} ===" + "\n"; - - final String expected = "Hi Hello thad you're \n" + - "comment: not \n" + - " pretty cool over there. hi \n" + - "comment:\n" + - "multi-line\n" + - "# comment \n" + - "\n" + - "\n" + - " asdf\n" + - "LIST (#) li\n" + - "LIST (#) li2\n" + - "LIST (#) \n" + - "template:[template_in_list]{}\n" + - "\n" + - "LIST (##) li2.2\n" + - "\n" + - " Hi again. [[wikitext]]:[[wikitext]] here's [[some]] wikitext. here's a \n" + - "template:[template, this has an = sign]{blah=2, blah2=3, blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}\n" + - " and some more text.\n" + - "HEADER Header 2 \n" + - " \n" + - "template:[some-func]{blah={{nested-func|n2}}, blah2=asdf}\n" + - " \n" + - "template:[unterminate]{}" + "\n" + - "\n" + - "HEADER \n" + - "template:[header-template]{}\n" + - " \n" + - " "; - final PrintWikiCallback callback = new PrintWikiCallback(); - WikiParser.parse(text, callback); - assertEquals(expected, callback.builder.toString()); - - } - - - static final class PrintWikiCallback implements WikiCallback { - final StringBuilder builder = new StringBuilder(); - - @Override - public void onComment(String text) { - builder.append("\ncomment:").append(text).append("\n"); - } - - @Override - public void onFormatBold(boolean boldOn) { - builder.append(boldOn ? "" : ""); - } - - @Override - public void onFormatItalic(boolean italicOn) { - builder.append(italicOn ? "" : ""); - } - - @Override - public void onWikiLink(String[] args) { - builder.append("[[").append(args[args.length - 1]).append("]]"); - } - - @Override - public void onTemplate(final List positionalArgs, final Map namedArgs) { - builder.append("\ntemplate:").append(positionalArgs).append(namedArgs).append("\n"); - } - - @Override - public void onText(String text) { - builder.append(text); - } - - @Override - public void onHeadingStart(int depth) { - builder.append("\nHEADER"); - for (int i = 0; i < depth; ++i) { - builder.append(" "); - } - } - - @Override - public void onHeadingEnd(int depth) { - builder.append("\n"); - } - - @Override - public void onNewLine() { - builder.append(" "); - } - - @Override - public void onNewParagraph() { - builder.append("\n\n"); - } - - @Override - public void onListItemStart(String header, int[] section) { - builder.append("\n").append("LIST (").append(header).append(")"); - } - - @Override - public void onListItemEnd(String header, int[] section) { - //builder.append("\n"); - } - - @Override - public void onUnterminated(String start, String rest) { - //throw new RuntimeException("bad"); - } - - @Override - public void onInvalidHeaderEnd(String rest) { - throw new RuntimeException("bad"); - } - - } - - - -} diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index d028acb..fffe357 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -1,46 +1,196 @@ package com.hughes.android.dictionary.parser; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|", "\n"); return this; } if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { - System.err.println("Close without open!"); + errors.add("Close without open!"); end += 2; return this; } + if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { + isPlainText = true; + ++end; + return this; + } + if (this.matcher.find(start)) { end = this.matcher.start(1); + isPlainText = true; if (end == start) { - System.err.println(this.matcher.group()); + errors.add("Empty group: " + this.matcher.group()); assert false; } return this; @@ -129,21 +308,32 @@ public final class WikiTokenizer { end = wikiText.length(); return this; + } finally { + if (!errors.isEmpty()) { + System.err.println("Errors: " + errors + ", token=" + token()); + } + } + } public String token() { - return wikiText.substring(start, end); + final String token = wikiText.substring(start, end); + assert token.equals("\n") || !token.endsWith("\n") : token; + return token; } - private int escapedFind(final int start, final String toFind) { + private int escapedFindEnd(final int start, final String toFind) { assert tokenStack.isEmpty(); + final boolean insideFunction = toFind.equals("}}"); + int end = start; while (end < wikiText.length()) { if (matcher.find(end)) { final String matchText = matcher.group(); final int matchStart = matcher.start(); + assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group(); if (matchText.length() == 0) { assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n'; if (tokenStack.isEmpty() && toFind.equals("\n")) { @@ -152,6 +342,9 @@ public final class WikiTokenizer { ++end; } else if (tokenStack.isEmpty() && matchText.equals(toFind)) { // The normal return.... + if (insideFunction) { + addFunctionArg(insideFunction, matchStart); + } return matcher.end(); } else if (matchText.equals("[[") || matchText.equals("{{")) { tokenStack.add(matchText); @@ -159,48 +352,95 @@ public final class WikiTokenizer { if (tokenStack.size() > 0) { final String removed = tokenStack.remove(tokenStack.size() - 1); if (removed.equals("{{") && !matcher.group().equals("}}")) { - System.err.println("Unmatched {{ error: " + wikiText.substring(start)); + errors.add("Unmatched {{ error: " + wikiText.substring(start)); return safeIndexOf(wikiText, start, "\n", "\n"); } else if (removed.equals("[[") && !matcher.group().equals("]]")) { - System.err.println("Unmatched [[ error: " + wikiText.substring(start)); + errors.add("Unmatched [[ error: " + wikiText.substring(start)); return safeIndexOf(wikiText, start, "\n", "\n"); } } else { - System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n")); + errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n")); // If we were looking for a newline return safeIndexOf(wikiText, start, "\n", "\n"); } + } else if (matchText.equals("|")) { + if (tokenStack.isEmpty()) { + addFunctionArg(insideFunction, matchStart); + } + } else if (matchText.equals("=")) { + if (tokenStack.isEmpty()) { + lastUnescapedEqualsPos = matchStart; + } + // Do nothing. These can match spuriously, and if it's not the thing + // we're looking for, keep on going. } else if (matchText.equals(""); if (end == -1) { - System.err.println("Unmatched '''pretty''' cool '''''over''''' there." + "\n" + + "Hello =thad| you're '''pretty''' cool '''''over''''' there." + "\n" + "hi " + "\n" + @@ -36,7 +186,11 @@ public class WikiTokenizerTest extends TestCase { final String[] expectedTokens = new String[] { "Hi", "\n", - "Hello thad you're ", + "Hello ", + "=", + "thad", + "|", + " you're ", "", " ", "'''", @@ -80,8 +234,10 @@ public class WikiTokenizerTest extends TestCase { "\n", "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}", "\n", - "{{mismatched]]\n", - "[[mismatched}}\n", + "{{mismatched]]", + "\n", + "[[mismatched}}", + "\n", "{extraterminated", "}}", "\n", @@ -105,34 +261,4 @@ public class WikiTokenizerTest extends TestCase { assertEquals(Arrays.asList(expectedTokens), actualTokens); } - public void testWikiHeading() { - assertNull(WikiHeading.getHeading("")); - assertNull(WikiHeading.getHeading("=")); - assertNull(WikiHeading.getHeading("==")); - assertNull(WikiHeading.getHeading("=a")); - assertNull(WikiHeading.getHeading("=a==")); - assertNull(WikiHeading.getHeading("===a==")); - assertNull(WikiHeading.getHeading("===a====")); - assertNull(WikiHeading.getHeading("a=")); - assertEquals("a", WikiHeading.getHeading("=a=").name); - assertEquals(1, WikiHeading.getHeading("=a=").depth); - assertEquals("aa", WikiHeading.getHeading("==aa==").name); - assertEquals(2, WikiHeading.getHeading("==aa==").depth); - } - - - public void testWikiFunction() { - assertNull(WikiFunction.getFunction("")); - assertNull(WikiFunction.getFunction("[[asdf]]")); - assertNull(WikiFunction.getFunction("asd [[asdf]]asdf ")); - assertEquals("a", WikiFunction.getFunction("{{a}}").name); - assertEquals("a", WikiFunction.getFunction("{{a|b}}").name); - assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name); - assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name); - assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name); - assertEquals(Arrays.asList("b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args); - assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b")); - assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer")); - } - }