X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FWikiTokenizer.java;h=886e4f9074a20de21b36e6d85199dfa680aa7ca0;hb=2f2eaf2360096aa926fba1b03916a5fe23fbb707;hp=9bf2368152d1316bdd5b4cd33907ef6c1b470e2a;hpb=f864c3ee498c0b78f06fbc76eef3268f0c2b337e;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index 9bf2368..886e4f9 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -74,7 +74,7 @@ public final class WikiTokenizer { } //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(" + + private static final Pattern wikiTokenEvent = Pattern.compile( "\\{\\{|\\}\\}|" + "\\[\\[|\\]\\]|" + "\\||" + // Need the | because we might have to find unescaped pipes @@ -84,7 +84,7 @@ public final class WikiTokenizer { "
|" +
             "|" +
             "|" +
-            "$)", Pattern.MULTILINE);
+            "\n", Pattern.MULTILINE);
     private static final String listChars = "*#:;";
 
 
@@ -123,6 +123,7 @@ public final class WikiTokenizer {
 
     public WikiTokenizer(String wikiText, final boolean isNewline) {
         wikiText = wikiText.replace('\u2028', '\n');
+        wikiText = wikiText.replace('\u2029', '\n');
         wikiText = wikiText.replace('\u0085', '\n');
         this.wikiText = wikiText;
         this.matcher = wikiTokenEvent.matcher(wikiText);
@@ -158,7 +159,7 @@ public final class WikiTokenizer {
                 "
|" +
                 "|" +
                 "|" +
-                "[\n]"
+                "\n"
             ).matcher("");
 
     public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
@@ -356,7 +357,7 @@ public final class WikiTokenizer {
             }
 
             // Eat a newline if we're looking at one:
-            final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028' || wikiText.charAt(end) == '\u2029';
+            final boolean atNewline = wikiText.charAt(end) == '\n';
             if (atNewline) {
                 justReturnedNewline = true;
                 ++end;
@@ -462,20 +463,28 @@ public final class WikiTokenizer {
             }
 
 
-            if (this.matcher.find(start)) {
-                end = this.matcher.start(1);
+            while (end < wikiText.length()) {
+                int c = wikiText.charAt(end);
+                if (c == '\n' || c == '\'' || ((c - 0x1b) & 0xff9f) < 3) {
+                    matcher.region(end, wikiText.length());
+                    if (matcher.lookingAt()) break;
+                }
+                end++;
+            }
+            if (end != wikiText.length()) {
                 isPlainText = true;
                 if (end == start) {
                     // stumbled over a new type of newline?
                     // Or matcher is out of sync with checks above
                     errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end));
                     assert false;
+                    // Note: all newlines should be normalized to \n before calling this function
                     throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors);
                 }
                 return this;
             }
 
-            end = wikiText.length();
+            isPlainText = true;
             return this;
 
         } finally {