package com.hughes.android.dictionary.parser;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class WikiTokenizer {
- public static interface Callback {
+ public interface Callback {
void onPlainText(final String text);
void onMarkup(WikiTokenizer wikiTokenizer);
void onWikiLink(WikiTokenizer wikiTokenizer);
int end = 0;
int start = -1;
- final List<String> errors = new ArrayList<String>();
- final List<String> tokenStack = new ArrayList<String>();
+ final List<String> errors = new ArrayList<>();
+ final List<String> tokenStack = new ArrayList<>();
private String headingWikiText;
private int lastUnescapedPipePos;
private int lastUnescapedEqualsPos;
- private final List<String> positionArgs = new ArrayList<String>();
- private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
+ private final List<String> positionArgs = new ArrayList<>();
+ private final Map<String,String> namedArgs = new LinkedHashMap<>();
public WikiTokenizer(final String wikiText) {
namedArgs.clear();
}
- private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
+ private static final Matcher POSSIBLE_WIKI_TEXT = Pattern.compile(
"\\{\\{|" +
"\\[\\[|" +
"<!--|" +
"<math>|" +
"<ref>|" +
"[\n]"
- );
+ ).matcher("");
public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
- // Optimization...
- if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
+ // Statistical background, from EN-DE dictionary generation:
+ // out of 12083000 calls, 9697686 can be skipped via the test
+ // for ', \n and ((c - 0x3b) & 0xff9f) < 2 (which covers among others
+ // <, { and [).
+ // This increased to 10006466 checking for <, { and [ specifically,
+ // and is minimally faster overall.
+ // An even more precise one using regex and checking for {{, [[, <!--, '',
+ // <pre>, <math>, <ref> and \n increased that to 10032846.
+ // Regex thus seems far too costly for a measly increase from 80%/82% to 83% rejection rate
+ // However, completely removing it changes output (likely a bug), so leave it in for now
+ // but at least run it only on the 18% not caught by the faster logic.
+ // Original runtime: 1m29.708s
+ // Optimized: 1m19.170s
+ // Regex removed: 1m20.314s (not statistically significant)
+ boolean matched = false;
+ for (int i = 0; i < wikiText.length(); i++) {
+ int c = wikiText.charAt(i);
+ if (c == '\'' || c == '\n' || c == '<' || c == '[' || c == '{') {
+ matched = true;
+ break;
+ }
+ }
+ if (!matched || !POSSIBLE_WIKI_TEXT.reset(wikiText).find()) {
callback.onPlainText(wikiText);
} else {
final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
}
// Eat a newline if we're looking at one:
- final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
+ final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028' || wikiText.charAt(end) == '\u2029';
if (atNewline) {
justReturnedNewline = true;
++end;
end = this.matcher.start(1);
isPlainText = true;
if (end == start) {
- errors.add("Empty group: " + this.matcher.group());
+ // stumbled over a new type of newline?
+ // Or matcher is out of sync with checks above
+ errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end));
assert false;
+ throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors);
}
return this;
}
return token;
}
- final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
+ static final String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
private int escapedFindEnd(final int start, final String toFind) {
assert tokenStack.isEmpty();
int end = start;
int firstNewline = -1;
int[] nextMatch = new int[patterns.length];
- for (int i = 0; i < nextMatch.length; ++i) {
- nextMatch[i] = -2;
- }
+ Arrays.fill(nextMatch, -2);
int singleBrackets = 0;
while (end < wikiText.length()) {
// Manual replacement for matcher.find(end),
// We were looking for the end, we got it.
return end;
}
- errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+ errors.add("Couldn't find: " + (toFind.equals("\n") ? "newline" : toFind) + ", "+ wikiText.substring(start));
if (firstNewline != -1) {
return firstNewline;
}
lastUnescapedPipePos = matchStart;
}
- static final String trimNewlines(String s) {
+ static String trimNewlines(String s) {
while (s.startsWith("\n")) {
s = s.substring(1);
}