package com.hughes.android.dictionary.parser;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class WikiTokenizer {
- public static interface Callback {
+ public interface Callback {
void onPlainText(final String text);
void onMarkup(WikiTokenizer wikiTokenizer);
void onWikiLink(WikiTokenizer wikiTokenizer);
int end = 0;
int start = -1;
- final List<String> errors = new ArrayList<String>();
- final List<String> tokenStack = new ArrayList<String>();
+ final List<String> errors = new ArrayList<>();
+ final List<String> tokenStack = new ArrayList<>();
private String headingWikiText;
private int lastUnescapedPipePos;
private int lastUnescapedEqualsPos;
- private final List<String> positionArgs = new ArrayList<String>();
- private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
+ private final List<String> positionArgs = new ArrayList<>();
+ private final Map<String,String> namedArgs = new LinkedHashMap<>();
public WikiTokenizer(final String wikiText) {
namedArgs.clear();
}
- private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
+ private static final Matcher POSSIBLE_WIKI_TEXT = Pattern.compile(
"\\{\\{|" +
"\\[\\[|" +
"<!--|" +
"<math>|" +
"<ref>|" +
"[\n]"
- );
+ ).matcher("");
public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
- // Optimization...
- if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
+ // Statistical background, from EN-DE dictionary generation:
+ // out of 12083000 calls, 9697686 can be skipped via the test
+ // for ', \n and ((c - 0x3b) & 0xff9f) < 2 (which covers among others
+ // <, { and [).
+ // This increased to 10006466 checking for <, { and [ specifically,
+ // and is minimally faster overall.
+ // An even more precise one using regex and checking for {{, [[, <!--, '',
+ // <pre>, <math>, <ref> and \n increased that to 10032846.
+ // Regex thus seems far too costly for a measly increase from 80%/82% to 83% rejection rate
+ // However, completely removing it changes output (likely a bug), so leave it in for now
+ // but at least run it only on the 18% not caught by the faster logic.
+ // Original runtime: 1m29.708s
+ // Optimized: 1m19.170s
+ // Regex removed: 1m20.314s (not statistically significant)
+ boolean matched = false;
+ for (int i = 0; i < wikiText.length(); i++) {
+ int c = wikiText.charAt(i);
+ if (c == '\'' || c == '\n' || c == '<' || c == '[' || c == '{') {
+ matched = true;
+ break;
+ }
+ }
+ if (!matched || !POSSIBLE_WIKI_TEXT.reset(wikiText).find()) {
callback.onPlainText(wikiText);
} else {
final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
}
// Eat a newline if we're looking at one:
- final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
+ final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028' || wikiText.charAt(end) == '\u2029';
if (atNewline) {
justReturnedNewline = true;
++end;
end = this.matcher.start(1);
isPlainText = true;
if (end == start) {
- errors.add("Empty group: " + this.matcher.group());
+ // stumbled over a new type of newline?
+ // Or matcher is out of sync with checks above
+ errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end));
assert false;
+ throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors);
}
return this;
}
return token;
}
- final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
+ static final String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
private int escapedFindEnd(final int start, final String toFind) {
assert tokenStack.isEmpty();
int end = start;
int firstNewline = -1;
int[] nextMatch = new int[patterns.length];
- for (int i = 0; i < nextMatch.length; ++i) {
- nextMatch[i] = -2;
- }
+ Arrays.fill(nextMatch, -2);
int singleBrackets = 0;
while (end < wikiText.length()) {
// Manual replacement for matcher.find(end),
// We were looking for the end, we got it.
return end;
}
- errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+ errors.add("Couldn't find: " + (toFind.equals("\n") ? "newline" : toFind) + ", "+ wikiText.substring(start));
if (firstNewline != -1) {
return firstNewline;
}
lastUnescapedPipePos = matchStart;
}
- static final String trimNewlines(String s) {
+ static String trimNewlines(String s) {
while (s.startsWith("\n")) {
s = s.substring(1);
}