package com.hughes.android.dictionary.parser;
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class WikiTokenizer {
- public static interface Callback {
+ public interface Callback {
void onPlainText(final String text);
void onMarkup(WikiTokenizer wikiTokenizer);
void onWikiLink(WikiTokenizer wikiTokenizer);
}
//private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
- private static final Pattern wikiTokenEvent = Pattern.compile("(" +
+ private static final Pattern wikiTokenEvent = Pattern.compile(
"\\{\\{|\\}\\}|" +
"\\[\\[|\\]\\]|" +
"\\||" + // Need the | because we might have to find unescaped pipes
"<pre>|" +
"<math>|" +
"<ref>|" +
- "$)", Pattern.MULTILINE);
+ "\n", Pattern.MULTILINE);
private static final String listChars = "*#:;";
int end = 0;
int start = -1;
- final List<String> errors = new ArrayList<String>();
- final List<String> tokenStack = new ArrayList<String>();
+ final List<String> errors = new ArrayList<>();
+ final List<TokenDelim> tokenStack = new ArrayList<>();
private String headingWikiText;
private int lastUnescapedPipePos;
private int lastUnescapedEqualsPos;
- private final List<String> positionArgs = new ArrayList<String>();
- private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
+ private final List<String> positionArgs = new ArrayList<>();
+ private final Map<String,String> namedArgs = new LinkedHashMap<>();
public WikiTokenizer(final String wikiText) {
public WikiTokenizer(String wikiText, final boolean isNewline) {
wikiText = wikiText.replace('\u2028', '\n');
+ wikiText = wikiText.replace('\u2029', '\n');
wikiText = wikiText.replace('\u0085', '\n');
this.wikiText = wikiText;
this.matcher = wikiTokenEvent.matcher(wikiText);
namedArgs.clear();
}
- private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
+ private static final Matcher POSSIBLE_WIKI_TEXT = Pattern.compile(
"\\{\\{|" +
"\\[\\[|" +
"<!--|" +
"<pre>|" +
"<math>|" +
"<ref>|" +
- "[\n]"
- );
+ "\n"
+ ).matcher("");
public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
- // Optimization...
- if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
+ // Statistical background, from EN-DE dictionary generation:
+ // out of 12083000 calls, 9697686 can be skipped via the test
+ // for ', \n and ((c - 0x3b) & 0xff9f) < 2 (which covers among others
+ // <, { and [).
+ // This increased to 10006466 checking for <, { and [ specifically,
+ // and is minimally faster overall.
+ // A even more precise one using regex and checking for {{, [[, <!--, '',
+ // <pre>, <math>, <ref> and \n increased that to 10032846.
+ // Regex thus seems far too costly for a measly increase from 80%/82% to 83% rejection rate
+ // However completely removing it changes output (likely a bug), so leave it in for now
+ // but at least run it only on the 18% not caught by the faster logic.
+ // Original runtime: 1m29.708s
+ // Optimized: 1m19.170s
+ // Regex removed: 1m20.314s (not statistically significant)
+ boolean matched = false;
+ for (int i = 0; i < wikiText.length(); i++) {
+ int c = wikiText.charAt(i);
+ if (c == '\'' || c == '\n' || c == '<' || c == '[' || c == '{') {
+ matched = true;
+ break;
+ }
+ }
+ if (!matched || !POSSIBLE_WIKI_TEXT.reset(wikiText).find()) {
callback.onPlainText(wikiText);
} else {
final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
}
// Eat a newline if we're looking at one:
- final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
+ final boolean atNewline = wikiText.charAt(end) == '\n';
if (atNewline) {
justReturnedNewline = true;
++end;
// Skip non-=...
if (end < len) {
final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
- final int closingEquals = escapedFindEnd(end, "=");
+ final int closingEquals = escapedFindEnd(end, TokenDelim.EQUALS);
if (wikiText.charAt(closingEquals - 1) == '=') {
end = closingEquals - 1;
} else {
if (listChars.indexOf(firstChar) != -1) {
while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
listPrefixEnd = end;
- end = escapedFindEnd(start, "\n");
+ end = escapedFindEnd(start, TokenDelim.NEWLINE);
return this;
}
}
}
if (wikiText.startsWith("[[", start)) {
- end = escapedFindEnd(start + 2, "]]");
+ end = escapedFindEnd(start + 2, TokenDelim.DBRACKET_CLOSE);
isWikiLink = errors.isEmpty();
return this;
}
if (wikiText.startsWith("{{", start)) {
- end = escapedFindEnd(start + 2, "}}");
+ end = escapedFindEnd(start + 2, TokenDelim.BRACE_CLOSE);
isFunction = errors.isEmpty();
return this;
}
}
- if (this.matcher.find(start)) {
- end = this.matcher.start(1);
+ while (end < wikiText.length()) {
+ int c = wikiText.charAt(end);
+ if (c == '\n' || c == '\'' || ((c - 0x1b) & 0xff9f) < 3) {
+ matcher.region(end, wikiText.length());
+ if (matcher.lookingAt()) break;
+ }
+ end++;
+ }
+ if (end != wikiText.length()) {
isPlainText = true;
if (end == start) {
- errors.add("Empty group: " + this.matcher.group());
+ // stumbled over a new type of newline?
+ // Or matcher is out of sync with checks above
+ errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end));
assert false;
+ // Note: all newlines should be normalize to \n before calling this function
+ throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors);
}
return this;
}
- end = wikiText.length();
+ isPlainText = true;
return this;
} finally {
return token;
}
- final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
- private int escapedFindEnd(final int start, final String toFind) {
+ enum TokenDelim { NEWLINE, BRACE_OPEN, BRACE_CLOSE, DBRACKET_OPEN, DBRACKET_CLOSE, BRACKET_OPEN, BRACKET_CLOSE, PIPE, EQUALS, COMMENT }
+
+ private int tokenDelimLen(TokenDelim d) {
+ switch (d) {
+ case NEWLINE:
+ case BRACKET_OPEN:
+ case BRACKET_CLOSE:
+ case PIPE:
+ case EQUALS:
+ return 1;
+ case BRACE_OPEN:
+ case BRACE_CLOSE:
+ case DBRACKET_OPEN:
+ case DBRACKET_CLOSE:
+ return 2;
+ case COMMENT:
+ return 4;
+ default:
+ throw new RuntimeException();
+ }
+ }
+
+ static final String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
+ private int escapedFindEnd(final int start, final TokenDelim toFind) {
assert tokenStack.isEmpty();
- final boolean insideFunction = toFind.equals("}}");
+ final boolean insideFunction = toFind == TokenDelim.BRACE_CLOSE;
int end = start;
int firstNewline = -1;
- int[] nextMatch = new int[patterns.length];
- for (int i = 0; i < nextMatch.length; ++i) {
- nextMatch[i] = -2;
- }
int singleBrackets = 0;
while (end < wikiText.length()) {
// Manual replacement for matcher.find(end),
// because Java regexp is a ridiculously slow implementation.
// Initialize to always match the end.
- int matchIdx = 0;
- for (int i = 0; i < nextMatch.length; ++i) {
- if (nextMatch[i] <= end) {
- nextMatch[i] = wikiText.indexOf(patterns[i], end);
- if (nextMatch[i] == -1) nextMatch[i] = i > 0 ? 0x7fffffff : wikiText.length();
- }
- if (nextMatch[i] < nextMatch[matchIdx]) {
- matchIdx = i;
- }
+ TokenDelim match = TokenDelim.NEWLINE;
+ int matchStart = end;
+ for (; matchStart < wikiText.length(); matchStart++) {
+ int i = matchStart;
+ int c = wikiText.charAt(i);
+ if (c == '\n') break;
+ if (c == '{' && wikiText.startsWith("{{", i)) { match = TokenDelim.BRACE_OPEN; break; }
+ if (c == '}' && wikiText.startsWith("}}", i)) { match = TokenDelim.BRACE_CLOSE; break; }
+ if (c == '[') { match = wikiText.startsWith("[[", i) ? TokenDelim.DBRACKET_OPEN : TokenDelim.BRACKET_OPEN ; break; }
+ if (c == ']') { match = wikiText.startsWith("]]", i) ? TokenDelim.DBRACKET_CLOSE : TokenDelim.BRACKET_CLOSE ; break; }
+ if (c == '|') { match = TokenDelim.PIPE; break; }
+ if (c == '=') { match = TokenDelim.EQUALS; break; }
+ if (c == '<' && wikiText.startsWith("<!--", i)) { match = TokenDelim.COMMENT; break; }
}
- int matchStart = nextMatch[matchIdx];
- String matchText = patterns[matchIdx];
- int matchEnd = matchStart + matchText.length();
- if (matchIdx == 0) {
- matchText = "";
- matchEnd = matchStart;
+ int matchEnd = matchStart + (match == TokenDelim.NEWLINE ? 0 : tokenDelimLen(match));
+ if (match != TokenDelim.NEWLINE && tokenStack.isEmpty() && match == toFind) {
+ // The normal return....
+ if (insideFunction) {
+ addFunctionArg(insideFunction, matchStart);
+ }
+ return matchEnd;
}
-
- assert matchEnd > end || matchText.length() == 0: "Group=" + matchText;
- if (matchText.length() == 0) {
+ switch (match) {
+ case NEWLINE:
assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
if (firstNewline == -1) {
firstNewline = matchEnd;
}
- if (tokenStack.isEmpty() && toFind.equals("\n")) {
+ if (tokenStack.isEmpty() && toFind == TokenDelim.NEWLINE) {
return matchStart;
}
++end;
- } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
- // The normal return....
- if (insideFunction) {
- addFunctionArg(insideFunction, matchStart);
- }
- return matchEnd;
- } else if (matchText.equals("[")) {
+ break;
+ case BRACKET_OPEN:
singleBrackets++;
- } else if (matchText.equals("]")) {
+ break;
+ case BRACKET_CLOSE:
if (singleBrackets > 0) singleBrackets--;
- } else if (matchText.equals("[[") || matchText.equals("{{")) {
- tokenStack.add(matchText);
- } else if (matchText.equals("]]") || matchText.equals("}}")) {
- if (tokenStack.size() > 0) {
- final String removed = tokenStack.remove(tokenStack.size() - 1);
- if (removed.equals("{{") && !matchText.equals("}}")) {
+ break;
+ case DBRACKET_OPEN:
+ case BRACE_OPEN:
+ tokenStack.add(match);
+ break;
+ case DBRACKET_CLOSE:
+ case BRACE_CLOSE:
+ if (!tokenStack.isEmpty()) {
+ final TokenDelim removed = tokenStack.remove(tokenStack.size() - 1);
+ if (removed == TokenDelim.BRACE_OPEN && match != TokenDelim.BRACE_CLOSE) {
if (singleBrackets >= 2) { // assume this is really two closing single ]
singleBrackets -= 2;
tokenStack.add(removed);
errors.add("Unmatched {{ error: " + wikiText.substring(start, matchEnd));
return safeIndexOf(wikiText, start, "\n", "\n");
}
- } else if (removed.equals("[[") && !matchText.equals("]]")) {
+ } else if (removed == TokenDelim.DBRACKET_OPEN && match != TokenDelim.DBRACKET_CLOSE) {
errors.add("Unmatched [[ error: " + wikiText.substring(start, matchEnd));
return safeIndexOf(wikiText, start, "\n", "\n");
}
} else {
- errors.add("Pop too many " + matchText + " error: " + wikiText.substring(start, matchEnd).replace("\n", "\\\\n"));
+ errors.add("Pop too many " + wikiText.substring(matchStart, matchEnd) + " error: " + wikiText.substring(start, matchEnd).replace("\n", "\\\\n"));
// If we were looking for a newline
return safeIndexOf(wikiText, start, "\n", "\n");
}
- } else if (matchText.equals("|")) {
+ break;
+ case PIPE:
if (tokenStack.isEmpty()) {
addFunctionArg(insideFunction, matchStart);
}
- } else if (matchText.equals("=")) {
+ break;
+ case EQUALS:
if (tokenStack.isEmpty()) {
lastUnescapedEqualsPos = matchStart;
}
// Do nothing. These can match spuriously, and if it's not the thing
// we're looking for, keep on going.
- } else if (matchText.equals("<!--")) {
+ break;
+ case COMMENT:
end = wikiText.indexOf("-->", matchStart);
if (end == -1) {
errors.add("Unmatched <!-- error: " + wikiText.substring(start));
return safeIndexOf(wikiText, start, "\n", "\n");
}
- } else if (matchText.equals("''") || (matchText.startsWith("<") && matchText.endsWith(">"))) {
- // Don't care.
- } else {
- assert false : "Match text='" + matchText + "'";
- throw new IllegalStateException();
+ break;
+ default:
+ throw new RuntimeException();
}
// Inside the while loop. Just go forward.
end = Math.max(end, matchEnd);
}
- if (toFind.equals("\n") && tokenStack.isEmpty()) {
+ if (toFind == TokenDelim.NEWLINE && tokenStack.isEmpty()) {
// We were looking for the end, we got it.
return end;
}
- errors.add("Couldn't find: " + (toFind.equals("\n") ? "newline" : toFind) + ", "+ wikiText.substring(start));
+ errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
if (firstNewline != -1) {
return firstNewline;
}
lastUnescapedPipePos = matchStart;
}
- static final String trimNewlines(String s) {
+ static String trimNewlines(String s) {
while (s.startsWith("\n")) {
s = s.substring(1);
}