X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FWikiTokenizer.java;h=a7863c7f42ee3dc24882019e4bbdbf88658e8409;hb=e87d071962ee37719c9bea6740d93913ed4d8c7b;hp=b79013d2726847f1370170104ddb668d05afe5a8;hpb=6e732a6929b997865f763c26f5bbfd6dbf35c4fe;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index b79013d..a7863c7 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -22,542 +22,636 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { - - public static interface Callback { - void onPlainText(final String text); - void onMarkup(WikiTokenizer wikiTokenizer); - void onWikiLink(WikiTokenizer wikiTokenizer); - void onNewline(WikiTokenizer wikiTokenizer); - void onFunction(final WikiTokenizer tokenizer, String functionName, List functionPositionArgs, - Map functionNamedArgs); - void onHeading(WikiTokenizer wikiTokenizer); - void onListItem(WikiTokenizer wikiTokenizer); - void onComment(WikiTokenizer wikiTokenizer); - void onHtml(WikiTokenizer wikiTokenizer); - } - - //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(" + - "\\{\\{|\\}\\}|" + - "\\[\\[|\\]\\]|" + - "\\||" + // Need the | because we might have to find unescaped pipes - "=|" + // Need the = because we might have to find unescaped = - "", "\n"); + return this; + } + + if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { + errors.add("Close without open!"); + end += 2; + return this; + } + + if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { + isPlainText = true; + ++end; + return this; + } + + + if (this.matcher.find(start)) { + end = this.matcher.start(1); + isPlainText = true; + if (end == start) { + errors.add("Empty group: " + this.matcher.group()); + assert false; + } + return this; + } + + end = wikiText.length(); + return this; + + } finally { + if (!errors.isEmpty()) { + System.err.println("Errors: " + errors + ", token=" + token()); + } } - final int headerTitleEnd = end; - headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd); - // Skip ===... - while (end < len && ++end < len && wikiText.charAt(end) == '=') {} - final int headerEnd = end; - if (headerEnd - headerTitleEnd != headingDepth) { - errors.add("Mismatched header depth: " + token()); + + } + + public String token() { + final String token = wikiText.substring(start, end); + assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; + return token; + } + + final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "", "\n"); - return this; - } - - if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { - errors.add("Close without open!"); - end += 2; - return this; - } - - if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { - isPlainText = true; - ++end; - return this; - } - - - if (this.matcher.find(start)) { - end = this.matcher.start(1); - isPlainText = true; - if (end == start) { - errors.add("Empty group: " + this.matcher.group()); - assert false; - } - return this; - } - - end = wikiText.length(); - return this; - - } finally { - if (!errors.isEmpty()) { - System.err.println("Errors: " + errors + ", token=" + token()); - } - } - - } - - public String token() { - final String token = wikiText.substring(start, end); - assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; - return token; - } - - private int escapedFindEnd(final int start, final String toFind) { - assert tokenStack.isEmpty(); - - final boolean insideFunction = toFind.equals("}}"); - - int end = start; - int firstNewline = -1; - while (end < wikiText.length()) { - if (matcher.find(end)) { - final String matchText = matcher.group(); - final int matchStart = matcher.start(); - - assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group(); - if (matchText.length() == 0) { - assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n'; - if (firstNewline == -1) { - firstNewline = matcher.end(); - } - if (tokenStack.isEmpty() && toFind.equals("\n")) { - return matchStart; - } - ++end; - } else if (tokenStack.isEmpty() && matchText.equals(toFind)) { - // The normal return.... - if (insideFunction) { - addFunctionArg(insideFunction, matchStart); - } - return matcher.end(); - } else if (matchText.equals("[[") || matchText.equals("{{")) { - tokenStack.add(matchText); - } else if (matchText.equals("]]") || matchText.equals("}}")) { - if (tokenStack.size() > 0) { - final String removed = tokenStack.remove(tokenStack.size() - 1); - if (removed.equals("{{") && !matcher.group().equals("}}")) { - errors.add("Unmatched {{ error: " + wikiText.substring(start)); - return safeIndexOf(wikiText, start, "\n", "\n"); - } else if (removed.equals("[[") && !matcher.group().equals("]]")) { - errors.add("Unmatched [[ error: " + wikiText.substring(start)); - return safeIndexOf(wikiText, start, "\n", "\n"); + int singleBrackets = 0; + while (end < wikiText.length()) { + // Manual replacement for matcher.find(end), + // because Java regexp is a ridiculously slow implementation. + // Initialize to always match the end. + int matchIdx = 0; + for (int i = 0; i < nextMatch.length; ++i) { + if (nextMatch[i] <= end) { + nextMatch[i] = wikiText.indexOf(patterns[i], end); + if (nextMatch[i] == -1) nextMatch[i] = i > 0 ? 0x7fffffff : wikiText.length(); + } + if (nextMatch[i] < nextMatch[matchIdx]) { + matchIdx = i; + } } - } else { - errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n")); - // If we were looking for a newline - return safeIndexOf(wikiText, start, "\n", "\n"); - } - } else if (matchText.equals("|")) { - if (tokenStack.isEmpty()) { - addFunctionArg(insideFunction, matchStart); - } - } else if (matchText.equals("=")) { - if (tokenStack.isEmpty()) { - lastUnescapedEqualsPos = matchStart; - } - // Do nothing. These can match spuriously, and if it's not the thing - // we're looking for, keep on going. - } else if (matchText.equals(""); - if (end == -1) { - errors.add("Unmatched ", matchStart); + if (end == -1) { + errors.add("Unmatched