X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FWikiTokenizer.java;h=ca0193a5074604665ca7db55c28e9511e4e2f0f8;hb=15cb7acd69697acf9643396bf607e9b22fc73d08;hp=d6c8901aa6a6b6541c1d5b2ccd0e4dc4af56f507;hpb=21e752e044b6c0dd7d24e6da143068326beab2e3;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index d6c8901..ca0193a 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -14,633 +14,697 @@ package com.hughes.android.dictionary.parser; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { - - public static interface Callback { - void onPlainText(final String text); - void onMarkup(WikiTokenizer wikiTokenizer); - void onWikiLink(WikiTokenizer wikiTokenizer); - void onNewline(WikiTokenizer wikiTokenizer); - void onFunction(final WikiTokenizer tokenizer, String functionName, List functionPositionArgs, - Map functionNamedArgs); - void onHeading(WikiTokenizer wikiTokenizer); - void onListItem(WikiTokenizer wikiTokenizer); - void onComment(WikiTokenizer wikiTokenizer); - void onHtml(WikiTokenizer wikiTokenizer); - } - - public static class DoNothingCallback implements Callback { - - @Override - public void onPlainText(String text) { - } - - @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onNewline(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onFunction(WikiTokenizer tokenizer, String functionName, - List functionPositionArgs, Map functionNamedArgs) { - } - - @Override - public void onHeading(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onComment(WikiTokenizer wikiTokenizer) { - } - - @Override - public void onHtml(WikiTokenizer wikiTokenizer) { - } - } - - //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(" + - "\\{\\{|\\}\\}|" + - "\\[\\[|\\]\\]|" + - "\\||" + // Need the | because we might have to find unescaped pipes - "=|" + // Need the = because we might have to find unescaped = - "", "\n"); + return this; + } + + if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { + errors.add("Close without open!"); + end += 2; + return this; + } + + if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { + isPlainText = true; + ++end; + return this; + } + + + while (end < wikiText.length()) { + int c = wikiText.charAt(end); + if (c == '\n' || c == '\'' || ((c - 0x1b) & 0xff9f) < 3) { + matcher.region(end, wikiText.length()); + if (matcher.lookingAt()) break; + } + end++; + } + if (end != wikiText.length()) { + isPlainText = true; + if (end == start) { + // stumbled over a new type of newline? + // Or matcher is out of sync with checks above + errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end)); + assert false; + // Note: all newlines should be normalize to \n before calling this function + throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors); + } + return this; + } + + isPlainText = true; + return this; + + } finally { + if (!errors.isEmpty()) { + System.err.println("Errors: " + errors + ", token=" + token()); + } } - final int headerTitleEnd = end; - headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd); - // Skip ===... - while (end < len && ++end < len && wikiText.charAt(end) == '=') {} - final int headerEnd = end; - if (headerEnd - headerTitleEnd != headingDepth) { - errors.add("Mismatched header depth: " + token()); + + } + + public String token() { + final String token = wikiText.substring(start, end); + assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; + return token; + } + + enum TokenDelim { NEWLINE, BRACE_OPEN, BRACE_CLOSE, DBRACKET_OPEN, DBRACKET_CLOSE, BRACKET_OPEN, BRACKET_CLOSE, PIPE, EQUALS, COMMENT } + + private int tokenDelimLen(TokenDelim d) { + switch (d) { + case NEWLINE: + case BRACKET_OPEN: + case BRACKET_CLOSE: + case PIPE: + case EQUALS: + return 1; + case BRACE_OPEN: + case BRACE_CLOSE: + case DBRACKET_OPEN: + case DBRACKET_CLOSE: + return 2; + case COMMENT: + return 4; + default: + throw new RuntimeException(); } - return this; - } - if (listChars.indexOf(firstChar) != -1) { - while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {} - listPrefixEnd = end; - end = escapedFindEnd(start, "\n"); - return this; - } - } - - if (wikiText.startsWith("'''", start)) { - isMarkup = true; - end = start + 3; - return this; - } - - if (wikiText.startsWith("''", start)) { - isMarkup = true; - end = start + 2; - return this; - } - - if (wikiText.startsWith("[[", start)) { - end = escapedFindEnd(start + 2, "]]"); - isWikiLink = errors.isEmpty(); - return this; - } - - if (wikiText.startsWith("{{", start)) { - end = escapedFindEnd(start + 2, "}}"); - isFunction = errors.isEmpty(); - return this; - } - - if (wikiText.startsWith("
", start)) {
-      end = safeIndexOf(wikiText, start, "
", "\n"); - isHtml = true; - return this; - } - - if (wikiText.startsWith("", start)) { - end = safeIndexOf(wikiText, start, "", "\n"); - isHtml = true; - return this; - } - - if (wikiText.startsWith("", start)) { - end = safeIndexOf(wikiText, start, "", "\n"); - isHtml = true; - return this; - } - - if (wikiText.startsWith("", "\n"); - return this; - } - - if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { - errors.add("Close without open!"); - end += 2; - return this; - } - - if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { - isPlainText = true; - ++end; - return this; - } - - - if (this.matcher.find(start)) { - end = this.matcher.start(1); - isPlainText = true; - if (end == start) { - errors.add("Empty group: " + this.matcher.group()); - assert false; - } - return this; - } - - end = wikiText.length(); - return this; - - } finally { - if (!errors.isEmpty()) { - System.err.println("Errors: " + errors + ", token=" + token()); - } - } - - } - - public String token() { - final String token = wikiText.substring(start, end); - assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; - return token; - } - - final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "|", "=", "", matchStart); + if (end == -1) { + errors.add("Unmatched "); - if (end == -1) { - errors.add("Unmatched