// Copyright 2011 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.hughes.android.dictionary.parser; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { static interface Callback { void onPlainText(WikiTokenizer wikiTokenizer); void onMarkup(WikiTokenizer wikiTokenizer); void onWikiLink(WikiTokenizer wikiTokenizer); void onNewline(WikiTokenizer wikiTokenizer); void onFunction(String functionName, List functionPositionArgs, Map functionNamedArgs); void onHeading(WikiTokenizer wikiTokenizer); void onListItem(WikiTokenizer wikiTokenizer); void onComment(WikiTokenizer wikiTokenizer); } //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); private static final Pattern wikiTokenEvent = Pattern.compile("(" + "\\{\\{|\\}\\}|" + "\\[\\[|\\]\\]|" + "\\||" + // Need the | because we might have to find unescaped pipes "=|" + // Need the = because we might have to find unescaped = "", "\n"); return this; } if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { errors.add("Close without open!"); end += 2; return this; } if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { isPlainText = true; ++end; return this; } if (this.matcher.find(start)) { end = this.matcher.start(1); isPlainText = true; if (end == start) { 
errors.add("Empty group: " + this.matcher.group());
          // Reached only when the plain-text run would be empty (end == start):
          // the match is recorded above and, with assertions enabled, this fails fast.
          assert false;
        }
        return this;
      }

      // The matcher found nothing after start, so the token runs to the end of wikiText.
      // NOTE(review): unlike the branches above, this path does not set isPlainText in
      // the code visible here -- confirm whether trailing text should be flagged as plain text.
      end = wikiText.length();
      return this;

    } finally {
      // Runs on every exit from the try block: if any errors were recorded, print them
      // to stderr together with the current token.
      // NOTE(review): token() contains an assertion, so with assertions enabled this
      // logging call can itself throw -- confirm that is acceptable.
      if (!errors.isEmpty()) {
        System.err.println("Errors: " + errors + ", token=" + token());
      }
    }
  }

  /**
   * Returns the text of the current token, i.e. {@code wikiText.substring(start, end)}.
   *
   * <p>When assertions are enabled, checks that the token is either exactly a single
   * newline or does not end with a newline.
   *
   * @return the substring of the wiki text between {@code start} and {@code end}
   */
  public String token() {
    final String token = wikiText.substring(start, end);
    assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
    return token;
  }

  /**
   * Scans forward from {@code start} with the shared matcher, looking for {@code toFind}
   * while no nested opener is pending on {@code tokenStack}.
   *
   * <p>Only part of this method is visible in this chunk; the description covers the
   * visible paths. Recorded errors go to {@code errors}.
   *
   * @param start index in {@code wikiText} at which scanning begins
   * @param toFind the delimiter being searched for; when it is {@code "}}"} the scan is
   *     treated as being inside a function, and when it is {@code "\n"} a zero-length
   *     match ends the scan
   * @return on the paths visible here: the start of a zero-length match when searching
   *     for a newline, the end of the match when {@code toFind} is found with an empty
   *     {@code tokenStack}, or the result of {@code safeIndexOf} after an error
   */
  private int escapedFindEnd(final int start, final String toFind) {
    // Precondition: no openers left over from a previous scan.
    assert tokenStack.isEmpty();

    // Searching for "}}" means the scan is over a function's arguments.
    final boolean insideFunction = toFind.equals("}}");

    int end = start;
    // End offset of the first zero-length match seen; -1 until one is seen.
    int firstNewline = -1;
    while (end < wikiText.length()) {
      if (matcher.find(end)) {
        final String matchText = matcher.group();
        final int matchStart = matcher.start();

        // Every non-empty match must extend past the current search position.
        assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
        if (matchText.length() == 0) {
          // A zero-length match is expected only at end of input or just before a newline.
          assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
          if (firstNewline == -1) {
            firstNewline = matcher.end();
          }
          // With nothing open and a newline as the target, this is where the scan stops.
          if (tokenStack.isEmpty() && toFind.equals("\n")) {
            return matchStart;
          }
          // Step forward one character before searching again.
          ++end;
        } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
          // The normal return....
if (insideFunction) { addFunctionArg(insideFunction, matchStart); } return matcher.end(); } else if (matchText.equals("[[") || matchText.equals("{{")) { tokenStack.add(matchText); } else if (matchText.equals("]]") || matchText.equals("}}")) { if (tokenStack.size() > 0) { final String removed = tokenStack.remove(tokenStack.size() - 1); if (removed.equals("{{") && !matcher.group().equals("}}")) { errors.add("Unmatched {{ error: " + wikiText.substring(start)); return safeIndexOf(wikiText, start, "\n", "\n"); } else if (removed.equals("[[") && !matcher.group().equals("]]")) { errors.add("Unmatched [[ error: " + wikiText.substring(start)); return safeIndexOf(wikiText, start, "\n", "\n"); } } else { errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n")); // If we were looking for a newline return safeIndexOf(wikiText, start, "\n", "\n"); } } else if (matchText.equals("|")) { if (tokenStack.isEmpty()) { addFunctionArg(insideFunction, matchStart); } } else if (matchText.equals("=")) { if (tokenStack.isEmpty()) { lastUnescapedEqualsPos = matchStart; } // Do nothing. These can match spuriously, and if it's not the thing // we're looking for, keep on going. } else if (matchText.equals(""); if (end == -1) { errors.add("Unmatched