X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2FWikiTokenizer.java;h=e2d59c45e85f90706cb51a330992d7d4f4222cad;hb=7819736ae570bf597936f0dc640f60644da15fc8;hp=d028acb62ff1481c55ac50fe758f09ce0f1f59ea;hpb=eeb5667c56b2074b7eeac531589c9f1bf55ba738;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java index d028acb..e2d59c4 100644 --- a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -1,97 +1,288 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.parser; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; public final class WikiTokenizer { //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); - private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|", "\n"); return this; } if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { - System.err.println("Close without open!"); + errors.add("Close without open!"); end += 2; return this; } + if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') { + isPlainText = true; + ++end; + return this; + } + if (this.matcher.find(start)) { end = this.matcher.start(1); + isPlainText = true; if (end == start) { - System.err.println(this.matcher.group()); + errors.add("Empty group: " + this.matcher.group()); assert false; } return this; @@ -129,29 +328,47 @@ public final class WikiTokenizer { end = wikiText.length(); return this; + } finally { + if (!errors.isEmpty()) { + System.err.println("Errors: " + errors + ", token=" + token()); + } + } + } public String token() { - return wikiText.substring(start, end); + final String token = wikiText.substring(start, end); + assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'"; + return token; } - private int escapedFind(final int start, final String toFind) { + private int escapedFindEnd(final int start, final String toFind) { assert tokenStack.isEmpty(); + final boolean insideFunction = toFind.equals("}}"); + int end = start; + int firstNewline = -1; while (end < wikiText.length()) { if (matcher.find(end)) { final String matchText = matcher.group(); final int matchStart = matcher.start(); + assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group(); if (matchText.length() == 0) { assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n'; + if (firstNewline == -1) { + firstNewline = matcher.end(); + } if (tokenStack.isEmpty() && toFind.equals("\n")) { return matchStart; } ++end; } else if (tokenStack.isEmpty() && matchText.equals(toFind)) { // The normal return.... + if (insideFunction) { + addFunctionArg(insideFunction, matchStart); + } return matcher.end(); } else if (matchText.equals("[[") || matchText.equals("{{")) { tokenStack.add(matchText); @@ -159,48 +376,115 @@ public final class WikiTokenizer { if (tokenStack.size() > 0) { final String removed = tokenStack.remove(tokenStack.size() - 1); if (removed.equals("{{") && !matcher.group().equals("}}")) { - System.err.println("Unmatched {{ error: " + wikiText.substring(start)); + errors.add("Unmatched {{ error: " + wikiText.substring(start)); return safeIndexOf(wikiText, start, "\n", "\n"); } else if (removed.equals("[[") && !matcher.group().equals("]]")) { - System.err.println("Unmatched [[ error: " + wikiText.substring(start)); + errors.add("Unmatched [[ error: " + wikiText.substring(start)); return safeIndexOf(wikiText, start, "\n", "\n"); } } else { - System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n")); + errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n")); // If we were looking for a newline return safeIndexOf(wikiText, start, "\n", "\n"); } + } else if (matchText.equals("|")) { + if (tokenStack.isEmpty()) { + addFunctionArg(insideFunction, matchStart); + } + } else if (matchText.equals("=")) { + if (tokenStack.isEmpty()) { + lastUnescapedEqualsPos = matchStart; + } + // Do nothing. These can match spuriously, and if it's not the thing + // we're looking for, keep on going. } else if (matchText.equals(""); if (end == -1) { - System.err.println("Unmatched