1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.util.ArrayList;
18 import java.util.LinkedHashMap;
19 import java.util.List;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
24 public final class WikiTokenizer {
26 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
// Pattern for the delimiters ("token events") the tokenizer searches for. The
// alternatives visible here are a literal pipe, a literal equals sign and
// end-of-line ($ with MULTILINE); the whole alternation is capture group 1.
// NOTE(review): other code in this class compares matches against "{{", "}}",
// "[[", "]]", "<!--" and "''", so more alternatives presumably exist in lines
// that are not shown in this listing - confirm against the full file.
27 private static final Pattern wikiTokenEvent = Pattern.compile("(" +
30 "\\||" + // Need the | because we might have to find unescaped pipes
31 "=|" + // Need the = because we might have to find unescaped =
34 "$)", Pattern.MULTILINE);
// Characters that nextToken() accepts as list-item prefix characters.
35 private static final String listChars = "*#:;";
// The wiki markup being tokenized, and the wikiTokenEvent matcher bound to it
// (both assigned in the two-argument constructor).
38 final String wikiText;
39 final Matcher matcher;
// Value reported by isNewline(); nextToken() also tests it before looking for
// heading and list-item syntax.
41 boolean justReturnedNewline = true;
// Position that returnToLineStart() rewinds to; refreshed at the top of nextToken().
42 int lastLineStart = 0;
// NOTE(review): the start/end position fields used throughout this class are not
// declared in the lines visible in this listing.
// Messages recorded while tokenizing (mismatched headers, unclosed markup, ...).
46 final List<String> errors = new ArrayList<String>();
// Openers ("[[" or "{{") seen by escapedFindEnd() that have not been closed yet.
47 final List<String> tokenStack = new ArrayList<String>();
// --- State describing the current token; read by the accessor methods. ---
// Heading title text (null when the token is not a heading) and the length of
// the opening run of '=' characters.
50 private String headingWikiText;
51 private int headingDepth;
// End of the list prefix; isListItem() treats -1 as "not a list item".
52 private int listPrefixEnd;
// Token-kind flags.
53 private boolean isPlainText;
54 private boolean isMarkup;
55 private boolean isComment;
56 private boolean isFunction;
57 private boolean isWikiLink;
// Indexes into wikiText of pipes / equals signs found outside nested markup by
// escapedFindEnd(); clear() resets them to -1.
58 private int firstUnescapedPipePos;
60 private int lastUnescapedPipePos;
61 private int lastUnescapedEqualsPos;
// Arguments collected by addFunctionArg() for a {{...}} token.
// NOTE(review): Map is used here but no java.util.Map import is visible in this
// listing - confirm the import exists in the full file.
62 private final List<String> positionArgs = new ArrayList<String>();
63 private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
// Creates a tokenizer over wikiText.
// NOTE(review): the body is not visible in this listing; presumably it delegates
// to the two-argument constructor - confirm.
66 public WikiTokenizer(final String wikiText) {
// Creates a tokenizer over wikiText and binds the shared wikiTokenEvent pattern
// to it. isNewline seeds justReturnedNewline, the flag nextToken() tests before
// looking for heading and list-item syntax.
70 public WikiTokenizer(final String wikiText, final boolean isNewline) {
71 this.wikiText = wikiText;
72 this.matcher = wikiTokenEvent.matcher(wikiText);
73 justReturnedNewline = isNewline;
// Resets per-token state. The resets visible here are the heading text (to null)
// and the three pipe/equals positions (to -1).
// NOTE(review): further resets may sit in lines not shown in this listing.
76 private void clear() {
80 headingWikiText = null;
89 firstUnescapedPipePos = -1;
90 lastUnescapedPipePos = -1;
91 lastUnescapedEqualsPos = -1;
// Returns the justReturnedNewline flag. toPlainText() emits a line break when
// this is true for a token.
96 public boolean isNewline() {
97 return justReturnedNewline;
// Rewinds both start and end to lastLineStart and sets justReturnedNewline, so
// the tokenizer is positioned as if it were at the beginning of that line again.
100 public void returnToLineStart() {
101 end = start = lastLineStart;
102 justReturnedNewline = true;
// True when the current token is a heading, i.e. headingWikiText has been set.
105 public boolean isHeading() {
106 return headingWikiText != null;
// Returns the heading title text captured by nextToken() (the span between the
// opening and closing runs of '='); null when the token is not a heading.
109 public String headingWikiText() {
111 return headingWikiText;
// NOTE(review): body not visible in this listing; presumably returns the
// headingDepth field, which nextToken() sets to the length of the opening run
// of '=' characters - confirm.
114 public int headingDepth() {
// NOTE(review): body not visible in this listing; presumably returns the
// isMarkup flag - confirm.
119 public boolean isMarkup() {
// NOTE(review): body not visible in this listing; presumably returns the
// isComment flag - confirm.
123 public boolean isComment() {
// True when the current token is a list item, i.e. a list prefix end position
// has been recorded (listPrefixEnd is not -1).
127 public boolean isListItem() {
128 return listPrefixEnd != -1;
// Returns the list prefix of the current token: the text from the token start
// up to (not including) listPrefixEnd.
131 public String listItemPrefix() {
133 return wikiText.substring(start, listPrefixEnd);
// Returns the list item content: the text from listPrefixEnd to the token end.
136 public String listItemWikiText() {
138 return wikiText.substring(listPrefixEnd, end);
// NOTE(review): body not visible in this listing; presumably returns the
// isFunction flag that nextToken() sets for {{...}} tokens - confirm.
141 public boolean isFunction() {
// Returns the function name of a {{...}} token: the text after the two opening
// characters up to the first top-level pipe, or, when there is no such pipe,
// up to the two closing characters.
145 public String functionName() {
148 if (firstUnescapedPipePos != -1) {
149 return wikiText.substring(start + 2, firstUnescapedPipePos);
151 return wikiText.substring(start + 2, end - 2);
// NOTE(review): body not visible in this listing; presumably returns the
// positionArgs list filled by addFunctionArg() - confirm.
154 public List<String> functionPositionArgs() {
// NOTE(review): body not visible in this listing; presumably returns the
// namedArgs map filled by addFunctionArg() - confirm.
158 public Map<String, String> functionNamedArgs() {
// NOTE(review): body not visible in this listing; presumably returns the
// isPlainText flag - confirm.
162 public boolean isPlainText() {
// NOTE(review): body not visible in this listing; presumably returns the
// isWikiLink flag that nextToken() sets for [[...]] tokens - confirm.
166 public boolean isWikiLink() {
// Returns the display text of a [[...]] token: the text after the last
// top-level pipe when one exists, otherwise everything between the two opening
// and the two closing characters.
170 public String wikiLinkText() {
173 if (lastUnescapedPipePos != -1) {
174 return wikiText.substring(lastUnescapedPipePos + 1, end - 2);
// Sanity check that the token is long enough to hold both delimiters.
176 assert start + 2 < wikiText.length() && end >= 2: wikiText;
177 return wikiText.substring(start + 2, end - 2);
// Returns the link destination of a [[...]] token. When a top-level pipe exists
// this is the text between the two opening characters and the first pipe.
// NOTE(review): the no-pipe branch is not visible in this listing.
180 public String wikiLinkDest() {
183 if (firstUnescapedPipePos != -1) {
184 return wikiText.substring(start + 2, firstUnescapedPipePos);
// Tests whether wikiText, read from the current token start position, begins
// with prefix. Note that the comparison is anchored at start, not at end.
189 public boolean remainderStartsWith(final String prefix) {
190 return wikiText.startsWith(prefix, start);
// Consumes tokens until nextToken() returns null or a token for which
// isNewline() is true.
// NOTE(review): oldStart is captured here but its use is in lines not visible
// in this listing.
193 public void nextLine() {
194 final int oldStart = start;
195 while(nextToken() != null && !isNewline()) {}
// Advances to the next token and updates the token-state fields that the
// accessor methods read. Callers in this class (nextLine, toPlainText) treat a
// null return as "no more tokens".
// NOTE(review): many lines of this method (closing braces, return statements,
// the statements that advance start and set the is* flags) are not visible in
// this listing. The comments below describe only what the shown lines do.
203 public WikiTokenizer nextToken() {
// Remember where the current line begins so returnToLineStart() can rewind.
208 if (justReturnedNewline) {
209 lastLineStart = start;
214 final int len = wikiText.length();
219 // Eat a newline if we're looking at one:
// Both line feed and U+2028 (line separator) count as a line break.
220 final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
222 justReturnedNewline = true;
// Heading and list-item syntax is examined under this flag test.
227 if (justReturnedNewline) {
228 justReturnedNewline = false;
230 final char firstChar = wikiText.charAt(end);
// Heading: a run of '=' followed by the title and a closing run of '='.
231 if (firstChar == '=') {
232 final int headerStart = end;
// Skip the opening run of '='; its length becomes the heading depth.
234 while (++end < len && wikiText.charAt(end) == '=') {}
235 final int headerTitleStart = end;
236 headingDepth = headerTitleStart - headerStart;
// Locate the closing '=' outside nested markup. If the character just before
// the returned position is '=', move end back onto it so the title excludes it.
239 final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
240 final int closingEquals = escapedFindEnd(end, "=");
241 if (wikiText.charAt(closingEquals - 1) == '=') {
242 end = closingEquals - 1;
247 final int headerTitleEnd = end;
248 headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
// Skip the closing run of '=' and record an error if its length differs from
// the opening run.
250 while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
251 final int headerEnd = end;
252 if (headerEnd - headerTitleEnd != headingDepth) {
253 errors.add("Mismatched header depth: " + token());
// List item: skip the run of list-prefix characters, then extend the token to
// the end of the line as found by escapedFindEnd (which tracks nested markup).
257 if (listChars.indexOf(firstChar) != -1) {
258 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
260 end = escapedFindEnd(start, "\n");
// Quote-run markup: the three-quote form is tested before the two-quote form
// so the longer run is matched first.
265 if (wikiText.startsWith("'''", start)) {
271 if (wikiText.startsWith("''", start)) {
// Wiki link: scan to the matching "]]". The token only counts as a link when
// no errors have been recorded.
277 if (wikiText.startsWith("[[", start)) {
278 end = escapedFindEnd(start + 2, "]]");
279 isWikiLink = errors.isEmpty();
// Function: scan to the matching "}}". The token only counts as a function
// when no errors have been recorded.
283 if (wikiText.startsWith("{{", start)) {
284 end = escapedFindEnd(start + 2, "}}");
285 isFunction = errors.isEmpty();
// <pre>, <math> and comment blocks: the token ends just past the closing tag,
// or at the next line feed when no closing tag is found (safeIndexOf backup).
289 if (wikiText.startsWith("<pre>", start)) {
290 end = safeIndexOf(wikiText, start, "</pre>", "\n");
294 if (wikiText.startsWith("<math>", start)) {
295 end = safeIndexOf(wikiText, start, "</math>", "\n");
299 if (wikiText.startsWith("<!--", start)) {
301 end = safeIndexOf(wikiText, start, "-->", "\n");
// A closing delimiter at token start has no matching opener.
305 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
306 errors.add("Close without open!");
// A lone pipe or equals sign at token start gets its own branch (body not
// visible in this listing).
311 if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
// Otherwise scan to the next token event; the token ends where capture group 1
// begins.
318 if (this.matcher.find(start)) {
319 end = this.matcher.start(1);
322 errors.add("Empty group: " + this.matcher.group());
// Fallback position: the end of the input.
328 end = wikiText.length();
// Report any recorded errors on standard error together with the current token.
332 if (!errors.isEmpty()) {
333 System.err.println("Errors: " + errors + ", token=" + token());
// Returns the raw text of the current token, wikiText from start to end.
// The assertion documents the invariant that a token is either exactly a line
// feed or does not end with one.
// NOTE(review): the return statement is not visible in this listing.
339 public String token() {
340 final String token = wikiText.substring(start, end);
341 assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
// Searches forward for toFind outside nested markup, i.e. while tokenStack
// (the stack of unclosed "[[" / "{{" openers) is empty, and returns
// matcher.end() for that match. While scanning it records the positions of
// top-level pipes and equals signs and, when toFind is "}}", collects function
// arguments through addFunctionArg(). On malformed input it appends a message
// to errors and returns safeIndexOf(wikiText, start, "\n", "\n").
// The parameter named start is the scan origin for this call.
// NOTE(review): the initialisation of end for this method, several closing
// braces and some return statements are not visible in this listing.
345 private int escapedFindEnd(final int start, final String toFind) {
346 assert tokenStack.isEmpty();
348 final boolean insideFunction = toFind.equals("}}");
351 int firstNewline = -1;
352 while (end < wikiText.length()) {
353 if (matcher.find(end)) {
354 final String matchText = matcher.group();
355 final int matchStart = matcher.start();
357 assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
// An empty match comes from the end-of-line alternative of the pattern.
358 if (matchText.length() == 0) {
359 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
// Remember the first line end that was crossed.
360 if (firstNewline == -1) {
361 firstNewline = matcher.end();
// Case: the caller is looking for a line feed and nothing is open.
363 if (tokenStack.isEmpty() && toFind.equals("\n")) {
367 } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
368 // The normal return....
// A closing "}}" also terminates the argument that precedes it.
369 if (insideFunction) {
370 addFunctionArg(insideFunction, matchStart);
372 return matcher.end();
// Nested opener: push it so its contents are skipped until the matching closer.
373 } else if (matchText.equals("[[") || matchText.equals("{{")) {
374 tokenStack.add(matchText);
// Closer: pop the most recent opener and check that the kinds agree.
375 } else if (matchText.equals("]]") || matchText.equals("}}")) {
376 if (tokenStack.size() > 0) {
377 final String removed = tokenStack.remove(tokenStack.size() - 1);
378 if (removed.equals("{{") && !matcher.group().equals("}}")) {
379 errors.add("Unmatched {{ error: " + wikiText.substring(start));
380 return safeIndexOf(wikiText, start, "\n", "\n");
381 } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
382 errors.add("Unmatched [[ error: " + wikiText.substring(start));
383 return safeIndexOf(wikiText, start, "\n", "\n");
// Closer with nothing open.
386 errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n"));
387 // If we were looking for a newline
388 return safeIndexOf(wikiText, start, "\n", "\n");
// Top-level pipe: record it (and the argument before it, inside a function).
390 } else if (matchText.equals("|")) {
391 if (tokenStack.isEmpty()) {
392 addFunctionArg(insideFunction, matchStart);
// Top-level equals sign: remember it so addFunctionArg() can split key=value.
394 } else if (matchText.equals("=")) {
395 if (tokenStack.isEmpty()) {
396 lastUnescapedEqualsPos = matchStart;
398 // Do nothing. These can match spuriously, and if it's not the thing
399 // we're looking for, keep on going.
400 } else if (matchText.equals("<!--")) {
// NOTE(review): indexOf("-->") is called without a from-index, so it searches
// from the beginning of wikiText rather than from this comment opener; an
// earlier "-->" in the text would be found instead. This likely should search
// from the current position - confirm and fix in the full file.
401 end = wikiText.indexOf("-->");
403 errors.add("Unmatched <!-- error: " + wikiText.substring(start));
404 return safeIndexOf(wikiText, start, "\n", "\n");
406 } else if (matchText.equals("''")) {
// Any other match text means the pattern and this dispatch chain disagree.
409 assert false : "Match text='" + matchText + "'";
410 throw new IllegalStateException();
413 // Hmmm, we didn't find the closing symbol we were looking for...
414 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
415 return safeIndexOf(wikiText, start, "\n", "\n");
418 // Inside the while loop. Just go forward.
419 end = Math.max(end, matcher.end());
421 if (toFind.equals("\n") && tokenStack.isEmpty()) {
422 // We were looking for the end, we got it.
// A line end was crossed without finding toFind: record the failure.
425 if (firstNewline != -1) {
426 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
// Records a top-level delimiter found at matchStart (a pipe, or the closing
// "}}" of a function - see the call sites in escapedFindEnd).
// The first delimiter only sets firstUnescapedPipePos and lastUnescapedPipePos.
// For later delimiters inside a function, the text since the previous pipe is
// stored as an argument: as key/value in namedArgs when a top-level equals sign
// lies after the previous pipe, otherwise as a positional argument.
// NOTE(review): closing braces are not visible in this listing, so the exact
// nesting of the final assignment is assumed from statement order - confirm.
432 private void addFunctionArg(final boolean insideFunction, final int matchStart) {
433 if (firstUnescapedPipePos == -1) {
434 firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
435 } else if (insideFunction) {
// An equals sign after the previous pipe splits this argument into key and value.
436 if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
437 final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
438 final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
439 namedArgs.put(key, value);
441 final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
442 positionArgs.add(value);
445 lastUnescapedPipePos = matchStart;
// Finds target in s at or after start and returns the index just past it,
// except that a line-feed target is not consumed (the index of the line feed
// itself is returned). The same computation is then done with backup.
// NOTE(review): the guards around the two returns and the final fallback
// return are not visible in this listing; presumably each return is taken only
// when the corresponding indexOf succeeded - confirm.
448 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
449 int close = s.indexOf(target, start);
451 // Don't step over a \n.
452 return close + (target.equals("\n") ? 0 : target.length());
454 close = s.indexOf(backup, start);
456 return close + (backup.equals("\n") ? 0 : backup.length());
// Converts wiki markup to text by tokenizing it and concatenating, in order:
// plain-text tokens verbatim, the display text of wiki links, a line feed for
// newline tokens, and the raw token text of function tokens. No branch is
// visible here for any other token kind, so those are not appended.
461 public static String toPlainText(final String wikiText) {
462 final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
463 final StringBuilder builder = new StringBuilder();
464 while (wikiTokenizer.nextToken() != null) {
465 if (wikiTokenizer.isPlainText()) {
466 builder.append(wikiTokenizer.token());
467 } else if (wikiTokenizer.isWikiLink()) {
468 builder.append(wikiTokenizer.wikiLinkText());
469 } else if (wikiTokenizer.isNewline()) {
470 builder.append("\n");
// Functions are kept as their original {{...}} source text.
471 } else if (wikiTokenizer.isFunction()) {
472 builder.append(wikiTokenizer.token());
475 return builder.toString();