1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.util.ArrayList;
18 import java.util.LinkedHashMap;
19 import java.util.List;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
24 public final class WikiTokenizer {
26 public static interface Callback {
27 void onPlainText(final String text);
28 void onMarkup(WikiTokenizer wikiTokenizer);
29 void onWikiLink(WikiTokenizer wikiTokenizer);
30 void onNewline(WikiTokenizer wikiTokenizer);
31 void onFunction(final WikiTokenizer tokenizer, String functionName, List<String> functionPositionArgs,
32 Map<String, String> functionNamedArgs);
33 void onHeading(WikiTokenizer wikiTokenizer);
34 void onListItem(WikiTokenizer wikiTokenizer);
35 void onComment(WikiTokenizer wikiTokenizer);
36 void onHtml(WikiTokenizer wikiTokenizer);
39 public static class DoNothingCallback implements Callback {
42 public void onPlainText(String text) {
46 public void onMarkup(WikiTokenizer wikiTokenizer) {
50 public void onWikiLink(WikiTokenizer wikiTokenizer) {
54 public void onNewline(WikiTokenizer wikiTokenizer) {
58 public void onFunction(WikiTokenizer tokenizer, String functionName,
59 List<String> functionPositionArgs, Map<String, String> functionNamedArgs) {
63 public void onHeading(WikiTokenizer wikiTokenizer) {
67 public void onListItem(WikiTokenizer wikiTokenizer) {
71 public void onComment(WikiTokenizer wikiTokenizer) {
75 public void onHtml(WikiTokenizer wikiTokenizer) {
// Disabled earlier variant of the pattern that only matched line ends; kept for reference.
79 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
// Alternation of the boundaries at which nextToken() ends a plain-text run; group 1 is the
// boundary itself and nextToken() reads its start offset.
// NOTE(review): several alternatives of this pattern are not visible in this excerpt.
80 private static final Pattern wikiTokenEvent = Pattern.compile("(" +
83 "\\||" + // Need the | because we might have to find unescaped pipes
84 "=|" + // Need the = because we might have to find unescaped =
// With MULTILINE, $ also matches just before each line terminator, not only at end of input.
90 "$)", Pattern.MULTILINE);
// Characters that introduce a list item when they are the first character after a newline.
91 private static final String listChars = "*#:;";
// Input text, after the constructor has rewritten U+2028 and U+0085 to '\n'.
94 final String wikiText;
// Matcher of wikiTokenEvent over wikiText; nextToken() uses it to find the end of a plain-text run.
95 final Matcher matcher;
// Value reported by isNewline(); when set, nextToken() treats the current position as a line start.
97 boolean justReturnedNewline = true;
// Token start recorded by nextToken() at a line start; returnToLineStart() rewinds to it.
98 int lastLineStart = 0;
// Human-readable problems recorded while tokenizing.
102 final List<String> errors = new ArrayList<String>();
// Stack of currently open "[[" / "{{" markers, maintained by escapedFindEnd().
103 final List<String> tokenStack = new ArrayList<String>();
// Per-token state follows. clear() resets at least part of it (not all resets are visible here).
// Heading title text; non-null exactly when the current token is a heading.
106 private String headingWikiText;
// Number of leading '=' characters of the current heading.
107 private int headingDepth;
// End offset of the list prefix; -1 when the current token is not a list item.
108 private int listPrefixEnd;
// Token-kind flags read by the corresponding is*() accessors (presumably; most accessor bodies are not visible).
109 private boolean isPlainText;
110 private boolean isMarkup;
111 private boolean isComment;
112 private boolean isFunction;
113 private boolean isWikiLink;
114 private boolean isHtml;
// Offset of the first top-level '|' in the current link/function, or -1 if none was seen.
115 private int firstUnescapedPipePos;
// Offset of the most recent top-level '|' (or of the closing marker once addFunctionArg() ran for it).
117 private int lastUnescapedPipePos;
// Offset of the most recent top-level '='; separates key from value in a named function argument.
118 private int lastUnescapedEqualsPos;
// Positional arguments of the current function token, in order of appearance.
119 private final List<String> positionArgs = new ArrayList<String>();
// Named (key=value) arguments of the current function token, in insertion order.
120 private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
/**
 * Creates a tokenizer over {@code wikiText} with the newline flag initially set,
 * i.e. as if a newline had just been returned.
 *
 * @param wikiText the wiki markup to tokenize
 */
public WikiTokenizer(final String wikiText) {
    this(wikiText, true);
}
/**
 * Creates a tokenizer over {@code wikiText}.
 *
 * @param wikiText the wiki markup to tokenize; U+2028 (LINE SEPARATOR) and U+0085
 *     (NEXT LINE) are rewritten to {@code '\n'} before tokenizing
 * @param isNewline initial value of the newline flag; {@code true} makes the first
 *     token be parsed as if it started a line
 */
public WikiTokenizer(String wikiText, final boolean isNewline) {
    // Normalise the alternative line separators so the rest of the class only deals with '\n'.
    this.wikiText = wikiText.replace('\u2028', '\n').replace('\u0085', '\n');
    this.matcher = wikiTokenEvent.matcher(this.wikiText);
    this.justReturnedNewline = isNewline;
}
// Resets per-token state before the next token is parsed.
// NOTE(review): only some of the resets are visible in this excerpt.
135 private void clear() {
// A null heading text means "not a heading" (see isHeading()).
139 headingWikiText = null;
// -1 marks "no top-level pipe / equals sign seen yet".
149 firstUnescapedPipePos = -1;
150 lastUnescapedPipePos = -1;
151 lastUnescapedEqualsPos = -1;
152 positionArgs.clear();
// Pre-filter used by dispatch(): input with no match is reported as plain text without tokenizing.
// NOTE(review): the pattern body is not visible in this excerpt.
156 private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
167 public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
169 if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
170 callback.onPlainText(wikiText);
172 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
173 while (tokenizer.nextToken() != null) {
174 if (tokenizer.isPlainText()) {
175 callback.onPlainText(tokenizer.token());
176 } else if (tokenizer.isMarkup()) {
177 callback.onMarkup(tokenizer);
178 } else if (tokenizer.isWikiLink()) {
179 callback.onWikiLink(tokenizer);
180 } else if (tokenizer.isNewline()) {
181 callback.onNewline(tokenizer);
182 } else if (tokenizer.isFunction()) {
183 callback.onFunction(tokenizer, tokenizer.functionName(), tokenizer.functionPositionArgs(), tokenizer.functionNamedArgs());
184 } else if (tokenizer.isHeading()) {
185 callback.onHeading(tokenizer);
186 } else if (tokenizer.isListItem()) {
187 callback.onListItem(tokenizer);
188 } else if (tokenizer.isComment()) {
189 callback.onComment(tokenizer);
190 } else if (tokenizer.isHtml()) {
191 callback.onHtml(tokenizer);
192 } else if (!tokenizer.errors.isEmpty()) {
193 // Log was already printed....
195 throw new IllegalStateException("Unknown wiki state: " + tokenizer.token());
// Accessor for the recorded tokenizer errors.
// NOTE(review): body not visible in this excerpt; presumably returns the errors field, confirm.
201 public List<String> errors() {
205 public boolean isNewline() {
206 return justReturnedNewline;
209 public void returnToLineStart() {
210 end = start = lastLineStart;
211 justReturnedNewline = true;
214 public boolean isHeading() {
215 return headingWikiText != null;
// Returns the title text of the current heading, i.e. the text between the opening and closing '=' runs.
// NOTE(review): one statement before the return is not visible in this excerpt.
218 public String headingWikiText() {
220 return headingWikiText;
// Accessor for the heading depth (number of leading '=').
// NOTE(review): body not visible in this excerpt.
223 public int headingDepth() {
// Markup-token predicate. NOTE(review): body not visible; presumably returns the isMarkup flag, confirm.
228 public boolean isMarkup() {
// Comment-token predicate. NOTE(review): body not visible; presumably returns the isComment flag, confirm.
232 public boolean isComment() {
236 public boolean isListItem() {
237 return listPrefixEnd != -1;
// Returns the list prefix of the current token: the text from the token start up to listPrefixEnd.
// NOTE(review): one statement before the return is not visible in this excerpt.
240 public String listItemPrefix() {
242 return wikiText.substring(start, listPrefixEnd);
// Maps a list-prefix character to a tag name.
// NOTE(review): body not visible in this excerpt; the mapping cannot be confirmed from here.
245 public static String getListTag(char c) {
// Returns the content of the current list item: the text after the prefix up to the token end.
// NOTE(review): one statement before the return is not visible in this excerpt.
252 public String listItemWikiText() {
254 return wikiText.substring(listPrefixEnd, end);
// Function-token predicate. NOTE(review): body not visible; presumably returns the isFunction flag, confirm.
257 public boolean isFunction() {
// Returns the name of the current function token, with surrounding whitespace and newlines removed.
// NOTE(review): statements before the first branch are not visible in this excerpt.
261 public String functionName() {
// A top-level pipe offset was recorded: the name is the text between the opening "{{" and that offset.
264 if (firstUnescapedPipePos != -1) {
265 return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim());
// No pipe offset: take the text up to two characters before the token end. The max() keeps the
// end index from falling before the start index when the token is shorter than four characters.
267 final int safeEnd = Math.max(start + 2, end - 2);
268 return trimNewlines(wikiText.substring(start + 2, safeEnd).trim());
// Accessor for the positional arguments of the current function token.
// NOTE(review): body not visible; presumably returns positionArgs, confirm.
271 public List<String> functionPositionArgs() {
// Accessor for the named arguments of the current function token.
// NOTE(review): body not visible; presumably returns namedArgs, confirm.
275 public Map<String, String> functionNamedArgs() {
// Plain-text predicate. NOTE(review): body not visible; presumably returns the isPlainText flag, confirm.
279 public boolean isPlainText() {
// Wiki-link predicate. NOTE(review): body not visible; presumably returns the isWikiLink flag, confirm.
283 public boolean isWikiLink() {
// Returns the display text of the current wiki link, with newlines trimmed/replaced by trimNewlines().
// NOTE(review): statements before the first branch are not visible in this excerpt.
287 public String wikiLinkText() {
// With a top-level pipe, the display text is what follows the LAST pipe, up to the closing "]]".
290 if (lastUnescapedPipePos != -1) {
291 return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2));
// Without a pipe, the whole content between "[[" and "]]" is the display text.
293 assert start + 2 < wikiText.length() && end >= 2: wikiText;
294 return trimNewlines(wikiText.substring(start + 2, end - 2));
// Returns the link destination: the text between "[[" and the FIRST top-level pipe.
// NOTE(review): the result when no pipe was seen is not visible in this excerpt.
297 public String wikiLinkDest() {
300 if (firstUnescapedPipePos != -1) {
301 return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos));
// HTML-token predicate. NOTE(review): body not visible; presumably returns the isHtml flag, confirm.
306 public boolean isHtml() {
310 public boolean remainderStartsWith(final String prefix) {
311 return wikiText.startsWith(prefix, start);
// Advances through the rest of the current line.
// NOTE(review): the statements after the loop are not visible in this excerpt.
314 public void nextLine() {
// Remember where this call started; its use is in the part of the body that is not visible.
315 final int oldStart = start;
// Consume tokens until input is exhausted (nextToken() returns null) or a newline token is returned.
316 while(nextToken() != null && !isNewline()) {}
// Advances to the next token and classifies it; callers loop until this returns null
// (see dispatch() and toPlainText()).
// NOTE(review): many short statements of this method (resets, flag assignments, returns,
// closing braces) are not visible in this excerpt; the comments below describe only what is shown.
324 public WikiTokenizer nextToken() {
// When the previous token was a newline, the new token begins a line: remember its offset
// so returnToLineStart() can rewind to it.
329 if (justReturnedNewline) {
330 lastLineStart = start;
335 final int len = wikiText.length();
340 // Eat a newline if we're looking at one:
// NOTE(review): the constructor already rewrites U+2028 to '\n', so the second comparison looks redundant.
341 final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
343 justReturnedNewline = true;
// Line-start-only constructs: headings and list items.
348 if (justReturnedNewline) {
349 justReturnedNewline = false;
351 final char firstChar = wikiText.charAt(end);
// Heading: a run of '=' opens it, and the run length is the depth.
352 if (firstChar == '=') {
353 final int headerStart = end;
// Skip the opening run of '='.
355 while (++end < len && wikiText.charAt(end) == '=') {}
356 final int headerTitleStart = end;
357 headingDepth = headerTitleStart - headerStart;
// NOTE(review): the use of nextNewline is not visible in this excerpt.
360 final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
// Look for the closing '=' while honouring nested [[...]] / {{...}}.
361 final int closingEquals = escapedFindEnd(end, "=");
// If the returned offset is just past an '=', step back so the title ends before it.
362 if (wikiText.charAt(closingEquals - 1) == '=') {
363 end = closingEquals - 1;
368 final int headerTitleEnd = end;
369 headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
// Skip the closing run of '='.
371 while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
372 final int headerEnd = end;
// The closing run must be as long as the opening run; otherwise record an error.
373 if (headerEnd - headerTitleEnd != headingDepth) {
374 errors.add("Mismatched header depth: " + token());
// List item: one or more list characters at line start.
378 if (listChars.indexOf(firstChar) != -1) {
// Skip the whole prefix run (e.g. "**", "#:").
379 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
// The item extends to the end of the line, ignoring newlines nested inside [[...]] / {{...}}.
381 end = escapedFindEnd(start, "\n");
// Quote markup: the three-quote marker is tested before the two-quote one so the longer marker wins.
386 if (wikiText.startsWith("'''", start)) {
392 if (wikiText.startsWith("''", start)) {
// Wiki link: find the matching "]]"; flagged as a link only if no error has been recorded.
398 if (wikiText.startsWith("[[", start)) {
399 end = escapedFindEnd(start + 2, "]]");
400 isWikiLink = errors.isEmpty();
// Function/template call: find the matching "}}"; flagged only if no error has been recorded.
404 if (wikiText.startsWith("{{", start)) {
405 end = escapedFindEnd(start + 2, "}}");
406 isFunction = errors.isEmpty();
// Tag-delimited spans: search for the closing tag, with '\n' as the fallback end marker.
410 if (wikiText.startsWith("<pre>", start)) {
411 end = safeIndexOf(wikiText, start, "</pre>", "\n");
416 if (wikiText.startsWith("<ref>", start)) {
417 end = safeIndexOf(wikiText, start, "</ref>", "\n");
422 if (wikiText.startsWith("<math>", start)) {
423 end = safeIndexOf(wikiText, start, "</math>", "\n");
// Comment: ends at "-->", with '\n' as the fallback end marker.
428 if (wikiText.startsWith("<!--", start)) {
430 end = safeIndexOf(wikiText, start, "-->", "\n");
// A closing marker with nothing open is an error.
434 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
435 errors.add("Close without open!");
// A bare '|' or '=' at token start. NOTE(review): handling is not visible in this excerpt.
440 if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
// Otherwise: scan forward to the next boundary matched by wikiTokenEvent;
// the token ends where group 1 of the match begins.
447 if (this.matcher.find(start)) {
448 end = this.matcher.start(1);
451 errors.add("Empty group: " + this.matcher.group());
// No further boundary: the token runs to the end of the input.
457 end = wikiText.length();
// Report accumulated errors on stderr together with the offending token.
461 if (!errors.isEmpty()) {
462 System.err.println("Errors: " + errors + ", token=" + token());
468 public String token() {
469 final String token = wikiText.substring(start, end);
470 assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
// Literal markers searched for by escapedFindEnd(). Index 0 ("\n") is special-cased there:
// when it is not found, its position defaults to the end of the input instead of "never".
474 final static String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
// Scans forward from the current position for the terminator toFind, skipping terminators nested
// inside [[...]] or {{...}}, and records top-level '|' and '=' positions along the way.
// On a nesting error it records a message in errors and falls back to a newline-based end offset.
// NOTE(review): several short statements (initialisations, returns, closing braces) are not
// visible in this excerpt; the comments below describe only what is shown.
475 private int escapedFindEnd(final int start, final String toFind) {
// Each call starts with no open markers.
476 assert tokenStack.isEmpty();
// Only a search for "}}" collects function arguments.
478 final boolean insideFunction = toFind.equals("}}");
481 int firstNewline = -1;
// Cached offset of the next occurrence of each entry of patterns.
482 int[] nextMatch = new int[patterns.length];
483 for (int i = 0; i < nextMatch.length; ++i) {
// Count of currently open single '[' brackets.
486 int singleBrackets = 0;
487 while (end < wikiText.length()) {
488 // Manual replacement for matcher.find(end),
489 // because Java regexp is a ridiculously slow implementation.
490 // Initialize to always match the end.
492 for (int i = 0; i < nextMatch.length; ++i) {
// Refresh only the cached positions the scan has already reached or passed.
493 if (nextMatch[i] <= end) {
494 nextMatch[i] = wikiText.indexOf(patterns[i], end);
// Not found: the newline pattern (index 0) defaults to end of input, all others to "never".
495 if (nextMatch[i] == -1) nextMatch[i] = i > 0 ? 0x7fffffff : wikiText.length();
// Track the earliest match.
497 if (nextMatch[i] < nextMatch[matchIdx]) {
502 int matchStart = nextMatch[matchIdx];
503 String matchText = patterns[matchIdx];
504 int matchEnd = matchStart + matchText.length();
507 matchEnd = matchStart;
510 assert matchEnd > end || matchText.length() == 0: "Group=" + matchText;
// An empty matchText stands for a line end (a newline or the end of input).
511 if (matchText.length() == 0) {
512 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
// Remember the first line end seen; it is consulted after the loop if the search fails.
513 if (firstNewline == -1) {
514 firstNewline = matchEnd;
516 if (tokenStack.isEmpty() && toFind.equals("\n")) {
520 } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
521 // The normal return....
// The closing "}}" also terminates the last function argument.
522 if (insideFunction) {
523 addFunctionArg(insideFunction, matchStart);
526 } else if (matchText.equals("[")) {
528 } else if (matchText.equals("]")) {
529 if (singleBrackets > 0) singleBrackets--;
// Opening double markers are pushed so their contents are treated as nested.
530 } else if (matchText.equals("[[") || matchText.equals("{{")) {
531 tokenStack.add(matchText);
532 } else if (matchText.equals("]]") || matchText.equals("}}")) {
533 if (tokenStack.size() > 0) {
// Pop the innermost open marker and check that it pairs with this closer.
534 final String removed = tokenStack.remove(tokenStack.size() - 1);
535 if (removed.equals("{{") && !matchText.equals("}}")) {
536 if (singleBrackets >= 2) { // assume this is really two closing single ]
// Put the "{{" back: the "]]" did not belong to it.
538 tokenStack.add(removed);
540 errors.add("Unmatched {{ error: " + wikiText.substring(start, matchEnd));
541 return safeIndexOf(wikiText, start, "\n", "\n");
543 } else if (removed.equals("[[") && !matchText.equals("]]")) {
544 errors.add("Unmatched [[ error: " + wikiText.substring(start, matchEnd));
545 return safeIndexOf(wikiText, start, "\n", "\n");
// NOTE(review): String.replace is a literal replacement, so "\\\\n" emits two backslashes
// followed by 'n'; a single escaped backslash may have been intended. Confirm before changing.
548 errors.add("Pop too many " + matchText + " error: " + wikiText.substring(start, matchEnd).replace("\n", "\\\\n"));
549 // If we were looking for a newline
550 return safeIndexOf(wikiText, start, "\n", "\n");
// Only pipes outside nested markers separate arguments.
552 } else if (matchText.equals("|")) {
553 if (tokenStack.isEmpty()) {
554 addFunctionArg(insideFunction, matchStart);
// Only an '=' outside nested markers can separate a key from its value.
556 } else if (matchText.equals("=")) {
557 if (tokenStack.isEmpty()) {
558 lastUnescapedEqualsPos = matchStart;
560 // Do nothing. These can match spuriously, and if it's not the thing
561 // we're looking for, keep on going.
// Skip over an embedded comment so that markers inside it are ignored.
562 } else if (matchText.equals("<!--")) {
563 end = wikiText.indexOf("-->", matchStart);
565 errors.add("Unmatched <!-- error: " + wikiText.substring(start));
566 return safeIndexOf(wikiText, start, "\n", "\n");
568 } else if (matchText.equals("''") || (matchText.startsWith("<") && matchText.endsWith(">"))) {
// Every entry of patterns is handled above, so reaching this point is a programming error.
571 assert false : "Match text='" + matchText + "'";
572 throw new IllegalStateException();
575 // Inside the while loop. Just go forward.
576 end = Math.max(end, matchEnd);
// Ran off the end of the input.
578 if (toFind.equals("\n") && tokenStack.isEmpty()) {
579 // We were looking for the end, we got it.
582 errors.add("Couldn't find: " + (toFind.equals("\n") ? "newline" : toFind) + ", "+ wikiText.substring(start));
583 if (firstNewline != -1) {
589 private void addFunctionArg(final boolean insideFunction, final int matchStart) {
590 if (firstUnescapedPipePos == -1) {
591 firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
592 } else if (insideFunction) {
593 if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
594 final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
595 final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
596 namedArgs.put(trimNewlines(key), trimNewlines(value));
598 final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
599 positionArgs.add(trimNewlines(value));
602 lastUnescapedPipePos = matchStart;
605 static final String trimNewlines(String s) {
606 while (s.startsWith("\n")) {
609 while (s.endsWith("\n")) {
610 s = s.substring(0, s.length() - 1);
612 return s.replace('\n', ' ');
// Computes an end offset for a span that starts at start: it searches for target first and for
// backup second. The returned offset lies just past the marker that was found, except that a
// "\n" marker is left unconsumed.
// NOTE(review): the guards around the two returns and the final fallback return are not visible
// in this excerpt; presumably each return is taken only when indexOf found the marker. Confirm.
615 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
616 int close = s.indexOf(target, start);
618 // Don't step over a \n.
619 return close + (target.equals("\n") ? 0 : target.length());
// Second attempt with the backup marker.
621 close = s.indexOf(backup, start);
623 return close + (backup.equals("\n") ? 0 : backup.length());
628 public static String toPlainText(final String wikiText) {
629 final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
630 final StringBuilder builder = new StringBuilder();
631 while (wikiTokenizer.nextToken() != null) {
632 if (wikiTokenizer.isPlainText()) {
633 builder.append(wikiTokenizer.token());
634 } else if (wikiTokenizer.isWikiLink()) {
635 builder.append(wikiTokenizer.wikiLinkText());
636 } else if (wikiTokenizer.isNewline()) {
637 builder.append("\n");
638 } else if (wikiTokenizer.isFunction()) {
639 builder.append(wikiTokenizer.token());
642 return builder.toString();
645 public static StringBuilder appendFunction(final StringBuilder builder, final String name, List<String> args,
646 final Map<String, String> namedArgs) {
647 builder.append(name);
648 for (final String arg : args) {
649 builder.append("|").append(arg);
651 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
652 builder.append("|").append(entry.getKey()).append("=").append(entry.getValue());