1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.util.ArrayList;
18 import java.util.LinkedHashMap;
19 import java.util.List;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
24 public final class WikiTokenizer {
/**
 * Receiver for the tokens produced by dispatch(): exactly one of these methods is invoked
 * per token, chosen by the token kind the tokenizer reports.
 */
26 public static interface Callback {
// A run of text with no wiki markup; receives the token text itself.
27 void onPlainText(final String text);
// Called when the tokenizer reports isMarkup().
28 void onMarkup(WikiTokenizer wikiTokenizer);
// Called for a wiki link token; see wikiLinkText() / wikiLinkDest() on the tokenizer.
29 void onWikiLink(WikiTokenizer wikiTokenizer);
// Called when the tokenizer reports isNewline().
30 void onNewline(WikiTokenizer wikiTokenizer);
// Called for a function token; receives the function name plus its positional and
// named arguments as reported by the tokenizer.
31 void onFunction(final WikiTokenizer tokenizer, String functionName, List<String> functionPositionArgs,
32 Map<String, String> functionNamedArgs);
// Called for a heading token; see headingWikiText() / headingDepth() on the tokenizer.
33 void onHeading(WikiTokenizer wikiTokenizer);
// Called for a list item token; see listItemPrefix() / listItemWikiText() on the tokenizer.
34 void onListItem(WikiTokenizer wikiTokenizer);
// Called when the tokenizer reports isComment().
35 void onComment(WikiTokenizer wikiTokenizer);
// Called when the tokenizer reports isHtml().
36 void onHtml(WikiTokenizer wikiTokenizer);
/**
 * Callback implementation that declares every method of the interface, so subclasses
 * only need to override the token kinds they care about.
 * NOTE(review): going by the class name the method bodies are presumably empty - confirm.
 */
39 public static class DoNothingCallback implements Callback {
42 public void onPlainText(String text) {
46 public void onMarkup(WikiTokenizer wikiTokenizer) {
50 public void onWikiLink(WikiTokenizer wikiTokenizer) {
54 public void onNewline(WikiTokenizer wikiTokenizer) {
58 public void onFunction(WikiTokenizer tokenizer, String functionName,
59 List<String> functionPositionArgs, Map<String, String> functionNamedArgs) {
63 public void onHeading(WikiTokenizer wikiTokenizer) {
67 public void onListItem(WikiTokenizer wikiTokenizer) {
71 public void onComment(WikiTokenizer wikiTokenizer) {
75 public void onHtml(WikiTokenizer wikiTokenizer) {
79 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
// Delimiter pattern the tokenizer steps through: a single capturing group that is an
// alternation of the markup delimiters. It is compiled with MULTILINE so that the final
// "$" alternative matches (as an empty match) at the end of every line; escapedFindEnd()
// treats such an empty match as a line end.
80 private static final Pattern wikiTokenEvent = Pattern.compile("(" +
83 "\\||" + // Need the | because we might have to find unescaped pipes
84 "=|" + // Need the = because we might have to find unescaped =
87 "$)", Pattern.MULTILINE);
88 private static final String listChars = "*#:;";
// The text being tokenized (U+2028 replaced by '\n' in the constructor) and the
// delimiter matcher created in the constructor.
91 final String wikiText;
92 final Matcher matcher;
// Flag reported by isNewline(); also tells nextToken() that a new line is starting.
94 boolean justReturnedNewline = true;
// Offset recorded by nextToken() at the start of a line; returnToLineStart() rewinds to it.
95 int lastLineStart = 0;
// Human-readable descriptions of parse problems found while tokenizing.
99 final List<String> errors = new ArrayList<String>();
// Openers ("[[" / "{{") that are still unclosed while escapedFindEnd() scans for a closer.
100 final List<String> tokenStack = new ArrayList<String>();
// --- Description of the current token ---
// Heading title text; non-null only while the current token is a heading.
103 private String headingWikiText;
// Number of leading '=' characters of the current heading.
104 private int headingDepth;
// End offset of the list prefix characters; -1 when the current token is not a list item.
105 private int listPrefixEnd;
106 private boolean isPlainText;
107 private boolean isMarkup;
108 private boolean isComment;
109 private boolean isFunction;
110 private boolean isWikiLink;
111 private boolean isHtml;
// Offsets of the '|' and '=' characters found outside nested [[..]] / {{..}} inside the
// current token; -1 when none has been seen.
112 private int firstUnescapedPipePos;
114 private int lastUnescapedPipePos;
115 private int lastUnescapedEqualsPos;
// Arguments of the current function token, filled in by addFunctionArg().
116 private final List<String> positionArgs = new ArrayList<String>();
117 private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
/** Creates a tokenizer over {@code wikiText}, treating the text as starting on a fresh line. */
120 public WikiTokenizer(final String wikiText) {
121 this(wikiText, true);
/**
 * Creates a tokenizer over {@code wikiText}.
 *
 * @param wikiText the wiki markup to tokenize; U+2028 LINE SEPARATOR characters are
 *     replaced by '\n' so the rest of the tokenizer only has to deal with one newline form
 * @param isNewline whether the text should be treated as starting at the beginning of a
 *     line (enables the heading / list-item checks for the first token)
 */
public WikiTokenizer(final String wikiText, final boolean isNewline) {
  // Literal char replacement: no need to compile a regex for a single character.
  this.wikiText = wikiText.replace('\u2028', '\n');
  // Match against the normalised text, i.e. the same string every offset is later applied
  // to, rather than against the raw constructor argument.
  this.matcher = wikiTokenEvent.matcher(this.wikiText);
  justReturnedNewline = isNewline;
}
// Resets the per-token state before the next token is parsed, including the heading
// text, the recorded unescaped pipe / equals positions (-1 means "none seen") and the
// positional argument list.
130 private void clear() {
134 headingWikiText = null;
144 firstUnescapedPipePos = -1;
145 lastUnescapedPipePos = -1;
146 lastUnescapedEqualsPos = -1;
147 positionArgs.clear();
151 private static final Pattern POSSIBLE_WIKI_TEXT = Pattern.compile(
159 public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
161 if (!POSSIBLE_WIKI_TEXT.matcher(wikiText).find()) {
162 callback.onPlainText(wikiText);
164 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
165 while (tokenizer.nextToken() != null) {
166 if (tokenizer.isPlainText()) {
167 callback.onPlainText(tokenizer.token());
168 } else if (tokenizer.isMarkup()) {
169 callback.onMarkup(tokenizer);
170 } else if (tokenizer.isWikiLink) {
171 callback.onWikiLink(tokenizer);
172 } else if (tokenizer.isNewline()) {
173 callback.onNewline(tokenizer);
174 } else if (tokenizer.isFunction()) {
175 callback.onFunction(tokenizer, tokenizer.functionName(), tokenizer.functionPositionArgs(), tokenizer.functionNamedArgs());
176 } else if (tokenizer.isHeading()) {
177 callback.onHeading(tokenizer);
178 } else if (tokenizer.isListItem()) {
179 callback.onListItem(tokenizer);
180 } else if (tokenizer.isComment()) {
181 callback.onComment(tokenizer);
182 } else if (tokenizer.isHtml()) {
183 callback.onHtml(tokenizer);
184 } else if (!tokenizer.errors.isEmpty()) {
185 // Log was already printed....
187 throw new IllegalStateException("Unknown wiki state: " + tokenizer.token());
193 public List<String> errors() {
/** Returns the {@code justReturnedNewline} flag, which nextToken() raises in its newline handling. */
197 public boolean isNewline() {
198 return justReturnedNewline;
/**
 * Rewinds the tokenizer to the start of the current line: the token window collapses to
 * {@code lastLineStart} and the tokenizer is marked as being at a line start again.
 */
201 public void returnToLineStart() {
202 end = start = lastLineStart;
203 justReturnedNewline = true;
/** Returns true when the current token is a heading, i.e. a heading title has been captured. */
206 public boolean isHeading() {
207 return headingWikiText != null;
/** Returns the heading title captured by nextToken() (the text between the '=' runs); null if the token is not a heading. */
210 public String headingWikiText() {
212 return headingWikiText;
215 public int headingDepth() {
220 public boolean isMarkup() {
224 public boolean isComment() {
/** Returns true when the current token is a list item ({@code listPrefixEnd} is -1 otherwise). */
228 public boolean isListItem() {
229 return listPrefixEnd != -1;
/** Returns the list prefix of the current list item: the text from the token start up to {@code listPrefixEnd}. */
232 public String listItemPrefix() {
234 return wikiText.substring(start, listPrefixEnd);
237 public static String getListTag(char c) {
/** Returns the body of the current list item: the text after the list prefix up to the token end. */
244 public String listItemWikiText() {
246 return wikiText.substring(listPrefixEnd, end);
249 public boolean isFunction() {
/**
 * Returns the name of the current function token: the text after the two-character
 * opener up to the first unescaped pipe, or up to the two-character closer when no pipe
 * was recorded. The name is trimmed and embedded newlines become spaces.
 */
253 public String functionName() {
256 if (firstUnescapedPipePos != -1) {
257 return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim());
259 return trimNewlines(wikiText.substring(start + 2, end - 2).trim());
262 public List<String> functionPositionArgs() {
266 public Map<String, String> functionNamedArgs() {
270 public boolean isPlainText() {
274 public boolean isWikiLink() {
/**
 * Returns the display text of the current wiki link: the part after the last unescaped
 * pipe when there is one, otherwise everything between the two-character opener and
 * closer. Leading/trailing newlines are dropped and inner ones become spaces.
 */
278 public String wikiLinkText() {
281 if (lastUnescapedPipePos != -1) {
282 return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2));
284 assert start + 2 < wikiText.length() && end >= 2: wikiText;
285 return trimNewlines(wikiText.substring(start + 2, end - 2));
/**
 * Returns the destination of the current wiki link when it is given explicitly: the text
 * between the two-character opener and the first unescaped pipe.
 * NOTE(review): the result for links without a pipe is decided by code not shown here - confirm.
 */
288 public String wikiLinkDest() {
291 if (firstUnescapedPipePos != -1) {
292 return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos));
297 public boolean isHtml() {
/** Returns true when the text beginning at the current token start begins with {@code prefix}. */
301 public boolean remainderStartsWith(final String prefix) {
302 return wikiText.startsWith(prefix, start);
/**
 * Consumes tokens until a newline token has been returned or the input is exhausted.
 * NOTE(review): oldStart is presumably used to widen the token window back to where the
 * line started - confirm against the rest of the method.
 */
305 public void nextLine() {
306 final int oldStart = start;
307 while(nextToken() != null && !isNewline()) {}
/**
 * Advances to the next token and classifies it. The checks run in this order: newline;
 * then, only at the start of a line, heading ('=') and list item (any of listChars);
 * then bold/italic quote markup, [[wiki link]], {{function}}, pre, math and comment
 * tags, a stray closer, a lone '|' or '=', and finally plain text up to the next
 * delimiter. Callers loop until this method returns null.
 */
315 public WikiTokenizer nextToken() {
// Remember where this line begins so that returnToLineStart() can rewind to it.
320 if (justReturnedNewline) {
321 lastLineStart = start;
326 final int len = wikiText.length();
331 // Eat a newline if we're looking at one:
332 final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
334 justReturnedNewline = true;
// Headings and list items are only recognised directly after a newline.
339 if (justReturnedNewline) {
340 justReturnedNewline = false;
342 final char firstChar = wikiText.charAt(end);
343 if (firstChar == '=') {
344 final int headerStart = end;
// Count the opening '=' run; its length is the heading depth.
346 while (++end < len && wikiText.charAt(end) == '=') {}
347 final int headerTitleStart = end;
348 headingDepth = headerTitleStart - headerStart;
351 final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
352 final int closingEquals = escapedFindEnd(end, "=");
353 if (wikiText.charAt(closingEquals - 1) == '=') {
354 end = closingEquals - 1;
359 final int headerTitleEnd = end;
360 headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
// Count the closing '=' run and check that it is as long as the opening one.
362 while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
363 final int headerEnd = end;
364 if (headerEnd - headerTitleEnd != headingDepth) {
365 errors.add("Mismatched header depth: " + token());
369 if (listChars.indexOf(firstChar) != -1) {
// Skip the whole run of list prefix characters; the item then extends to the line end.
370 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
372 end = escapedFindEnd(start, "\n");
377 if (wikiText.startsWith("'''", start)) {
383 if (wikiText.startsWith("''", start)) {
389 if (wikiText.startsWith("[[", start)) {
390 end = escapedFindEnd(start + 2, "]]");
// Only flagged as a wiki link when no errors have been recorded.
391 isWikiLink = errors.isEmpty();
395 if (wikiText.startsWith("{{", start)) {
396 end = escapedFindEnd(start + 2, "}}");
// Only flagged as a function when no errors have been recorded.
397 isFunction = errors.isEmpty();
// For the tag-delimited tokens below the line end is the fallback when the closing tag is missing.
401 if (wikiText.startsWith("<pre>", start)) {
402 end = safeIndexOf(wikiText, start, "</pre>", "\n");
407 if (wikiText.startsWith("<math>", start)) {
408 end = safeIndexOf(wikiText, start, "</math>", "\n");
413 if (wikiText.startsWith("<!--", start)) {
415 end = safeIndexOf(wikiText, start, "-->", "\n");
419 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
420 errors.add("Close without open!");
425 if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
// Plain text: runs up to the next delimiter found by the matcher.
432 if (this.matcher.find(start)) {
433 end = this.matcher.start(1);
436 errors.add("Empty group: " + this.matcher.group());
442 end = wikiText.length();
446 if (!errors.isEmpty()) {
447 System.err.println("Errors: " + errors + ", token=" + token());
/**
 * Returns the raw text of the current token, i.e. wikiText between start (inclusive) and
 * end (exclusive). A token is either exactly "\n" or does not end with a newline.
 */
453 public String token() {
454 final String token = wikiText.substring(start, end);
455 assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
459 private int escapedFindEnd(final int start, final String toFind) {
460 assert tokenStack.isEmpty();
462 final boolean insideFunction = toFind.equals("}}");
465 int firstNewline = -1;
466 while (end < wikiText.length()) {
467 if (matcher.find(end)) {
468 final String matchText = matcher.group();
469 final int matchStart = matcher.start();
471 assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
472 if (matchText.length() == 0) {
473 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
474 if (firstNewline == -1) {
475 firstNewline = matcher.end();
477 if (tokenStack.isEmpty() && toFind.equals("\n")) {
481 } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
482 // The normal return....
483 if (insideFunction) {
484 addFunctionArg(insideFunction, matchStart);
486 return matcher.end();
487 } else if (matchText.equals("[[") || matchText.equals("{{")) {
488 tokenStack.add(matchText);
489 } else if (matchText.equals("]]") || matchText.equals("}}")) {
490 if (tokenStack.size() > 0) {
491 final String removed = tokenStack.remove(tokenStack.size() - 1);
492 if (removed.equals("{{") && !matcher.group().equals("}}")) {
493 errors.add("Unmatched {{ error: " + wikiText.substring(start));
494 return safeIndexOf(wikiText, start, "\n", "\n");
495 } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
496 errors.add("Unmatched [[ error: " + wikiText.substring(start));
497 return safeIndexOf(wikiText, start, "\n", "\n");
500 errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n"));
501 // If we were looking for a newline
502 return safeIndexOf(wikiText, start, "\n", "\n");
504 } else if (matchText.equals("|")) {
505 if (tokenStack.isEmpty()) {
506 addFunctionArg(insideFunction, matchStart);
508 } else if (matchText.equals("=")) {
509 if (tokenStack.isEmpty()) {
510 lastUnescapedEqualsPos = matchStart;
512 // Do nothing. These can match spuriously, and if it's not the thing
513 // we're looking for, keep on going.
514 } else if (matchText.equals("<!--")) {
515 end = wikiText.indexOf("-->");
517 errors.add("Unmatched <!-- error: " + wikiText.substring(start));
518 return safeIndexOf(wikiText, start, "\n", "\n");
520 } else if (matchText.equals("''")) {
523 assert false : "Match text='" + matchText + "'";
524 throw new IllegalStateException();
527 // Hmmm, we didn't find the closing symbol we were looking for...
528 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
529 return safeIndexOf(wikiText, start, "\n", "\n");
532 // Inside the while loop. Just go forward.
533 end = Math.max(end, matcher.end());
535 if (toFind.equals("\n") && tokenStack.isEmpty()) {
536 // We were looking for the end, we got it.
539 if (firstNewline != -1) {
540 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
546 private void addFunctionArg(final boolean insideFunction, final int matchStart) {
547 if (firstUnescapedPipePos == -1) {
548 firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
549 } else if (insideFunction) {
550 if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
551 final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
552 final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
553 namedArgs.put(trimNewlines(key), trimNewlines(value));
555 final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
556 positionArgs.add(trimNewlines(value));
559 lastUnescapedPipePos = matchStart;
562 static final String trimNewlines(String s) {
563 while (s.startsWith("\n")) {
566 while (s.endsWith("\n")) {
567 s = s.substring(0, s.length() - 1);
569 return s.replaceAll("\n", " ");
572 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
573 int close = s.indexOf(target, start);
575 // Don't step over a \n.
576 return close + (target.equals("\n") ? 0 : target.length());
578 close = s.indexOf(backup, start);
580 return close + (backup.equals("\n") ? 0 : backup.length());
585 public static String toPlainText(final String wikiText) {
586 final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
587 final StringBuilder builder = new StringBuilder();
588 while (wikiTokenizer.nextToken() != null) {
589 if (wikiTokenizer.isPlainText()) {
590 builder.append(wikiTokenizer.token());
591 } else if (wikiTokenizer.isWikiLink()) {
592 builder.append(wikiTokenizer.wikiLinkText());
593 } else if (wikiTokenizer.isNewline()) {
594 builder.append("\n");
595 } else if (wikiTokenizer.isFunction()) {
596 builder.append(wikiTokenizer.token());
599 return builder.toString();
602 public static StringBuilder appendFunction(final StringBuilder builder, final String name, List<String> args,
603 final Map<String, String> namedArgs) {
604 builder.append(name);
605 for (final String arg : args) {
606 builder.append("|").append(arg);
608 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
609 builder.append("|").append(entry.getKey()).append("=").append(entry.getValue());