1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.util.ArrayList;
18 import java.util.LinkedHashMap;
19 import java.util.List;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
24 public final class WikiTokenizer {
/**
 * Receiver for token events. {@code dispatch(Callback)} calls exactly one of
 * these methods for each token produced by {@code nextToken()}.
 * For every method except {@code onFunction}, the tokenizer passes itself so
 * the receiver can query the current token through its accessors.
 */
26 public static interface Callback {
// Called for a token classified as plain text.
27 void onPlainText(WikiTokenizer wikiTokenizer);
// Called for a markup token (the tokenizer checks for ''' and '' prefixes).
28 void onMarkup(WikiTokenizer wikiTokenizer);
// Called for a [[...]] token that was scanned without errors.
29 void onWikiLink(WikiTokenizer wikiTokenizer);
// Called when the tokenizer has just consumed a line break.
30 void onNewline(WikiTokenizer wikiTokenizer);
// Called for a {{...}} token; receives functionName(), functionPositionArgs()
// and functionNamedArgs() of the tokenizer rather than the tokenizer itself.
31 void onFunction(String functionName, List<String> functionPositionArgs,
32 Map<String, String> functionNamedArgs);
// Called for a heading token (a line starting with '=').
33 void onHeading(WikiTokenizer wikiTokenizer);
// Called for a list item token (a line starting with one of "*#:;").
34 void onListItem(WikiTokenizer wikiTokenizer);
// Called for a comment token.
35 void onComment(WikiTokenizer wikiTokenizer);
// Pattern matching the "events" the tokenizer stops at. It is compiled with
// MULTILINE so that "$" matches at the end of every line, not only at the end
// of the input.
// NOTE(review): only the "|", "=" and "$" alternatives are visible in this
// excerpt. escapedFindEnd() also compares matches against "[[", "{{", "]]",
// "}}", "<!--" and "''", so those alternatives are presumably on lines not
// shown here -- confirm against the full file.
38 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
39 private static final Pattern wikiTokenEvent = Pattern.compile("(" +
42 "\\||" + // Need the | because we might have to find unescaped pipes
43 "=|" + // Need the = because we might have to find unescaped =
46 "$)", Pattern.MULTILINE);
// Characters that form a list-item prefix; nextToken() tests the first
// character of a line against this set.
47 private static final String listChars = "*#:;";
// The complete wiki text being tokenized.
50 final String wikiText;
// Matcher for wikiTokenEvent over wikiText; shared by nextToken() and
// escapedFindEnd(), which both reposition it with find(int).
51 final Matcher matcher;
// True when the most recent token was a line break (and initially, so the
// first token is treated as being at the start of a line).
53 boolean justReturnedNewline = true;
// Start index of the current line; updated by nextToken(), used by
// returnToLineStart().
54 int lastLineStart = 0;
// Problems recorded while scanning (unmatched brackets, bad headings, ...).
58 final List<String> errors = new ArrayList<String>();
// Stack of currently open "[[" / "{{" markers inside escapedFindEnd().
59 final List<String> tokenStack = new ArrayList<String>();
// Text between the leading and trailing '=' runs of a heading; null when the
// current token is not a heading.
62 private String headingWikiText;
// Number of leading '=' characters of the current heading.
63 private int headingDepth;
// End index of the list prefix; -1 when the current token is not a list item.
64 private int listPrefixEnd;
// Classification flags for the current token.
65 private boolean isPlainText;
66 private boolean isMarkup;
67 private boolean isComment;
68 private boolean isFunction;
69 private boolean isWikiLink;
// Position of the first top-level '|' inside the current link/function, or -1.
70 private int firstUnescapedPipePos;
// Position of the most recent top-level '|' / '=' seen so far, or -1.
72 private int lastUnescapedPipePos;
73 private int lastUnescapedEqualsPos;
// Arguments of the current function token, filled in by addFunctionArg():
// "key=value" arguments go to namedArgs (insertion-ordered), the rest to
// positionArgs.
74 private final List<String> positionArgs = new ArrayList<String>();
75 private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
/**
 * Creates a tokenizer over {@code wikiText}.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * delegates to the two-argument constructor -- confirm.
 */
78 public WikiTokenizer(final String wikiText) {
/**
 * Creates a tokenizer over {@code wikiText}.
 *
 * @param wikiText the text to tokenize; also the input of the shared matcher
 * @param isNewline initial value of {@code justReturnedNewline}, i.e. whether
 *        the first token should be treated as starting a line
 */
82 public WikiTokenizer(final String wikiText, final boolean isNewline) {
83 this.wikiText = wikiText;
84 this.matcher = wikiTokenEvent.matcher(wikiText);
85 justReturnedNewline = isNewline;
/**
 * Resets per-token state: no heading text, no pipe/equals positions seen,
 * and no positional arguments.
 * NOTE(review): further resets may sit on lines not visible in this excerpt.
 */
88 private void clear() {
92 headingWikiText = null;
// -1 means "not seen in the current token".
101 firstUnescapedPipePos = -1;
102 lastUnescapedPipePos = -1;
103 lastUnescapedEqualsPos = -1;
104 positionArgs.clear();
/**
 * Tokenizes the remaining text and reports every token to {@code callback}.
 * Runs until {@code nextToken()} returns null. Token kinds are tested in the
 * order written below, and the first matching kind wins.
 *
 * @param callback receiver for the token events
 * @throws IllegalStateException if a token matches none of the known kinds
 */
108 public void dispatch(final Callback callback) {
109 while (nextToken() != null) {
// NOTE(review): the condition guarding this first branch is not visible in
// this excerpt; presumably it is isPlainText() -- confirm.
111 callback.onPlainText(this);
112 } else if (isMarkup()) {
113 callback.onMarkup(this);
114 } else if (isWikiLink) {
115 callback.onWikiLink(this);
116 } else if (isNewline()) {
117 callback.onNewline(this);
118 } else if (isFunction()) {
// Functions are reported by value (name and arguments), not via the tokenizer.
119 callback.onFunction(functionName(), functionPositionArgs(), functionNamedArgs());
120 } else if (isHeading()) {
121 callback.onHeading(this);
122 } else if (isListItem()) {
123 callback.onListItem(this);
124 } else if (isComment()) {
125 callback.onComment(this);
127 throw new IllegalStateException("Unknown wiki state.");
/** Returns whether the most recent token was a line break. */
132 public boolean isNewline() {
133 return justReturnedNewline;
/**
 * Rewinds the tokenizer to the start of the current line: both cursor
 * positions are set to {@code lastLineStart}, and the newline flag is set so
 * the next token is treated as starting a line.
 */
136 public void returnToLineStart() {
137 end = start = lastLineStart;
138 justReturnedNewline = true;
/** Returns whether the current token is a heading (heading text is set). */
141 public boolean isHeading() {
142 return headingWikiText != null;
/**
 * Returns the wiki text of the current heading, without the surrounding '='
 * runs; null when the current token is not a heading.
 */
145 public String headingWikiText() {
147 return headingWikiText;
/**
 * Heading-depth accessor.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the {@code headingDepth} field -- confirm.
 */
150 public int headingDepth() {
/**
 * Markup-token test, used by dispatch().
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the {@code isMarkup} field -- confirm.
 */
155 public boolean isMarkup() {
/**
 * Comment-token test, used by dispatch().
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the {@code isComment} field -- confirm.
 */
159 public boolean isComment() {
/** Returns whether the current token is a list item (a list prefix end is recorded). */
163 public boolean isListItem() {
164 return listPrefixEnd != -1;
/**
 * Returns the list prefix of the current list item: the text from the token
 * start up to {@code listPrefixEnd}. Only meaningful when isListItem() holds.
 */
167 public String listItemPrefix() {
169 return wikiText.substring(start, listPrefixEnd);
/**
 * Returns the wiki text of the current list item after its prefix: the text
 * from {@code listPrefixEnd} to the token end. Only meaningful when
 * isListItem() holds.
 */
172 public String listItemWikiText() {
174 return wikiText.substring(listPrefixEnd, end);
/**
 * Function-token test, used by dispatch() and toPlainText().
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the {@code isFunction} field -- confirm.
 */
177 public boolean isFunction() {
/**
 * Returns the name of the current function token. The name starts two
 * characters after the token start, skipping the opening braces. It ends at
 * the first top-level '|' if there is one, otherwise two characters before
 * the token end, skipping the closing braces.
 */
181 public String functionName() {
184 if (firstUnescapedPipePos != -1) {
185 return wikiText.substring(start + 2, firstUnescapedPipePos);
// No arguments: the name is everything between the braces.
187 return wikiText.substring(start + 2, end - 2);
/**
 * Positional-argument accessor for the current function token.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns {@code positionArgs} -- confirm, including whether a copy is made.
 */
190 public List<String> functionPositionArgs() {
/**
 * Named-argument accessor for the current function token.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns {@code namedArgs} -- confirm, including whether a copy is made.
 */
194 public Map<String, String> functionNamedArgs() {
/**
 * Plain-text-token test, used by toPlainText().
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the {@code isPlainText} field -- confirm.
 */
198 public boolean isPlainText() {
/**
 * Wiki-link-token test, used by toPlainText().
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * returns the {@code isWikiLink} field -- confirm.
 */
202 public boolean isWikiLink() {
/**
 * Returns the display text of the current wiki link. If a top-level '|' was
 * seen, this is the text after the last such pipe, up to two characters
 * before the token end. Otherwise it is the whole link body between the
 * opening and closing brackets.
 */
206 public String wikiLinkText() {
209 if (lastUnescapedPipePos != -1) {
210 return wikiText.substring(lastUnescapedPipePos + 1, end - 2);
// No pipe: the link body doubles as its display text.
212 assert start + 2 < wikiText.length() && end >= 2: wikiText;
213 return wikiText.substring(start + 2, end - 2);
/**
 * Returns the destination of the current wiki link. When a top-level '|' was
 * seen, this is the text between the opening brackets and the first pipe.
 * NOTE(review): the result for a link without a pipe is on lines not visible
 * in this excerpt -- confirm.
 */
216 public String wikiLinkDest() {
219 if (firstUnescapedPipePos != -1) {
220 return wikiText.substring(start + 2, firstUnescapedPipePos);
/**
 * Returns whether the text at the current token start begins with
 * {@code prefix}.
 *
 * @param prefix the text to look for at position {@code start}
 */
225 public boolean remainderStartsWith(final String prefix) {
226 return wikiText.startsWith(prefix, start);
/**
 * Consumes tokens until a newline token has been returned or the input is
 * exhausted.
 * NOTE(review): the statements after the loop, including the use of
 * {@code oldStart}, are not visible in this excerpt.
 */
229 public void nextLine() {
230 final int oldStart = start;
// Empty loop body: the work is done by nextToken() itself.
231 while(nextToken() != null && !isNewline()) {}
/**
 * Advances to the next token and classifies it. Callers in this file loop
 * until the result is null.
 * NOTE(review): many lines of this method (guards, return statements,
 * closing braces) are not visible in this excerpt, so the comments below
 * describe only the statements that are shown.
 */
239 public WikiTokenizer nextToken() {
// At the start of a line, remember where the line began so that
// returnToLineStart() can rewind to it.
244 if (justReturnedNewline) {
245 lastLineStart = start;
250 final int len = wikiText.length();
255 // Eat a newline if we're looking at one:
// Both '\n' and the Unicode line separator count as a line break.
256 final boolean atNewline = wikiText.charAt(end) == '\n' || wikiText.charAt(end) == '\u2028';
258 justReturnedNewline = true;
// First token after a line break: headings and list items are only
// recognized here.
263 if (justReturnedNewline) {
264 justReturnedNewline = false;
266 final char firstChar = wikiText.charAt(end);
267 if (firstChar == '=') {
268 final int headerStart = end;
// Count the run of leading '=' characters; its length is the heading depth.
270 while (++end < len && wikiText.charAt(end) == '=') {}
271 final int headerTitleStart = end;
272 headingDepth = headerTitleStart - headerStart;
275 final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
// Find the closing '=' while skipping '=' nested inside links/functions.
276 final int closingEquals = escapedFindEnd(end, "=");
277 if (wikiText.charAt(closingEquals - 1) == '=') {
278 end = closingEquals - 1;
283 final int headerTitleEnd = end;
284 headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
// Count the run of trailing '=' characters; it should match the leading run.
286 while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
287 final int headerEnd = end;
288 if (headerEnd - headerTitleEnd != headingDepth) {
289 errors.add("Mismatched header depth: " + token());
// List item: consume the run of list-prefix characters, then the rest of the
// line (tracking nesting, so links/functions spanning lines stay whole).
293 if (listChars.indexOf(firstChar) != -1) {
294 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
296 end = escapedFindEnd(start, "\n");
// Markup quotes; the longer ''' is tested before ''.
301 if (wikiText.startsWith("'''", start)) {
307 if (wikiText.startsWith("''", start)) {
// Wiki link: scan to the matching "]]"; it only counts as a link when the
// scan recorded no errors.
313 if (wikiText.startsWith("[[", start)) {
314 end = escapedFindEnd(start + 2, "]]");
315 isWikiLink = errors.isEmpty();
// Function/template: scan to the matching "}}"; same error rule as for links.
319 if (wikiText.startsWith("{{", start)) {
320 end = escapedFindEnd(start + 2, "}}");
321 isFunction = errors.isEmpty();
// <pre>, <math> and comments are consumed verbatim up to their closing tag,
// falling back to the next newline when the closing tag is missing.
325 if (wikiText.startsWith("<pre>", start)) {
326 end = safeIndexOf(wikiText, start, "</pre>", "\n");
330 if (wikiText.startsWith("<math>", start)) {
331 end = safeIndexOf(wikiText, start, "</math>", "\n");
335 if (wikiText.startsWith("<!--", start)) {
337 end = safeIndexOf(wikiText, start, "-->", "\n");
// A closer at token start has no matching opener.
341 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
342 errors.add("Close without open!");
// A stray '|' or '=' outside any link/function (handling not visible here).
347 if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
// Otherwise the token runs up to the next event matched by wikiTokenEvent...
354 if (this.matcher.find(start)) {
355 end = this.matcher.start(1);
358 errors.add("Empty group: " + this.matcher.group());
// ...or to the end of the text.
364 end = wikiText.length();
// Scan errors are reported on stderr rather than thrown.
368 if (!errors.isEmpty()) {
369 System.err.println("Errors: " + errors + ", token=" + token());
/**
 * Returns the raw text of the current token, i.e. {@code wikiText} from
 * {@code start} to {@code end}. With assertions enabled, it checks that a
 * token either is exactly a newline or does not end with one.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
375 public String token() {
376 final String token = wikiText.substring(start, end);
377 assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
/**
 * Scans forward from {@code start} for {@code toFind} at nesting depth zero,
 * i.e. not inside a nested "[[...]]" or "{{...}}". Nesting is tracked in
 * {@code tokenStack}. Along the way it records the positions of top-level
 * '|' and '=' and, when {@code toFind} is "}}", collects function arguments
 * through addFunctionArg().
 *
 * In the normal case it returns the index just past the match. On the error
 * paths shown, it records a message in {@code errors} and returns the result
 * of a newline search from {@code start} via safeIndexOf().
 *
 * NOTE(review): several lines (the declaration of {@code end}, some returns
 * and else-branches, closing braces) are not visible in this excerpt; the
 * comments below describe only the statements that are shown.
 *
 * @param start index at which scanning begins
 * @param toFind the closing text to look for ("]]", "}}", "=" or "\n" in the
 *        calls visible in this file)
 */
381 private int escapedFindEnd(final int start, final String toFind) {
382 assert tokenStack.isEmpty();
// Looking for "}}" means we are scanning a function, so arguments are collected.
384 final boolean insideFunction = toFind.equals("}}");
387 int firstNewline = -1;
388 while (end < wikiText.length()) {
389 if (matcher.find(end)) {
390 final String matchText = matcher.group();
391 final int matchStart = matcher.start();
393 assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
// An empty match is the "$" alternative: an end of line or the end of the text.
394 if (matchText.length() == 0) {
395 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
// Remember the first line end seen, for the fallback after the loop.
396 if (firstNewline == -1) {
397 firstNewline = matcher.end();
399 if (tokenStack.isEmpty() && toFind.equals("\n")) {
403 } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
404 // The normal return....
// The closing "}}" also terminates the last function argument.
405 if (insideFunction) {
406 addFunctionArg(insideFunction, matchStart);
408 return matcher.end();
// Openers push onto the nesting stack...
409 } else if (matchText.equals("[[") || matchText.equals("{{")) {
410 tokenStack.add(matchText);
// ...and closers must pop the matching opener.
411 } else if (matchText.equals("]]") || matchText.equals("}}")) {
412 if (tokenStack.size() > 0) {
413 final String removed = tokenStack.remove(tokenStack.size() - 1);
414 if (removed.equals("{{") && !matcher.group().equals("}}")) {
415 errors.add("Unmatched {{ error: " + wikiText.substring(start));
416 return safeIndexOf(wikiText, start, "\n", "\n");
417 } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
418 errors.add("Unmatched [[ error: " + wikiText.substring(start));
419 return safeIndexOf(wikiText, start, "\n", "\n");
// Newlines are escaped in this message so it stays on one line.
422 errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n"));
423 // If we were looking for a newline
424 return safeIndexOf(wikiText, start, "\n", "\n");
// Only top-level pipes separate arguments; nested ones belong to the inner
// link/function.
426 } else if (matchText.equals("|")) {
427 if (tokenStack.isEmpty()) {
428 addFunctionArg(insideFunction, matchStart);
// A top-level '=' may later turn the current argument into key=value.
430 } else if (matchText.equals("=")) {
431 if (tokenStack.isEmpty()) {
432 lastUnescapedEqualsPos = matchStart;
434 // Do nothing. These can match spuriously, and if it's not the thing
435 // we're looking for, keep on going.
436 } else if (matchText.equals("<!--")) {
// NOTE(review): this searches for "-->" from index 0 of wikiText, not from
// the position of the "<!--" just matched, so it can land on the close of an
// earlier comment and move `end` backwards. Likely should be
// indexOf("-->", matchStart) -- confirm and fix.
437 end = wikiText.indexOf("-->");
439 errors.add("Unmatched <!-- error: " + wikiText.substring(start));
440 return safeIndexOf(wikiText, start, "\n", "\n");
442 } else if (matchText.equals("''")) {
// Any other match text means the pattern and this method are out of sync.
445 assert false : "Match text='" + matchText + "'";
446 throw new IllegalStateException();
449 // Hmmm, we didn't find the closing symbol we were looking for...
450 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
451 return safeIndexOf(wikiText, start, "\n", "\n");
454 // Inside the while loop. Just go forward.
// Math.max ensures `end` is never moved backwards by this step.
455 end = Math.max(end, matcher.end());
457 if (toFind.equals("\n") && tokenStack.isEmpty()) {
458 // We were looking for the end, we got it.
461 if (firstNewline != -1) {
462 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
/**
 * Called at each top-level '|' (and at the closing "}}" of a function) with
 * the position of that delimiter. The first call only records the position
 * of the first pipe. Later calls inside a function store the text since the
 * previous pipe as an argument. If a top-level '=' occurred after the
 * previous pipe, the argument is stored as key=value in {@code namedArgs};
 * otherwise it is appended to {@code positionArgs}.
 *
 * @param insideFunction whether the enclosing scan is for a "{{...}}" token
 * @param matchStart index of the delimiter that ends the current argument
 */
468 private void addFunctionArg(final boolean insideFunction, final int matchStart) {
// First pipe: nothing precedes it but the function name / link destination.
469 if (firstUnescapedPipePos == -1) {
470 firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
471 } else if (insideFunction) {
// An '=' after the previous pipe marks this argument as key=value.
472 if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
473 final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
474 final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
475 namedArgs.put(key, value);
477 final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
478 positionArgs.add(value);
// The next argument starts after this delimiter.
481 lastUnescapedPipePos = matchStart;
/**
 * Finds the end position of {@code target} in {@code s} at or after
 * {@code start}, using {@code backup} as a second search string. The result
 * is the index just past the found string, except that a found "\n" yields
 * the index of the newline itself, so callers never step over a line break.
 * NOTE(review): the not-found guards around the two returns and the final
 * fallback value are on lines not visible in this excerpt -- confirm.
 *
 * @param s the text to search
 * @param start index at which both searches begin
 * @param target the preferred string to find
 * @param backup the string searched for as an alternative to {@code target}
 */
484 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
485 int close = s.indexOf(target, start);
487 // Don't step over a \n.
488 return close + (target.equals("\n") ? 0 : target.length());
490 close = s.indexOf(backup, start);
492 return close + (backup.equals("\n") ? 0 : backup.length());
/**
 * Converts wiki text to plain text by tokenizing it and concatenating the
 * pieces. Plain text is copied as-is, wiki links contribute their display
 * text, newline tokens contribute "\n", and functions are copied verbatim
 * (raw token text, braces included). Other token kinds have no branch in the
 * visible code.
 *
 * @param wikiText the wiki markup to convert
 * @return the accumulated plain text
 */
497 public static String toPlainText(final String wikiText) {
498 final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
499 final StringBuilder builder = new StringBuilder();
500 while (wikiTokenizer.nextToken() != null) {
501 if (wikiTokenizer.isPlainText()) {
502 builder.append(wikiTokenizer.token());
503 } else if (wikiTokenizer.isWikiLink()) {
504 builder.append(wikiTokenizer.wikiLinkText());
505 } else if (wikiTokenizer.isNewline()) {
506 builder.append("\n");
507 } else if (wikiTokenizer.isFunction()) {
// Functions are not expanded; their source text is kept.
508 builder.append(wikiTokenizer.token());
511 return builder.toString();
/**
 * Appends a function call in wiki syntax, without the surrounding braces, to
 * {@code builder}: the name, then "|arg" for each positional argument, then
 * "|key=value" for each named argument in the map's iteration order.
 * NOTE(review): the end of the method, including its return statement, is
 * not visible in this excerpt; presumably it returns {@code builder}.
 *
 * @param builder destination for the text
 * @param name function name
 * @param args positional arguments, in order
 * @param namedArgs named arguments
 */
514 public static StringBuilder appendFunction(final StringBuilder builder, final String name, List<String> args,
515 final Map<String, String> namedArgs) {
516 builder.append(name);
517 for (final String arg : args) {
518 builder.append("|").append(arg);
520 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
521 builder.append("|").append(entry.getKey()).append("=").append(entry.getValue());