1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
21 public final class WikiTokenizer {
23 public interface Callback {
24 void onPlainText(final String text);
25 void onMarkup(WikiTokenizer wikiTokenizer);
26 void onWikiLink(WikiTokenizer wikiTokenizer);
27 void onNewline(WikiTokenizer wikiTokenizer);
28 void onFunction(final WikiTokenizer tokenizer, String functionName, List<String> functionPositionArgs,
29 Map<String, String> functionNamedArgs);
30 void onHeading(WikiTokenizer wikiTokenizer);
31 void onListItem(WikiTokenizer wikiTokenizer);
32 void onComment(WikiTokenizer wikiTokenizer);
33 void onHtml(WikiTokenizer wikiTokenizer);
36 public static class DoNothingCallback implements Callback {
39 public void onPlainText(String text) {
43 public void onMarkup(WikiTokenizer wikiTokenizer) {
47 public void onWikiLink(WikiTokenizer wikiTokenizer) {
51 public void onNewline(WikiTokenizer wikiTokenizer) {
55 public void onFunction(WikiTokenizer tokenizer, String functionName,
56 List<String> functionPositionArgs, Map<String, String> functionNamedArgs) {
60 public void onHeading(WikiTokenizer wikiTokenizer) {
64 public void onListItem(WikiTokenizer wikiTokenizer) {
68 public void onComment(WikiTokenizer wikiTokenizer) {
72 public void onHtml(WikiTokenizer wikiTokenizer) {
76 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
77 private static final Pattern wikiTokenEvent = Pattern.compile(
80 "\\||" + // Need the | because we might have to find unescaped pipes
81 "=|" + // Need the = because we might have to find unescaped =
87 "\n", Pattern.MULTILINE);
88 private static final String listChars = "*#:;";
91 final String wikiText;
92 final Matcher matcher;
94 boolean justReturnedNewline = true;
95 int lastLineStart = 0;
99 final List<String> errors = new ArrayList<>();
100 final List<TokenDelim> tokenStack = new ArrayList<>();
103 private String headingWikiText;
104 private int headingDepth;
105 private int listPrefixEnd;
106 private boolean isPlainText;
107 private boolean isMarkup;
108 private boolean isComment;
109 private boolean isFunction;
110 private boolean isWikiLink;
111 private boolean isHtml;
112 private int firstUnescapedPipePos;
114 private int lastUnescapedPipePos;
115 private int lastUnescapedEqualsPos;
116 private final List<String> positionArgs = new ArrayList<>();
117 private final Map<String,String> namedArgs = new LinkedHashMap<>();
120 public WikiTokenizer(final String wikiText) {
121 this(wikiText, true);
124 public WikiTokenizer(String wikiText, final boolean isNewline) {
125 wikiText = wikiText.replace('\u2028', '\n');
126 wikiText = wikiText.replace('\u2029', '\n');
127 wikiText = wikiText.replace('\u0085', '\n');
128 this.wikiText = wikiText;
129 this.matcher = wikiTokenEvent.matcher(wikiText);
130 justReturnedNewline = isNewline;
133 private void clear() {
137 headingWikiText = null;
147 firstUnescapedPipePos = -1;
148 lastUnescapedPipePos = -1;
149 lastUnescapedEqualsPos = -1;
150 positionArgs.clear();
154 private static final Matcher POSSIBLE_WIKI_TEXT = Pattern.compile(
165 public static void dispatch(final String wikiText, final boolean isNewline, final Callback callback) {
166 // Statistical background, from EN-DE dictionary generation:
167 // out of 12083000 calls, 9697686 can be skipped via the test
168 // for ', \n and ((c - 0x3b) & 0xff9f) < 2 (which covers among others
170 // This increased to 10006466 checking for <, { and [ specifically,
171 // and is minimally faster overall.
172 // A even more precise one using regex and checking for {{, [[, <!--, '',
173 // <pre>, <math>, <ref> and \n increased that to 10032846.
174 // Regex thus seems far too costly for a measly increase from 80%/82% to 83% rejection rate
175 // However completely removing it changes output (likely a bug), so leave it in for now
176 // but at least run it only on the 18% not caught by the faster logic.
177 // Original runtime: 1m29.708s
178 // Optimized: 1m19.170s
179 // Regex removed: 1m20.314s (not statistically significant)
180 boolean matched = false;
181 for (int i = 0; i < wikiText.length(); i++) {
182 int c = wikiText.charAt(i);
183 if (c == '\'' || c == '\n' || c == '<' || c == '[' || c == '{') {
188 if (!matched || !POSSIBLE_WIKI_TEXT.reset(wikiText).find()) {
189 callback.onPlainText(wikiText);
191 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText, isNewline);
192 while (tokenizer.nextToken() != null) {
193 if (tokenizer.isPlainText()) {
194 callback.onPlainText(tokenizer.token());
195 } else if (tokenizer.isMarkup()) {
196 callback.onMarkup(tokenizer);
197 } else if (tokenizer.isWikiLink()) {
198 callback.onWikiLink(tokenizer);
199 } else if (tokenizer.isNewline()) {
200 callback.onNewline(tokenizer);
201 } else if (tokenizer.isFunction()) {
202 callback.onFunction(tokenizer, tokenizer.functionName(), tokenizer.functionPositionArgs(), tokenizer.functionNamedArgs());
203 } else if (tokenizer.isHeading()) {
204 callback.onHeading(tokenizer);
205 } else if (tokenizer.isListItem()) {
206 callback.onListItem(tokenizer);
207 } else if (tokenizer.isComment()) {
208 callback.onComment(tokenizer);
209 } else if (tokenizer.isHtml()) {
210 callback.onHtml(tokenizer);
211 } else if (!tokenizer.errors.isEmpty()) {
212 // Log was already printed....
214 throw new IllegalStateException("Unknown wiki state: " + tokenizer.token());
220 public List<String> errors() {
224 public boolean isNewline() {
225 return justReturnedNewline;
228 public void returnToLineStart() {
229 end = start = lastLineStart;
230 justReturnedNewline = true;
233 public boolean isHeading() {
234 return headingWikiText != null;
237 public String headingWikiText() {
239 return headingWikiText;
242 public int headingDepth() {
247 public boolean isMarkup() {
251 public boolean isComment() {
255 public boolean isListItem() {
256 return listPrefixEnd != -1;
259 public String listItemPrefix() {
261 return wikiText.substring(start, listPrefixEnd);
264 public static String getListTag(char c) {
271 public String listItemWikiText() {
273 return wikiText.substring(listPrefixEnd, end);
276 public boolean isFunction() {
280 public String functionName() {
283 if (firstUnescapedPipePos != -1) {
284 return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos).trim());
286 final int safeEnd = Math.max(start + 2, end - 2);
287 return trimNewlines(wikiText.substring(start + 2, safeEnd).trim());
290 public List<String> functionPositionArgs() {
294 public Map<String, String> functionNamedArgs() {
298 public boolean isPlainText() {
302 public boolean isWikiLink() {
306 public String wikiLinkText() {
309 if (lastUnescapedPipePos != -1) {
310 return trimNewlines(wikiText.substring(lastUnescapedPipePos + 1, end - 2));
312 assert start + 2 < wikiText.length() && end >= 2: wikiText;
313 return trimNewlines(wikiText.substring(start + 2, end - 2));
316 public String wikiLinkDest() {
319 if (firstUnescapedPipePos != -1) {
320 return trimNewlines(wikiText.substring(start + 2, firstUnescapedPipePos));
325 public boolean isHtml() {
329 public boolean remainderStartsWith(final String prefix) {
330 return wikiText.startsWith(prefix, start);
333 public void nextLine() {
334 final int oldStart = start;
335 while(nextToken() != null && !isNewline()) {}
343 public WikiTokenizer nextToken() {
348 if (justReturnedNewline) {
349 lastLineStart = start;
354 final int len = wikiText.length();
359 // Eat a newline if we're looking at one:
360 final boolean atNewline = wikiText.charAt(end) == '\n';
362 justReturnedNewline = true;
367 if (justReturnedNewline) {
368 justReturnedNewline = false;
370 final char firstChar = wikiText.charAt(end);
371 if (firstChar == '=') {
372 final int headerStart = end;
374 while (++end < len && wikiText.charAt(end) == '=') {}
375 final int headerTitleStart = end;
376 headingDepth = headerTitleStart - headerStart;
379 final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
380 final int closingEquals = escapedFindEnd(end, TokenDelim.EQUALS);
381 if (wikiText.charAt(closingEquals - 1) == '=') {
382 end = closingEquals - 1;
387 final int headerTitleEnd = end;
388 headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
390 while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
391 final int headerEnd = end;
392 if (headerEnd - headerTitleEnd != headingDepth) {
393 errors.add("Mismatched header depth: " + token());
397 if (listChars.indexOf(firstChar) != -1) {
398 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
400 end = escapedFindEnd(start, TokenDelim.NEWLINE);
405 if (wikiText.startsWith("'''", start)) {
411 if (wikiText.startsWith("''", start)) {
417 if (wikiText.startsWith("[[", start)) {
418 end = escapedFindEnd(start + 2, TokenDelim.DBRACKET_CLOSE);
419 isWikiLink = errors.isEmpty();
423 if (wikiText.startsWith("{{", start)) {
424 end = escapedFindEnd(start + 2, TokenDelim.BRACE_CLOSE);
425 isFunction = errors.isEmpty();
429 if (wikiText.startsWith("<pre>", start)) {
430 end = safeIndexOf(wikiText, start, "</pre>", "\n");
435 if (wikiText.startsWith("<ref>", start)) {
436 end = safeIndexOf(wikiText, start, "</ref>", "\n");
441 if (wikiText.startsWith("<math>", start)) {
442 end = safeIndexOf(wikiText, start, "</math>", "\n");
447 if (wikiText.startsWith("<!--", start)) {
449 end = safeIndexOf(wikiText, start, "-->", "\n");
453 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
454 errors.add("Close without open!");
459 if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
466 while (end < wikiText.length()) {
467 int c = wikiText.charAt(end);
468 if (c == '\n' || c == '\'' || ((c - 0x1b) & 0xff9f) < 3) {
469 matcher.region(end, wikiText.length());
470 if (matcher.lookingAt()) break;
474 if (end != wikiText.length()) {
477 // stumbled over a new type of newline?
478 // Or matcher is out of sync with checks above
479 errors.add("Empty group: " + this.matcher.group() + " char: " + (int)wikiText.charAt(end));
481 // Note: all newlines should be normalize to \n before calling this function
482 throw new RuntimeException("matcher not in sync with code, or new type of newline, errors :" + errors);
491 if (!errors.isEmpty()) {
492 System.err.println("Errors: " + errors + ", token=" + token());
498 public String token() {
499 final String token = wikiText.substring(start, end);
500 assert token.equals("\n") || !token.endsWith("\n") : "token='" + token + "'";
504 enum TokenDelim { NEWLINE, BRACE_OPEN, BRACE_CLOSE, DBRACKET_OPEN, DBRACKET_CLOSE, BRACKET_OPEN, BRACKET_CLOSE, PIPE, EQUALS, COMMENT }
506 private int tokenDelimLen(TokenDelim d) {
522 throw new RuntimeException();
526 static final String[] patterns = { "\n", "{{", "}}", "[[", "]]", "[", "]", "|", "=", "<!--" };
527 private int escapedFindEnd(final int start, final TokenDelim toFind) {
528 assert tokenStack.isEmpty();
530 final boolean insideFunction = toFind == TokenDelim.BRACE_CLOSE;
533 int firstNewline = -1;
534 int singleBrackets = 0;
535 while (end < wikiText.length()) {
536 // Manual replacement for matcher.find(end),
537 // because Java regexp is a ridiculously slow implementation.
538 // Initialize to always match the end.
539 TokenDelim match = TokenDelim.NEWLINE;
540 int matchStart = end;
541 for (; matchStart < wikiText.length(); matchStart++) {
543 int c = wikiText.charAt(i);
544 if (c == '\n') break;
545 if (c == '{' && wikiText.startsWith("{{", i)) { match = TokenDelim.BRACE_OPEN; break; }
546 if (c == '}' && wikiText.startsWith("}}", i)) { match = TokenDelim.BRACE_CLOSE; break; }
547 if (c == '[') { match = wikiText.startsWith("[[", i) ? TokenDelim.DBRACKET_OPEN : TokenDelim.BRACKET_OPEN ; break; }
548 if (c == ']') { match = wikiText.startsWith("]]", i) ? TokenDelim.DBRACKET_CLOSE : TokenDelim.BRACKET_CLOSE ; break; }
549 if (c == '|') { match = TokenDelim.PIPE; break; }
550 if (c == '=') { match = TokenDelim.EQUALS; break; }
551 if (c == '<' && wikiText.startsWith("<!--", i)) { match = TokenDelim.COMMENT; break; }
554 int matchEnd = matchStart + (match == TokenDelim.NEWLINE ? 0 : tokenDelimLen(match));
555 if (match != TokenDelim.NEWLINE && tokenStack.isEmpty() && match == toFind) {
556 // The normal return....
557 if (insideFunction) {
558 addFunctionArg(insideFunction, matchStart);
564 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n' : wikiText + ", " + matchStart;
565 if (firstNewline == -1) {
566 firstNewline = matchEnd;
568 if (tokenStack.isEmpty() && toFind == TokenDelim.NEWLINE) {
577 if (singleBrackets > 0) singleBrackets--;
581 tokenStack.add(match);
585 if (!tokenStack.isEmpty()) {
586 final TokenDelim removed = tokenStack.remove(tokenStack.size() - 1);
587 if (removed == TokenDelim.BRACE_OPEN && match != TokenDelim.BRACE_CLOSE) {
588 if (singleBrackets >= 2) { // assume this is really two closing single ]
590 tokenStack.add(removed);
592 errors.add("Unmatched {{ error: " + wikiText.substring(start, matchEnd));
593 return safeIndexOf(wikiText, start, "\n", "\n");
595 } else if (removed == TokenDelim.DBRACKET_OPEN && match != TokenDelim.DBRACKET_CLOSE) {
596 errors.add("Unmatched [[ error: " + wikiText.substring(start, matchEnd));
597 return safeIndexOf(wikiText, start, "\n", "\n");
600 errors.add("Pop too many " + wikiText.substring(matchStart, matchEnd) + " error: " + wikiText.substring(start, matchEnd).replace("\n", "\\\\n"));
601 // If we were looking for a newline
602 return safeIndexOf(wikiText, start, "\n", "\n");
606 if (tokenStack.isEmpty()) {
607 addFunctionArg(insideFunction, matchStart);
611 if (tokenStack.isEmpty()) {
612 lastUnescapedEqualsPos = matchStart;
614 // Do nothing. These can match spuriously, and if it's not the thing
615 // we're looking for, keep on going.
618 end = wikiText.indexOf("-->", matchStart);
620 errors.add("Unmatched <!-- error: " + wikiText.substring(start));
621 return safeIndexOf(wikiText, start, "\n", "\n");
625 throw new RuntimeException();
628 // Inside the while loop. Just go forward.
629 end = Math.max(end, matchEnd);
631 if (toFind == TokenDelim.NEWLINE && tokenStack.isEmpty()) {
632 // We were looking for the end, we got it.
635 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
636 if (firstNewline != -1) {
642 private void addFunctionArg(final boolean insideFunction, final int matchStart) {
643 if (firstUnescapedPipePos == -1) {
644 firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
645 } else if (insideFunction) {
646 if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
647 final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
648 final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
649 namedArgs.put(trimNewlines(key), trimNewlines(value));
651 final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
652 positionArgs.add(trimNewlines(value));
655 lastUnescapedPipePos = matchStart;
658 static String trimNewlines(String s) {
659 while (s.startsWith("\n")) {
662 while (s.endsWith("\n")) {
663 s = s.substring(0, s.length() - 1);
665 return s.replace('\n', ' ');
668 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
669 int close = s.indexOf(target, start);
671 // Don't step over a \n.
672 return close + (target.equals("\n") ? 0 : target.length());
674 close = s.indexOf(backup, start);
676 return close + (backup.equals("\n") ? 0 : backup.length());
681 public static String toPlainText(final String wikiText) {
682 final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
683 final StringBuilder builder = new StringBuilder();
684 while (wikiTokenizer.nextToken() != null) {
685 if (wikiTokenizer.isPlainText()) {
686 builder.append(wikiTokenizer.token());
687 } else if (wikiTokenizer.isWikiLink()) {
688 builder.append(wikiTokenizer.wikiLinkText());
689 } else if (wikiTokenizer.isNewline()) {
690 builder.append("\n");
691 } else if (wikiTokenizer.isFunction()) {
692 builder.append(wikiTokenizer.token());
695 return builder.toString();
698 public static StringBuilder appendFunction(final StringBuilder builder, final String name, List<String> args,
699 final Map<String, String> namedArgs) {
700 builder.append(name);
701 for (final String arg : args) {
702 builder.append("|").append(arg);
704 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
705 builder.append("|").append(entry.getKey()).append("=").append(entry.getValue());