1 package com.hughes.android.dictionary.parser;
3 import java.util.ArrayList;
4 import java.util.LinkedHashMap;
7 import java.util.regex.Matcher;
8 import java.util.regex.Pattern;
10 public final class WikiTokenizer {
12 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
// Pattern whose matches mark the positions the tokenizer reacts to. The
// alternatives visible here are a pipe, an equals sign and, because of
// MULTILINE, the (empty) end of each line.
13 private static final Pattern wikiTokenEvent = Pattern.compile("(" +
16 "\\||" + // Need the | because we might have to find unescaped pipes
17 "=|" + // Need the = because we might have to find unescaped =
20 "$)", Pattern.MULTILINE);
// Characters that nextToken() treats as list-item prefix characters.
21 private static final String listChars = "*#:;";
// The complete text being tokenized; all positions below index into it.
24 final String wikiText;
// Matcher for wikiTokenEvent over wikiText, created once in the constructor.
25 final Matcher matcher;
// Value reported by isNewline(); starts out true and is set again whenever
// a newline is consumed or returnToLineStart() is called.
27 boolean justReturnedNewline = true;
// Token start recorded by nextToken() when a new line begins; used by
// returnToLineStart() to rewind.
28 int lastLineStart = 0;
// Error messages collected while tokenizing.
32 final List<String> errors = new ArrayList<String>();
// Stack of currently open "[[" / "{{" markers used by escapedFindEnd().
33 final List<String> tokenStack = new ArrayList<String>();
// State describing the current token.
// Non-null exactly when the current token is a heading (see isHeading()).
36 private String headingWikiText;
// Length of the run of '=' that opened the heading.
37 private int headingDepth;
// -1 when the current token is not a list item (see isListItem()); otherwise
// the index where the list prefix ends.
38 private int listPrefixEnd;
39 private boolean isPlainText;
40 private boolean isMarkup;
41 private boolean isComment;
42 private boolean isFunction;
43 private boolean isWikiLink;
// Positions of pipes / equals signs seen outside any nested [[ ]] or {{ }}
// while scanning the current token; -1 means "none seen".
44 private int firstUnescapedPipePos;
46 private int lastUnescapedPipePos;
47 private int lastUnescapedEqualsPos;
// Arguments collected by addFunctionArg() for a {{...}} token. The named
// arguments keep insertion order (LinkedHashMap).
48 private final List<String> positionArgs = new ArrayList<String>();
49 private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
/**
 * Creates a tokenizer over the given wiki text and prepares the shared
 * matcher for it.
 *
 * @param wikiText the text to tokenize
 */
52 public WikiTokenizer(final String wikiText) {
53 this.wikiText = wikiText;
54 this.matcher = wikiTokenEvent.matcher(wikiText);
/**
 * Resets per-token state: the heading text becomes null and the recorded
 * pipe / equals positions go back to -1 ("none seen").
 */
57 private void clear() {
61 headingWikiText = null;
70 firstUnescapedPipePos = -1;
71 lastUnescapedPipePos = -1;
72 lastUnescapedEqualsPos = -1;
/** Returns the justReturnedNewline flag, which is set when a newline has just been consumed. */
77 public boolean isNewline() {
78 return justReturnedNewline;
/**
 * Rewinds the tokenizer to the start of the current line: both ends of the
 * token are moved to lastLineStart and the newline flag is set again.
 */
81 public void returnToLineStart() {
82 end = start = lastLineStart;
83 justReturnedNewline = true;
/** Returns true when heading text has been recorded for the current token. */
86 public boolean isHeading() {
87 return headingWikiText != null;
/** Returns the heading text recorded by nextToken(), or null if the current token is not a heading. */
90 public String headingWikiText() {
92 return headingWikiText;
95 public int headingDepth() {
100 public boolean isMarkup() {
104 public boolean isComment() {
/** Returns true when a list prefix end has been recorded (listPrefixEnd is not -1). */
108 public boolean isListItem() {
109 return listPrefixEnd != -1;
/** Returns the text from the token start up to (not including) listPrefixEnd. */
112 public String listItemPrefix() {
114 return wikiText.substring(start, listPrefixEnd);
/** Returns the text from listPrefixEnd to the end of the token, i.e. the item without its prefix. */
117 public String listItemWikiText() {
119 return wikiText.substring(listPrefixEnd, end);
122 public boolean isFunction() {
/**
 * Returns the name part of the current token: the text after the two-character
 * opener up to the first unescaped pipe, or, when there is no pipe, everything
 * between the two-character opener and the two-character closer.
 */
126 public String functionName() {
129 if (firstUnescapedPipePos != -1) {
130 return wikiText.substring(start + 2, firstUnescapedPipePos);
132 return wikiText.substring(start + 2, end - 2);
135 public List<String> functionPositionArgs() {
139 public Map<String, String> functionNamedArgs() {
143 public boolean isPlainText() {
147 public boolean isWikiLink() {
/**
 * Returns the display text of the current link token: the text after the
 * last unescaped pipe, or, when there is no pipe, everything between the
 * two-character opener and the two-character closer.
 */
151 public String wikiLinkText() {
154 if (lastUnescapedPipePos != -1) {
155 return wikiText.substring(lastUnescapedPipePos + 1, end - 2);
157 return wikiText.substring(start + 2, end - 2);
/**
 * Returns the destination part of the current link token. When an unescaped
 * pipe was seen, this is the text between the two-character opener and the
 * first such pipe.
 */
160 public String wikiLinkDest() {
163 if (firstUnescapedPipePos != -1) {
164 return wikiText.substring(start + 2, firstUnescapedPipePos);
/**
 * Tests whether the text beginning at the current token's start position
 * starts with the given prefix.
 *
 * @param prefix the prefix to look for
 * @return true if wikiText has the prefix at index start
 */
169 public boolean remainderStartsWith(final String prefix) {
170 return wikiText.startsWith(prefix, start);
/**
 * Consumes tokens until nextToken() returns null or a newline has just been
 * consumed. The token start on entry is remembered in oldStart.
 */
173 public void nextLine() {
174 final int oldStart = start;
// Empty loop body: the work is done by the calls in the condition.
175 while(nextToken() != null && !isNewline()) {}
/**
 * Advances to the next token, updating start/end and the token-type state
 * that the is*() accessors report. Callers in this class loop until the
 * result is null.
 */
183 public WikiTokenizer nextToken() {
// Remember where this line began so returnToLineStart() can rewind to it.
188 if (justReturnedNewline) {
189 lastLineStart = start;
194 final int len = wikiText.length();
199 // Eat a newline if we're looking at one:
200 final boolean atNewline = wikiText.charAt(end) == '\n';
202 justReturnedNewline = true;
// First token after a newline: the flag is cleared and the first character
// of the line is inspected.
207 if (justReturnedNewline) {
208 justReturnedNewline = false;
210 final char firstChar = wikiText.charAt(end);
// Heading: the opening run of '=' determines the depth.
211 if (firstChar == '=') {
212 final int headerStart = end;
214 while (++end < len && wikiText.charAt(end) == '=') {}
215 final int headerTitleStart = end;
216 headingDepth = headerTitleStart - headerStart;
219 final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
220 final int closingEquals = escapedFindEnd(end, "=");
// If the scan stopped just past an '=', step back so the title ends before it.
221 if (wikiText.charAt(closingEquals - 1) == '=') {
222 end = closingEquals - 1;
227 final int headerTitleEnd = end;
228 headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
// Skip the closing run of '='; its length must equal the opening run.
230 while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
231 final int headerEnd = end;
232 if (headerEnd - headerTitleEnd != headingDepth) {
233 errors.add("Mismatched header depth: " + token());
// List item: consume the run of prefix characters, then scan to the end of the line.
237 if (listChars.indexOf(firstChar) != -1) {
238 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
240 end = escapedFindEnd(start, "\n");
// The three-quote marker is tested before the two-quote one, so the longer marker wins.
245 if (wikiText.startsWith("'''", start)) {
251 if (wikiText.startsWith("''", start)) {
// [[...]]: counted as a wiki link only if no errors have been recorded.
257 if (wikiText.startsWith("[[", start)) {
258 end = escapedFindEnd(start + 2, "]]");
259 isWikiLink = errors.isEmpty();
// {{...}}: counted as a function only if no errors have been recorded.
263 if (wikiText.startsWith("{{", start)) {
264 end = escapedFindEnd(start + 2, "}}");
265 isFunction = errors.isEmpty();
// <pre>, <math> and <!-- tokens run to their closing marker; safeIndexOf
// falls back to the next newline when the closer is missing.
269 if (wikiText.startsWith("<pre>", start)) {
270 end = safeIndexOf(wikiText, start, "</pre>", "\n");
274 if (wikiText.startsWith("<math>", start)) {
275 end = safeIndexOf(wikiText, start, "</math>", "\n");
279 if (wikiText.startsWith("<!--", start)) {
281 end = safeIndexOf(wikiText, start, "-->", "\n");
// A closing marker at token start means there was no matching opener.
285 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
286 errors.add("Close without open!");
// A pipe or equals sign at token start.
291 if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
// Otherwise the token ends where the next wikiTokenEvent match begins.
298 if (this.matcher.find(start)) {
299 end = this.matcher.start(1);
302 errors.add("Empty group: " + this.matcher.group());
// Token ends at the end of the text.
308 end = wikiText.length();
// Recorded errors are reported on stderr together with the current token.
312 if (!errors.isEmpty()) {
313 System.err.println("Errors: " + errors + ", token=" + token());
/**
 * Returns the raw text of the current token, wikiText[start, end).
 * With assertions enabled, checks that a token is either exactly "\n" or
 * does not end with a newline.
 */
319 public String token() {
320 final String token = wikiText.substring(start, end);
321 assert token.equals("\n") || !token.endsWith("\n") : token;
/**
 * Scans forward through wikiTokenEvent matches looking for toFind at nesting
 * depth zero, using tokenStack to skip over nested [[...]] and {{...}}.
 * Pipes and equals signs seen at depth zero are recorded along the way.
 *
 * @param start  position the scan belongs to; also used for error messages
 *               and as the origin of the newline fallback
 * @param toFind the closing text to look for ("\n" is matched via the empty
 *               end-of-line match)
 * @return the index just past the match of toFind; on an error, a message is
 *         added to errors and the result of safeIndexOf(wikiText, start, "\n", "\n")
 *         is returned instead
 */
325 private int escapedFindEnd(final int start, final String toFind) {
326 assert tokenStack.isEmpty();
// Function arguments are only collected when scanning for "}}".
328 final boolean insideFunction = toFind.equals("}}");
331 while (end < wikiText.length()) {
332 if (matcher.find(end)) {
333 final String matchText = matcher.group();
334 final int matchStart = matcher.start();
336 assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
// An empty match is the end-of-line alternative of the pattern.
337 if (matchText.length() == 0) {
338 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
339 if (tokenStack.isEmpty() && toFind.equals("\n")) {
343 } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
344 // The normal return....
// The text since the last pipe is the final argument of the function.
345 if (insideFunction) {
346 addFunctionArg(insideFunction, matchStart);
348 return matcher.end();
// Openers push onto the stack; closers must match the most recent opener.
349 } else if (matchText.equals("[[") || matchText.equals("{{")) {
350 tokenStack.add(matchText);
351 } else if (matchText.equals("]]") || matchText.equals("}}")) {
352 if (tokenStack.size() > 0) {
353 final String removed = tokenStack.remove(tokenStack.size() - 1);
354 if (removed.equals("{{") && !matcher.group().equals("}}")) {
355 errors.add("Unmatched {{ error: " + wikiText.substring(start));
356 return safeIndexOf(wikiText, start, "\n", "\n");
357 } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
358 errors.add("Unmatched [[ error: " + wikiText.substring(start));
359 return safeIndexOf(wikiText, start, "\n", "\n");
// A closer with nothing on the stack.
362 errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n"));
363 // If we were looking for a newline
364 return safeIndexOf(wikiText, start, "\n", "\n");
// Pipes and equals signs only count when not nested inside another [[ ]] / {{ }}.
366 } else if (matchText.equals("|")) {
367 if (tokenStack.isEmpty()) {
368 addFunctionArg(insideFunction, matchStart);
370 } else if (matchText.equals("=")) {
371 if (tokenStack.isEmpty()) {
372 lastUnescapedEqualsPos = matchStart;
374 // Do nothing. These can match spuriously, and if it's not the thing
375 // we're looking for, keep on going.
376 } else if (matchText.equals("<!--")) {
// NOTE(review): indexOf("-->") has no from-index, so it searches from the
// beginning of wikiText rather than from this match. That looks unintended
// (probably should start at matchStart) -- confirm.
377 end = wikiText.indexOf("-->");
379 errors.add("Unmatched <!-- error: " + wikiText.substring(start));
380 return safeIndexOf(wikiText, start, "\n", "\n");
382 } else if (matchText.equals("''")) {
// Any other match text is unexpected.
385 assert false : "Match text='" + matchText + "'";
386 throw new IllegalStateException();
389 // Hmmm, we didn't find the closing symbol we were looking for...
390 errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
391 return safeIndexOf(wikiText, start, "\n", "\n");
394 // Inside the while loop. Just go forward.
395 end = Math.max(end, matcher.end());
/**
 * Called for each unescaped pipe (and for the closing "}}" of a function) to
 * record pipe positions and collect function arguments.
 *
 * @param insideFunction whether the enclosing scan is looking for "}}"
 * @param matchStart     index of the pipe or closer that ends the current argument
 */
400 private void addFunctionArg(final boolean insideFunction, final int matchStart) {
// First delimiter: only the positions are recorded, no argument is added.
401 if (firstUnescapedPipePos == -1) {
402 firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
403 } else if (insideFunction) {
// An '=' after the previous pipe splits the argument into key and value.
404 if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
405 final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
406 final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
407 namedArgs.put(key, value);
// Otherwise the whole text since the previous pipe is a positional argument.
409 final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
410 positionArgs.add(value);
// The next argument starts after this delimiter.
413 lastUnescapedPipePos = matchStart;
/**
 * Looks for target in s at or after start and, failing that, for backup.
 * The returned index is just past the text that was found, except that a
 * "\n" is never stepped over (its own index is returned).
 *
 * @param s      the string to search
 * @param start  index to start searching from
 * @param target the preferred text to find
 * @param backup the text to look for instead (presumably when target is
 *               absent -- the guarding condition should be confirmed)
 * @return the end position as described above
 */
416 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
417 int close = s.indexOf(target, start);
419 // Don't step over a \n.
420 return close + (target.equals("\n") ? 0 : target.length());
422 close = s.indexOf(backup, start);
424 return close + (backup.equals("\n") ? 0 : backup.length());
/**
 * Tokenizes the given wiki text and concatenates a plain-text rendering of it.
 * Plain-text tokens are copied as-is, wiki links contribute their display
 * text, newlines are kept, and function tokens are copied verbatim (markup
 * included).
 *
 * @param sense the wiki text to convert
 * @return the accumulated text
 */
429 public static String toPlainText(String sense) {
430 final WikiTokenizer wikiTokenizer = new WikiTokenizer(sense);
431 final StringBuilder builder = new StringBuilder();
432 while (wikiTokenizer.nextToken() != null) {
433 if (wikiTokenizer.isPlainText()) {
434 builder.append(wikiTokenizer.token());
435 } else if (wikiTokenizer.isWikiLink()) {
436 builder.append(wikiTokenizer.wikiLinkText());
437 } else if (wikiTokenizer.isNewline()) {
438 builder.append("\n");
439 } else if (wikiTokenizer.isFunction()) {
// Functions are not expanded; their raw token text is appended.
440 builder.append(wikiTokenizer.token());
443 return builder.toString();