1 package com.hughes.android.dictionary.parser;
3 import java.util.ArrayList;
5 import java.util.regex.Matcher;
6 import java.util.regex.Pattern;
8 public final class WikiTokenizer {
10 //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
// Matches each position the tokenizer reacts to: "{{", "}}", "[[", "]]", "<!--", "''",
// or -- because of MULTILINE -- the empty position at the end of a line / end of input.
11 private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|<!--|''|$)", Pattern.MULTILINE);
// Characters that mark a list item when they are the first character of a line.
12 private static final String listChars = "*#:;";
// The complete wiki markup being tokenized.
15 final String wikiText;
// Matcher for wikiTokenEvent over wikiText; used by both nextToken() and escapedFind().
16 final Matcher matcher;
// True when the scan position is at the start of a line; nextToken() only looks for
// headers and list items while this is set. Starts true so the first token qualifies.
18 boolean justReturnedNewline = true;
// NOTE(review): presumably the depth of the most recently parsed "=" header; no
// assignment is visible in this view -- confirm before relying on it.
23 public int headerDepth;
// Stack of currently open "[[" / "{{" delimiters, maintained by escapedFind().
25 final List<String> tokenStack = new ArrayList<String>();
/**
 * Creates a tokenizer over the given wiki markup.
 *
 * @param wikiText the text to tokenize; it is stored as-is and a matcher for
 *        wikiTokenEvent is created over it.
 */
27 public WikiTokenizer(final String wikiText) {
28 this.wikiText = wikiText;
29 this.matcher = wikiTokenEvent.matcher(wikiText);
// NOTE(review): presumably resets per-token state before the next token is scanned;
// the body is not visible in this view -- confirm which fields it touches.
32 private void clear() {
/**
 * Advances the tokenizer to the next token of wikiText by choosing a new end position.
 *
 * At the start of a line it looks for a "=" header or a list item (a line whose first
 * character is in listChars). Otherwise it dispatches on the text at the token start:
 * bold/italic quote runs, [[...]] links, {{...}} templates, &lt;pre&gt;, &lt;math&gt;,
 * HTML comments, stray closers, and finally plain text up to the next wikiTokenEvent.
 *
 * @return a WikiTokenizer (the declared return type); the token text itself is read
 *         through token().
 */
39 public WikiTokenizer nextToken() {
44 final int len = wikiText.length();
49 // Eat a newline if we're looking at one:
50 final boolean atNewline = wikiText.charAt(end) == '\n';
52 justReturnedNewline = true;
// Headers and list items are only recognized at the start of a line.
57 if (justReturnedNewline) {
58 final char firstChar = wikiText.charAt(end);
59 if (firstChar == '=') {
// Header: a run of '=', the title text, then a closing run of '='. Each loop
// pre-increments end, so the character at the loop's starting index is not itself tested.
60 final int headerStart = end;
61 while (++end < len && wikiText.charAt(end) == '=') {}
62 final int headerTitleStart = end;
// The title stops at the next '=' or at the end of the line, whichever comes first.
63 while (++end < len && wikiText.charAt(end) != '=' && wikiText.charAt(end) != '\n') {}
64 final int headerTitleEnd = end;
65 while (++end < len && wikiText.charAt(end) == '=') {}
66 final int headerEnd = end;
70 if (listChars.indexOf(firstChar) != -1) {
// List item: skip the run of list-marker characters, then let escapedFind locate the
// newline that ends the item (nested [[ ]] / {{ }} are tracked by escapedFind).
71 while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
72 end = escapedFind(start, "\n");
76 justReturnedNewline = false;
// "'''" is tested before "''" because the shorter prefix would also match it.
78 if (wikiText.startsWith("'''", start)) {
83 if (wikiText.startsWith("''", start)) {
// Links and templates: search for the closer beginning just past the 2-character opener.
88 if (wikiText.startsWith("[[", start)) {
89 end = escapedFind(start + 2, "]]");
93 if (wikiText.startsWith("{{", start)) {
94 end = escapedFind(start + 2, "}}");
// <pre>, <math> and comments are located with safeIndexOf, passing "\n" as the backup
// delimiter to use for an unterminated tag.
98 if (wikiText.startsWith("<pre>", start)) {
99 end = safeIndexOf(wikiText, start, "</pre>", "\n");
103 if (wikiText.startsWith("<math>", start)) {
104 end = safeIndexOf(wikiText, start, "</math>", "\n");
108 if (wikiText.startsWith("<!--", start)) {
109 end = safeIndexOf(wikiText, start, "-->", "\n");
// A closer at the token start has no matching opener; this is reported on stderr.
113 if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
114 System.err.println("Close without open!");
// Plain text: the token runs up to the start of the next wikiTokenEvent match.
120 if (this.matcher.find(start)) {
121 end = this.matcher.start(1);
// NOTE(review): the condition guarding this diagnostic print is not visible in this view.
123 System.err.println(this.matcher.group());
// NOTE(review): presumably the no-further-match case: the token extends to end of text.
129 end = wikiText.length();
/**
 * Returns the text of the current token.
 *
 * @return the substring of wikiText from start (inclusive) to end (exclusive).
 */
134 public String token() {
135 return wikiText.substring(start, end);
/**
 * Scans forward through wikiText for the delimiter toFind, skipping over nested
 * [[...]] and {{...}} pairs by pushing openers onto tokenStack and popping them on closers.
 *
 * @param start  index in wikiText used in the error messages and as the starting point of
 *               the safeIndexOf fallback.
 * @param toFind the delimiter that ends the search; callers in this file pass "]]", "}}" or "\n".
 * @return an index into wikiText: matcher.end() of the matching delimiter on the normal
 *         path, or the result of safeIndexOf(wikiText, start, "\n", "\n") on the visible
 *         error paths.
 * @throws IllegalStateException if the matcher yields text that none of the branches handles.
 */
138 private int escapedFind(final int start, final String toFind) {
// Each search must begin with no delimiters left open from a previous call.
139 assert tokenStack.isEmpty();
142 while (end < wikiText.length()) {
143 if (matcher.find(end)) {
144 final String matchText = matcher.group();
145 final int matchStart = matcher.start();
// An empty match comes from the "$" alternative: end of a line or end of the text.
147 if (matchText.length() == 0) {
148 assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
// A line end only terminates the search when nothing is open and "\n" was requested.
149 if (tokenStack.isEmpty() && toFind.equals("\n")) {
153 } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
154 // The normal return....
155 return matcher.end();
156 } else if (matchText.equals("[[") || matchText.equals("{{")) {
// Nested opener: remember it so that its closer is not mistaken for toFind.
157 tokenStack.add(matchText);
158 } else if (matchText.equals("]]") || matchText.equals("}}")) {
159 if (tokenStack.size() > 0) {
// Pop the innermost opener and check that this closer is of the same kind.
160 final String removed = tokenStack.remove(tokenStack.size() - 1);
161 if (removed.equals("{{") && !matcher.group().equals("}}")) {
162 System.err.println("Unmatched {{ error: " + wikiText.substring(start));
163 return safeIndexOf(wikiText, start, "\n", "\n");
164 } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
165 System.err.println("Unmatched [[ error: " + wikiText.substring(start));
166 return safeIndexOf(wikiText, start, "\n", "\n");
// NOTE(review): in replaceAll the replacement "\\n" is backslash + 'n', which the regex
// replacement syntax reads as a literal 'n' -- newlines are printed as "n", not "\n".
// String.replace("\n", "\\n") looks like the intent -- confirm.
169 System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n"));
170 // If we were looking for a newline
171 return safeIndexOf(wikiText, start, "\n", "\n");
173 } else if (matchText.equals("<!--")) {
// NOTE(review): indexOf("-->") has no from-index, so it searches from the beginning of
// wikiText and can find a "-->" that precedes this comment; indexOf("-->", matchStart)
// looks like the intent -- confirm.
174 end = wikiText.indexOf("-->");
176 System.err.println("Unmatched <!-- error: " + wikiText.substring(start));
// NOTE(review): wikiTokenEvent also matches "''", and no branch for it is visible in this
// view -- confirm that such a match cannot reach the throw below.
179 assert false : "Match text='" + matchText + "'";
180 throw new IllegalStateException();
183 // Hmmm, we didn't find the closing symbol we were looking for...
184 System.err.println("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
185 return safeIndexOf(wikiText, start, "\n", "\n");
188 // Inside the while loop.
// Resume after the match just handled; Math.max keeps end from moving backwards.
189 end = Math.max(end, matcher.end());
194 static int safeIndexOf(final String s, final int start, final String target, final String backup) {
195 int close = s.indexOf(target, start);
197 return close + target.length();
199 close = s.indexOf(backup, start);
201 return close + backup.length();