1 // Copyright 2011 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser;
17 import java.util.ArrayList;
18 import java.util.Arrays;
19 import java.util.Collections;
20 import java.util.List;
22 import junit.framework.TestCase;
24 public class WikiTokenizerTest extends TestCase {
// Exercises [[...]] wiki-link tokenization: a bare link, a piped link
// ([[dest|text]]), a piped link whose display text contains a nested
// {{template}}, and two adjacent links tokenized separately.
// NOTE(review): the initial 'String wikiText = ...' declaration is on a
// source line not visible in this chunk (embedded numbering skips 27-29) —
// confirm against the full file.
26 public void testWikiLink() {
30 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
31 assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
32 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
// A bare link has no explicit destination, so wikiLinkDest() is null.
33 assertNull(new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
// Piped link: destination before '|', display text after.
35 wikiText = "[[abc|def]]";
36 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
37 assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
38 assertEquals("def", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
39 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
// Link text may itself contain a nested template; the final pipe segment
// (template included) is reported as the link text.
41 wikiText = "[[abc|def|ghi{{a|=2}}p]]";
42 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
43 assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
44 assertEquals("ghi{{a|=2}}p", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
45 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
// Adjacent links are separate tokens; chaining nextToken() twice reaches
// the second link.
47 wikiText = "[[abc]][[def]]";
48 assertEquals("[[abc]]", new WikiTokenizer(wikiText).nextToken().token());
49 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
50 assertEquals("def", new WikiTokenizer(wikiText).nextToken().nextToken().wikiLinkText());
// Exercises '*'-prefixed list-item tokenization: unbalanced quote runs,
// templates containing brackets inside a list item, and a wiki link that
// spans a line separator inside a list item.
// NOTE(review): 'wikiText' is declared on an earlier source line not
// visible in this chunk.
54 public void testWikiList() {
// Unbalanced bold/italic apostrophes stay inside the single list-item token.
57 wikiText = "* This is ''bold''' asdf.";
58 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
// The list item ends at the newline; advancing twice yields the newline
// token that terminated the list line.
60 wikiText = "* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}\nasdf\n";
61 assertEquals("* {{a|US}} {{IPA|[ˈfɔɹ.wɝd]]}}", new WikiTokenizer(wikiText).nextToken().token());
62 assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
63 assertEquals("\n", new WikiTokenizer(wikiText).nextToken().nextToken().token());
// A link opened inside a list item continues past a U+2028 line separator;
// the expected token shows it normalized to '\n'.
// NOTE(review): the second operand of this string concatenation is on a
// hidden source line (numbering skips 67) — presumably "asdf]]"; verify.
66 wikiText = "* [[asdf|\u2028" +
68 assertEquals("* [[asdf|\n" +
69 "asdf]]", new WikiTokenizer(wikiText).nextToken().token());
70 assertTrue(new WikiTokenizer(wikiText).nextToken().isListItem());
// Exercises {{...}} template ("function") tokenization: name extraction,
// positional vs. named arguments, nested templates inside argument values,
// and graceful handling of unterminated braces.
// NOTE(review): several declarations and closing braces fall on hidden
// source lines in this chunk (numbering skips, e.g. 82-86).
74 public void testFunction() {
// Dangling "{{" at end of input: drain the tokenizer and check the
// unterminated function token reports an empty name instead of crashing.
78 WikiTokenizer wt = new WikiTokenizer("'''Προστατευόμενη Ονομασία Προέλευσης''', \"Protected Designation of Origin\" {{");
79 while (wt.nextToken() != null) {
80 if (wt.isFunction()) {
81 assertEquals("", wt.functionName());
// Template with no arguments: empty positional and named argument lists.
// NOTE(review): the 'wikiText' assignment for this case is on a hidden line.
87 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
88 assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
89 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
90 assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionPositionArgs().size());
91 assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
// One positional argument.
93 wikiText = "{{abc|def}}";
94 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
95 assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
96 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
97 assertEquals(Collections.singletonList("def"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
98 assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
// A '|' inside an embedded [[...]] link must not split the argument.
100 wikiText = "{{abc|d[[|]]ef|ghi}}";
101 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
102 assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
103 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
104 assertEquals(Arrays.asList("d[[|]]ef", "ghi"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
105 assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
// Mixed positional and named args; named-arg values may themselves be
// whole nested templates, and a bare nested template counts as positional.
107 wikiText = "{{abc|arg1=101|ghi|arg2=202|arg3={{n1|n2=7|n3}}|{{d}}}}";
108 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
109 assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
110 assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
111 assertEquals(Arrays.asList("ghi", "{{d}}"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
112 assertEquals(3, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
113 assertEquals("101", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg1"));
114 assertEquals("202", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg2"));
115 assertEquals("{{n1|n2=7|n3}}", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg3"));
// Mismatched close ('}' instead of '}}'): the token ends at the single '}'.
117 wikiText = "{{gloss|asdf}\nAsdf\n\n";
118 assertEquals("{{gloss|asdf}", new WikiTokenizer(wikiText).nextToken().token());
// Large real-world quote-book template spanning many lines must come back
// as one token, byte-identical to the input.
120 wikiText = "#*{{quote-book|year=1960|author={{w|P. G. Wodehouse}}\n" +
121 "|title={{w|Jeeves in the Offing}}\n" +
122 "|section=chapter XI\n" +
123 "|passage=“I'm sorely beset, Jeeves. Do you recall telling me once about someone who told somebody he could tell him something which would make him think a bit? Knitted socks and porcu\n" +
124 "pines entered into it, I remember.” “I think you may be referring to the ghost of the father of Hamlet, Prince of Denmark, sir. Addressing his son, he said ‘I could a tale unfold whos\n" +
125 "e lightest word would harrow up thy soul, freeze thy young blood, make thy two eyes, like stars, start from their spheres, thy knotted and combined locks to part and each particular h\n" +
126 "air to stand on end like quills upon the fretful '''porpentine'''.’ ” “That's right. Locks, of course, not socks. Odd that he should have said '''porpentine''' when he meant porc\n" +
127 "upine. Slip of the tongue, no doubt, as so often happens with ghosts.”}}";
128 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
// Exercises returnToLineStart(): after consuming tokens, rewinding must
// re-deliver the same tokens from the start of the current line — checked
// on a plain-text line, a heading line, and at end of input.
133 public void testReturn() {
136 wikiText = "hello\n=Heading=\nhello2";
138 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
// Rewind after one token: "hello" is produced again.
140 assertEquals("hello", tokenizer.nextToken().token());
141 tokenizer.returnToLineStart();
142 assertEquals("hello", tokenizer.nextToken().token());
143 assertEquals("\n", tokenizer.nextToken().token());
// Rewind after consuming the newline: the whole first line replays,
// including the newline token.
144 tokenizer.returnToLineStart();
145 assertEquals("hello", tokenizer.nextToken().token());
146 assertEquals("\n", tokenizer.nextToken().token());
// Same replay behavior on a heading line.
148 assertEquals("=Heading=", tokenizer.nextToken().token());
149 tokenizer.returnToLineStart();
150 assertEquals("=Heading=", tokenizer.nextToken().token());
151 assertEquals("\n", tokenizer.nextToken().token());
152 tokenizer.returnToLineStart();
153 assertEquals("=Heading=", tokenizer.nextToken().token());
154 assertEquals("\n", tokenizer.nextToken().token());
// At end of input: rewinding after exhaustion replays the final line,
// then the tokenizer is exhausted again.
156 assertEquals("hello2", tokenizer.nextToken().token());
157 assertNull(tokenizer.nextToken());
158 tokenizer.returnToLineStart();
159 assertEquals("hello2", tokenizer.nextToken().token());
160 assertNull(tokenizer.nextToken());
// Exercises =...= heading tokenization: depth counting, extraction of the
// inner heading text, and the per-token error count for malformed headings
// (unterminated, mismatched '=' runs).
// NOTE(review): several 'wikiText' assignments for the cases below are on
// hidden source lines in this chunk (numbering skips 166-168, 174-176, etc.).
165 public void testWikiHeading() {
// Bare "==" (no closing run): depth 2, empty text, one error reported.
169 assertEquals("==", new WikiTokenizer(wikiText).nextToken().token());
170 assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
171 assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
172 assertEquals("", new WikiTokenizer(wikiText).nextToken().headingWikiText());
173 assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
// Unterminated "=a": still a heading of depth 1, but two errors.
177 assertEquals("=a", new WikiTokenizer(wikiText).nextToken().token());
178 assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
179 assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
180 assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
181 assertEquals(2, new WikiTokenizer(wikiText).nextToken().errors.size());
// Mismatched "=a==": depth taken from the opening run (1), one error.
184 assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token());
185 assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
186 assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
187 assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
188 assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
// Text before the '=' means the line is NOT a heading.
191 assertEquals("a", new WikiTokenizer(wikiText).nextToken().token());
192 assertFalse(new WikiTokenizer(wikiText).nextToken().isHeading());
// Well-formed "=a=": depth 1, text "a", no errors.
195 assertEquals("=a=", new WikiTokenizer(wikiText).nextToken().token());
196 assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
197 assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
198 assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
199 assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
// '=' characters hidden inside [[...]] and {{...}} do not terminate the
// heading; the full decorated line is one error-free heading token.
201 wikiText = "==aa[[|=]] {{|={{=}} }}==";
202 assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
203 assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
204 assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
205 assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText());
206 assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
// End-to-end smoke test: tokenizes one multi-line document mixing comments,
// lists, templates, links, headings, and deliberately mismatched/extra
// terminators, then compares the full token stream against an expected list.
// NOTE(review): this chunk is a lossy extraction — many elements of
// 'expectedTokens' and the declarations of 'i' and 'token' fall on hidden
// source lines (embedded numbering skips 238-258, 303-306, etc.).
212 public void testSimple() {
213 final String wikiText =
215 "Hello =thad| you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
217 "multi-line" + "\n" +
218 "# comment -->" + "\n" +
221 "{{template_not_in_list}}" + "\n" +
222 "# {{template_in_list}}" + "\n" +
223 "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list
224 ": but this is a list!" + "\n" +
225 "*:* and so is this :::" + "\n" +
226 "here's [[some blah|some]] wikitext." + "\n" +
227 "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
228 "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
229 "== Header 2 ==" + "\n" +
230 "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
231 "{{mismatched]]" + "\n" +
232 "[[mismatched}}" + "\n" +
233 "{extraterminated}}" + "\n" +
234 "[extraterminated]]" + "\n" +
235 "=== {{header-template}} ===" + "\n";
// Expected token stream (only a subset is visible in this chunk).
237 final String[] expectedTokens = {
259 "<!--\nmulti-line\n# comment -->",
264 "{{template_not_in_list}}",
266 "# {{template_in_list}}",
272 ": but this is a list!",
274 "*:* and so is this :::",
277 "[[some blah|some]]",
281 "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}",
282 " and some more text.",
286 "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
298 "=== {{header-template}} ===",
302 final List<String> actualTokens = new ArrayList<>();
304 final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
// Drain the tokenizer, checking each token against the expected array as
// we go, then compare the whole collected list at the end.
307 while ((token = wikiTokenizer.nextToken()) != null) {
308 actualTokens.add(token.token());
// Debug aid: echoes each token in copy-pasteable form for regenerating
// the expected array.
309 System.out.println("\"" + token.token().replace("\n", "\\n") + "\",");
310 assertEquals(expectedTokens[i++], token.token());
312 assertEquals(Arrays.asList(expectedTokens), actualTokens);
// Exercises raw HTML-ish container tags (<pre>, <math>): their contents are
// captured as single tokens, both mid-line and after a leading newline, and
// a <math> embedded inside a quoted list item stays within the list token.
// NOTE(review): this method continues past the end of the visible chunk
// (its closing brace is not shown), and the three 'final WikiTokenizer
// tokenizer' declarations below must each sit in a separate hidden scope
// block (numbering skips 326-327, 331-333) — verify against full source.
315 public void testHtml() {
319 wikiText = " zz <pre> asdf </pre> ZZ <math> 1234 </math> XX ";
320 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
// Surrounding whitespace runs and each tag-delimited span are separate tokens.
321 assertEquals(" zz ", tokenizer.nextToken().token());
322 assertEquals("<pre> asdf </pre>", tokenizer.nextToken().token());
323 assertEquals(" ZZ ", tokenizer.nextToken().token());
324 assertEquals("<math> 1234 </math>", tokenizer.nextToken().token());
325 assertEquals(" XX ", tokenizer.nextToken().token());
// Leading newline is its own token; the <math> block follows it.
328 wikiText = "\n<math> 1234 </math>";
329 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
330 assertEquals("<math> 1234 </math>", tokenizer.nextToken().nextToken().token());
// <math> inside an italicized quote in a list item: the whole line is one
// list-item token, byte-identical to the input.
334 wikiText = "# z'' is the '''free''' variable in \"<math>\\forall x\\exists y:xy=z</math>\".''";
335 final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
336 assertEquals(wikiText, tokenizer.nextToken().token());