--- /dev/null
+dictInputs
+dictOutputs/
+bin
});
// Check it once:
- assertFilesEqual("testdata/wiktionary.it.golden", "testdata/wiktionary.it.test");
+ assertFilesEqual("testdata/wiktionary.it.golden2", "testdata/wiktionary.it.test");
// Check it again.
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
-import com.hughes.android.dictionary.engine.DictionaryBuilder;
import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
public class EnWiktionaryXmlParser {
+ // TODO: look for {{ and [[ and <adf> <!-- in output.
+ // TODO: process {{ttbc}} lines
+
// Matches a section heading that names a part of speech (or a character
// class such as "Hanzi"). Every alternative must end with '|' except the
// last one; a missing separator silently fuses two alternatives into one.
static final Pattern partOfSpeechHeader = Pattern.compile(
"Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
"Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
"Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
"Particle|Pronominal adverb|" +
"Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
-
+
final IndexBuilder enIndexBuilder;
final IndexBuilder otherIndexBuilder;
final Pattern langPattern;
}
}
- private void parseSection(final String title, final String heading, final String text) {
+ private void parseSection(final String title, String heading, final String text) {
if (title.startsWith("Wiktionary:") ||
title.startsWith("Template:") ||
title.startsWith("Appendix:") ||
return;
}
- if (heading.replaceAll("=", "").equals("English")) {
+ heading = heading.replaceAll("=", "").trim();
+ if (heading.equals("English")) {
doEnglishWord(title, text);
- } else {
+ } else if (langPattern.matcher(heading).matches()){
doForeignWord(title, text);
}
int posDepth = -1;
private void doEnglishWord(String title, String text) {
- final WikiLineReader wikiLineReader = new WikiLineReader(text);
- String line;
- while ((line = wikiLineReader.readLine()) != null) {
- final WikiHeading wikiHeading = WikiHeading.getHeading(line);
- if (wikiHeading != null) {
+ final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
+ while (wikiTokenizer.nextToken() != null) {
+
+ if (wikiTokenizer.isHeading()) {
+ final String headerName = wikiTokenizer.headingWikiText();
- if (wikiHeading.depth <= posDepth) {
+ if (wikiTokenizer.headingDepth() <= posDepth) {
pos = null;
posDepth = -1;
}
- if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
- posDepth = wikiHeading.depth;
- pos = wikiHeading.name;
- } else if (wikiHeading.name.equals("Translations")) {
- doTranslations(title, wikiLineReader);
- } else if (wikiHeading.name.equals("Pronunciation")) {
+ if (partOfSpeechHeader.matcher(headerName).matches()) {
+ posDepth = wikiTokenizer.headingDepth();
+ pos = wikiTokenizer.headingWikiText();
+ } else if (headerName.equals("Translations")) {
+ doTranslations(title, wikiTokenizer);
+ } else if (headerName.equals("Pronunciation")) {
//doPronunciation(wikiLineReader);
}
}
"Jpan", "Kore", "Hebr", "rfscript", "Beng", "Mong", "Knda", "Cyrs",
"yue-tsj", "Mlym", "Tfng", "Grek", "yue-yue-j"));
- private void doTranslations(final String title, final WikiLineReader wikiLineReader) {
- String line;
+ private void doTranslations(final String title, final WikiTokenizer wikiTokenizer) {
String sense = null;
boolean done = false;
- while ((line = wikiLineReader.readLine()) != null) {
- if (WikiHeading.getHeading(line) != null) {
- wikiLineReader.stuffLine(line);
+ while (wikiTokenizer.nextToken() != null) {
+ if (wikiTokenizer.isHeading()) {
+ wikiTokenizer.returnToLineStart();
return;
}
if (done) {
//line = WikiLineReader.removeSquareBrackets(line);
- if (line.startsWith("{{")) {
+ if (wikiTokenizer.isFunction()) {
+ final String functionName = wikiTokenizer.functionName();
+ final List<String> positionArgs = wikiTokenizer.functionPositionArgs();
- WikiFunction wikiFunction;
- while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
- if (wikiFunction.name.equals("trans-top")) {
- sense = null;
- if (wikiFunction.args.size() >= 1) {
- sense = wikiFunction.args.get(0);
- //System.out.println("Sense: " + sense);
- }
- } else if (wikiFunction.name.equals("trans-bottom")) {
- sense = null;
- } else if (wikiFunction.name.equals("trans-mid")) {
- } else if (wikiFunction.name.equals("trans-see")) {
- } else if (wikiFunction.name.startsWith("checktrans")) {
- done = true;
- } else {
- System.err.println("Unexpected translation wikifunction: " + line + ", title=" + title);
+ if (functionName.equals("trans-top")) {
+ sense = null;
+ if (wikiTokenizer.functionPositionArgs().size() >= 1) {
+ sense = positionArgs.get(0);
+ // TODO: could emphasize words in [[brackets]] inside sense.
+ sense = WikiTokenizer.toPlainText(sense);
+ //System.out.println("Sense: " + sense);
}
- line = wikiFunction.replaceWith(line, "");
-
+ } else if (functionName.equals("trans-bottom")) {
+ sense = null;
+ } else if (functionName.equals("trans-mid")) {
+ } else if (functionName.equals("trans-see")) {
+ } else if (functionName.startsWith("checktrans")) {
+ //TODO: Check this: done = true;
+ } else {
+ System.err.println("Unexpected translation wikifunction: " + wikiTokenizer.token() + ", title=" + title);
}
-
- } else if (line.startsWith("*")) {
+ } else if (wikiTokenizer.isListItem() && wikiTokenizer.listItemPrefix().startsWith("*")) {
+ final String line = wikiTokenizer.listItemWikiText();
// This line could produce an output...
// First strip the language and check whether it matches.
String rest = line.substring(colonIndex + 1).trim();
doTranslationLine(line, title, sense, rest);
- } else if (line.equals("")) {
- } else if (line.startsWith(":")) {
- } else if (line.startsWith("[[") && line.endsWith("]]")) {
- } else if (line.startsWith("''See''")) {
- } else if (line.startsWith("''")) {
- } else if (line.equals("----")) {
+ } else if (wikiTokenizer.remainderStartsWith("''See''")) {
+ wikiTokenizer.nextLine();
+ System.out.println("Skipping line: " + wikiTokenizer.token());
+ } else if (wikiTokenizer.isWikiLink()) {
+ final String wikiLink = wikiTokenizer.wikiLinkText();
+ if (wikiLink.contains(":") && wikiLink.contains(title)) {
+ } else if (wikiLink.contains("Category:")) {
+ } else {
+ System.err.println("Unexpected wikiLink: " + wikiTokenizer.token() + ", title=" + title);
+ }
+ } else if (wikiTokenizer.isNewline() || wikiTokenizer.isMarkup() || wikiTokenizer.isComment()) {
} else {
- System.err.println("Unexpected translation line: " + line + ", title=" + title);
+ final String token = wikiTokenizer.token();
+ if (token.equals("----")) {
+ } else {
+ System.err.println("Unexpected translation token: " + wikiTokenizer.token() + ", title=" + title);
+ }
}
}
-
}
- private void doTranslationLine(final String line, final String title, final String sense, String rest) {
-
+ /** Returns the element at {@code index}, or null when the index is at or past the end of the list. */
+ private static <T> T get(final List<T> list, final int index) {
+   if (index >= list.size()) {
+     return null;
+   }
+   return list.get(index);
+ }
+
+ private void doTranslationLine(final String line, final String title, final String sense, final String rest) {
// Good chance we'll actually file this one...
final PairEntry pairEntry = new PairEntry();
final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
final StringBuilder otherText = new StringBuilder();
-
- WikiFunction wikiFunction;
- while ((wikiFunction = WikiFunction.getFunction(rest)) != null) {
- if (wikiFunction.start > 0) {
- String plainText = rest.substring(0, wikiFunction.start);
- otherText.append("").append(plainText);
- otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
- }
- rest = rest.substring(wikiFunction.end);
+ final WikiTokenizer wikiTokenizer = new WikiTokenizer(rest);
+ while (wikiTokenizer.nextToken() != null) {
- if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
- if (wikiFunction.args.size() < 2) {
- System.err.println("{{t}} with too few args: " + line + ", title=" + title);
- continue;
- }
- final String langCode = wikiFunction.getArg(0);
- if (this.langCodePattern.matcher(langCode).matches()) {
- final String word = wikiFunction.getArg(1);
- final String gender = wikiFunction.getArg(2);
- final String transliteration = wikiFunction.getNamedArg("tr");
- if (otherText.length() > 0) {
- otherText.append("");
+ if (wikiTokenizer.isPlainText()) {
+ final String plainText = wikiTokenizer.token();
+ otherText.append("").append(plainText);
+ otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+
+ } else if (wikiTokenizer.isWikiLink()) {
+ final String plainText = wikiTokenizer.wikiLinkText();
+ otherText.append("").append(plainText);
+ otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_TRANSLATION_WIKI_TEXT);
+
+ } else if (wikiTokenizer.isFunction()) {
+ final String functionName = wikiTokenizer.functionName();
+ final List<String> args = wikiTokenizer.functionPositionArgs();
+ final Map<String,String> namedArgs = wikiTokenizer.functionNamedArgs();
+
+ if (functionName.equals("t") || functionName.equals("t+") || functionName.equals("t-") || functionName.equals("tø")) {
+ if (args.size() < 2) {
+ System.err.println("{{t}} with too few args: " + line + ", title=" + title);
+ continue;
}
- otherText.append(word);
- otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
- if (gender != null) {
- otherText.append(String.format(" {%s}", gender));
+ final String langCode = get(args, 0);
+ if (this.langCodePattern.matcher(langCode).matches()) {
+ final String word = get(args, 1);
+ final String gender = get(args, 2);
+ final String transliteration = namedArgs.get("tr");
+ if (otherText.length() > 0) {
+ otherText.append("");
+ }
+ otherText.append(word);
+ otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+ if (gender != null) {
+ otherText.append(String.format(" {%s}", gender));
+ }
+ if (transliteration != null) {
+ otherText.append(String.format(" (tr. %s)", transliteration));
+ otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+ }
}
- if (transliteration != null) {
- otherText.append(String.format(" (tr. %s)", transliteration));
- otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+ } else if (functionName.equals("qualifier")) {
+ String qualifier = args.get(0);
+ if (!namedArgs.isEmpty() || args.size() > 1) {
+ System.err.println("weird qualifier: " + line);
}
+ otherText.append("(").append(qualifier).append(")");
+ } else if (encodings.contains(functionName)) {
+ otherText.append("").append(args.get(0));
+ otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+ } else if (functionName.equals("m") || functionName.equals("f") || functionName.equals("n") || functionName.equals("p")) {
+ otherText.append("{");
+ otherText.append(functionName);
+ for (int i = 0; i < args.size(); ++i) {
+ otherText.append("|").append(args.get(i));
+ }
+ otherText.append("}");
+ } else if (functionName.equals("g")) {
+ otherText.append("{g}");
+ } else if (functionName.equals("l")) {
+ // encodes text in various langs.
+ // lang is arg 0.
+ otherText.append("").append(args.get(1));
+ otherIndexBuilder.addEntryWithString(indexedEntry, args.get(1), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+ // TODO: transliteration
+ } else if (functionName.equals("term")) {
+ // cross-reference to another dictionary
+ otherText.append("").append(args.get(0));
+ otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+ // TODO: transliteration
+ } else if (functionName.equals("italbrac") || functionName.equals("gloss")) {
+ // TODO: put this text aside to use it.
+ otherText.append("[").append(args.get(0)).append("]");
+ otherIndexBuilder.addEntryWithString(indexedEntry, args.get(0), EntryTypeName.WIKTIONARY_TRANSLATION_OTHER_TEXT);
+ } else if (functionName.equals("ttbc")) {
+ } else if (functionName.equals("trreq")) {
+ } else if (functionName.equals("not used")) {
+ otherText.append("(not used)");
+ } else if (functionName.equals("t-image")) {
+ // American sign language
+ } else if (args.isEmpty() && namedArgs.isEmpty()) {
+ otherText.append("{UNK. FUNC.: ").append(functionName).append("}");
+ } else {
+ System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
}
- } else if (wikiFunction.name.equals("qualifier")) {
- String qualifier = wikiFunction.getArg(0);
- if (!wikiFunction.namedArgs.isEmpty() || wikiFunction.args.size() > 1) {
- System.err.println("weird qualifier: " + line);
- }
- otherText.append("(").append(qualifier).append(")");
- } else if (encodings.contains(wikiFunction.name)) {
- otherText.append("").append(wikiFunction.getArg(0));
- otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
- } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
- otherText.append("{");
- otherText.append(wikiFunction.name);
- for (int i = 0; i < wikiFunction.args.size(); ++i) {
- otherText.append("|").append(wikiFunction.getArg(i));
- }
- otherText.append("}");
- } else if (wikiFunction.name.equals("g")) {
- otherText.append("{g}");
- } else if (wikiFunction.name.equals("l")) {
- // encodes text in various langs.
- // lang is arg 0.
- otherText.append("").append(wikiFunction.getArg(1));
- otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(1), EntryTypeName.WIKTIONARY_OTHER_TEXT);
- // TODO: transliteration
- } else if (wikiFunction.name.equals("term")) {
- // cross-reference to another dictionary
- otherText.append("").append(wikiFunction.getArg(0));
- otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
- // TODO: transliteration
- } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
- // TODO: put this text aside to use it.
- otherText.append("[").append(wikiFunction.getArg(0)).append("]");
- otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
- } else if (wikiFunction.name.equals("ttbc")) {
- } else if (wikiFunction.name.equals("trreq")) {
- } else if (wikiFunction.name.equals("not used")) {
- otherText.append("(not used)");
- } else if (wikiFunction.name.equals("t-image")) {
- // American sign language
- } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
- otherText.append("{UNK. FUNC.: ").append(wikiFunction.name).append("}");
+
+ } else if (wikiTokenizer.isNewline()) {
+ assert false;
+ } else if (wikiTokenizer.isComment()) {
+ } else if (wikiTokenizer.isMarkup()) {
} else {
- System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
+ System.err.println("Bad translation token: " + wikiTokenizer.token());
}
+
}
- String plainText = rest;
- otherText.append("").append(plainText);
- otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
StringBuilder englishText = new StringBuilder();
}
enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
- final Pair pair = new Pair(englishText.toString(), WikiParser.simpleParse(otherText.toString()), swap);
+ final Pair pair = new Pair(trim(englishText.toString()), trim(otherText.toString()), swap);
pairEntry.pairs.add(pair);
assert (pairsAdded.add(pair.toString()));
if (pair.toString().equals("libero {m} :: free (adjective)")) {
}
+ /** Matches any run of one or more whitespace characters. */
+ static final Pattern whitespace = Pattern.compile("\\s+");
+
+ /** Collapses every whitespace run in {@code s} to a single space, then trims both ends. */
+ static String trim(final String s) {
+   final String collapsed = whitespace.matcher(s).replaceAll(" ");
+   return collapsed.trim();
+ }
+
Set<String> pairsAdded = new LinkedHashSet<String>();
// -------------------------------------------------------------------------
- private void doForeignWord(String title, String text) {
- final WikiLineReader wikiLineReader = new WikiLineReader(text);
- String line;
- while ((line = wikiLineReader.readLine()) != null) {
- final WikiHeading wikiHeading = WikiHeading.getHeading(line);
- if (wikiHeading != null) {
- if (wikiHeading.name.equals("Translations")) {
+ private void doForeignWord(final String title, final String text) { // Scans a language section's wiki text token by token and dispatches on each heading found.
+ final WikiTokenizer wikiTokenizer = new WikiTokenizer(text);
+ while (wikiTokenizer.nextToken() != null) {
+ if (wikiTokenizer.isHeading()) {
+ final String headingName = wikiTokenizer.headingWikiText();
+ if (headingName.equals("Translations")) { // Not expected outside the English section; only reported.
System.err.println("Translations not in English section: " + title);
- } else if (wikiHeading.name.equals("Pronunciation")) {
+ } else if (headingName.equals("Pronunciation")) { // Pronunciation sections are currently ignored.
//doPronunciation(wikiLineReader);
- } else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
- doPartOfSpeech(title, wikiHeading, wikiLineReader);
+ } else if (partOfSpeechHeader.matcher(headingName).matches()) {
+ doPartOfSpeech(title, headingName, wikiTokenizer.headingDepth(), wikiTokenizer); // Shares this tokenizer, so it advances past the section's tokens.
}
+ } else {
}
}
}
- private void doPartOfSpeech(String title, final WikiHeading posHeading, WikiLineReader wikiLineReader) {
- String line;
+ private void doPartOfSpeech(String title, final String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) { // Processes one part-of-speech section; stops at a heading of depth <= posDepth or at end of input.
System.out.println("***" + title);
- System.out.println(posHeading.name);
- while ((line = wikiLineReader.readLine()) != null) {
- WikiHeading heading = WikiHeading.getHeading(line);
- if (heading != null) {
- if (heading.depth <= posHeading.depth) {
- wikiLineReader.stuffLine(line);
+ System.out.println(posHeading);
+ //final StringBuilder foreignBuilder = new StringBuilder();
+
+ String side = null; // Text for the foreign side of each pair; stays null unless an it-noun template sets it.
+ Collection<String> forms = Collections.emptyList(); // Word forms the entries are indexed under; set by it-noun.
+
+ int currentHeadingDepth = posDepth;
+ while (wikiTokenizer.nextToken() != null) {
+ if (wikiTokenizer.isHeading()) {
+ currentHeadingDepth = wikiTokenizer.headingDepth();
+
+ if (currentHeadingDepth <= posDepth) {
+ wikiTokenizer.returnToLineStart(); // Rewind before returning so the heading is not lost to the caller.
return;
}
}
- System.out.println(line);
+ if (currentHeadingDepth > posDepth) { // Everything inside deeper subsections is skipped for now.
+ // TODO
+ continue;
+ }
+ if (wikiTokenizer.isFunction()) {
+ final String name = wikiTokenizer.functionName();
+ final List<String> args = wikiTokenizer.functionPositionArgs();
+ final Map<String,String> namedArgs = wikiTokenizer.functionNamedArgs();
+ // First line is generally a repeat of the title with some extra information.
+ // We need to build up the left side (foreign text, tokens) separately from the
+ // right side (English). The left-side may get paired with multiple right sides.
+ // The left side should get filed under every form of the word in question (singular, plural).
+
+ // For verbs, the conjugation comes later on in a deeper section.
+ // Ideally, we'd want to file every English entry with the verb
+ // under every verb form coming from the conjugation.
+ // Ie. under "fa": see: "make :: fare" and "do :: fare"
+ // But then where should we put the conjugation table?
+ // I think just under fare. But then we need a way to link to the entry (actually the row, since entries doesn't show up!)
+ // for the conjugation table from "fa".
+ // Would like to be able to link to a lang#token.
+ if (name.equals("it-noun")) {
+ assert forms.isEmpty();
+ final String base = get(args, 0);
+ final String gender = get(args, 1);
+ final String singular = base + get(args, 2); // NOTE(review): get() returns null for a missing arg, which concatenates as "null" - confirm args 2 and 3 are always present.
+ final String plural = base + get(args, 3);
+ side = String.format("%s {%s}, %s {pl}", singular, gender, plural, plural); // NOTE(review): the last argument has no matching format specifier and is ignored.
+ forms = Arrays.asList(singular, plural);
+ } else if (name.equals("it-proper noun")) {
+ // TODO
+ } else if (name.equals("it-adj")) {
+ // TODO
+ } else if (name.startsWith("it-conj")) {
+ if (name.equals("it-conj-are")) {
+ itConjAre(args, namedArgs);
+ } else if (name.equals("it-conj-ere")) {
+ } else if (name.equals("it-conj-ire")) {
+ } else {
+ System.err.println("Unknown conjugation: " + wikiTokenizer.token());
+ }
+
+ } else {
+ System.err.println("Unknown function: " + wikiTokenizer.token());
+ }
+
+ } else if (wikiTokenizer.isListItem()) {
+ handleForeignListItem(side != null ? side : title, title, forms, wikiTokenizer); // Falls back to the title when no template has produced a side string.
+
+ } else if (wikiTokenizer.isWikiLink()) {
+
+ } else {
+ }
+
+ }
+ }
+
+ private void itConjAre(List<String> args, Map<String, String> namedArgs) {
+ final String base = args.get(0);
+ final String aux = args.get(1);
+
+ putIfMissing(namedArgs, "inf", base + "are");
+ putIfMissing(namedArgs, "aux", aux);
+ putIfMissing(namedArgs, "ger", base + "ando");
+ putIfMissing(namedArgs, "presp", base + "ante");
+ putIfMissing(namedArgs, "pastp", base + "ato");
+ // Present
+ putIfMissing(namedArgs, "pres1s", base + "o");
+ putIfMissing(namedArgs, "pres2s", base + "i");
+ putIfMissing(namedArgs, "pres3s", base + "a");
+ putIfMissing(namedArgs, "pres1p", base + "iamo");
+ putIfMissing(namedArgs, "pres2p", base + "ate");
+ putIfMissing(namedArgs, "pres3p", base + "ano");
+ // Imperfect
+ putIfMissing(namedArgs, "imperf1s", base + "avo");
+ putIfMissing(namedArgs, "imperf2s", base + "avi");
+ putIfMissing(namedArgs, "imperf3s", base + "ava");
+ putIfMissing(namedArgs, "imperf1p", base + "avamo");
+ putIfMissing(namedArgs, "imperf2p", base + "avate");
+ putIfMissing(namedArgs, "imperf3p", base + "avano");
+ // Passato remoto
+ putIfMissing(namedArgs, "prem1s", base + "ai");
+ putIfMissing(namedArgs, "prem2s", base + "asti");
+ putIfMissing(namedArgs, "prem3s", base + "ò");
+ putIfMissing(namedArgs, "prem1p", base + "ammo");
+ putIfMissing(namedArgs, "prem2p", base + "aste");
+ putIfMissing(namedArgs, "prem3p", base + "arono");
+ // Future
+ putIfMissing(namedArgs, "fut1s", base + "erò");
+ putIfMissing(namedArgs, "fut2s", base + "erai");
+ putIfMissing(namedArgs, "fut3s", base + "erà");
+ putIfMissing(namedArgs, "fut1p", base + "eremo");
+ putIfMissing(namedArgs, "fut2p", base + "erete");
+ putIfMissing(namedArgs, "fut3p", base + "eranno");
+ // Conditional
+ putIfMissing(namedArgs, "cond1s", base + "erei");
+ putIfMissing(namedArgs, "cond2s", base + "eresti");
+ putIfMissing(namedArgs, "cond3s", base + "erebbe");
+ putIfMissing(namedArgs, "cond1p", base + "eremmo");
+ putIfMissing(namedArgs, "cond2p", base + "ereste");
+ putIfMissing(namedArgs, "cond3p", base + "erebbero");
+ // Subjunctive / congiuntivo
+ putIfMissing(namedArgs, "sub123s", base + "i");
+ putIfMissing(namedArgs, "sub1p", base + "iamo");
+ putIfMissing(namedArgs, "sub2p", base + "iate");
+ putIfMissing(namedArgs, "sub3p", base + "ino");
+ // Imperfect subjunctive
+ putIfMissing(namedArgs, "impsub12s", base + "assi");
+ putIfMissing(namedArgs, "impsub3s", base + "asse");
+ putIfMissing(namedArgs, "impsub1p", base + "assimo");
+ putIfMissing(namedArgs, "impsub2p", base + "aste");
+ putIfMissing(namedArgs, "impsub3p", base + "assero");
+ // Imperative
+ putIfMissing(namedArgs, "imp2s", base + "a");
+ putIfMissing(namedArgs, "imp3s", base + "i");
+ putIfMissing(namedArgs, "imp1p", base + "iamo");
+ putIfMissing(namedArgs, "imp2p", base + "ate");
+ putIfMissing(namedArgs, "imp3p", base + "ino");
+
+
+ itConj(args, namedArgs);
+ }
+
+
+ /** Stores {@code value} under {@code key} unless the map already holds a non-empty value for that key. */
+ private void putIfMissing(final Map<String, String> namedArgs, final String key,
+     final String value) {
+   final String existing = namedArgs.get(key);
+   final boolean alreadySet = existing != null && existing.length() > 0;
+   if (alreadySet) {
+     return;
+   }
+   namedArgs.put(key, value);
+ }
+
+ // TODO: check how ='' and =| are manifested....
+
+ /**
+  * Stores {@code value} under {@code key} when the map has no value for it.
+  * An existing value of two apostrophes ('') is replaced by the empty string;
+  * any other existing value, including an empty string, is left untouched.
+  */
+ private void putOrNullify(final Map<String, String> namedArgs, final String key,
+     final String value) {
+   final String existing = namedArgs.get(key);
+   if (existing == null) {
+     namedArgs.put(key, value);
+     return;
+   }
+   if ("''".equals(existing)) {
+     namedArgs.put(key, "");
+   }
+ }
+
+ final List<String> listPrefixes = new ArrayList<String>(); // Scratch list: cleared and refilled by handleForeignListItem with the prefix of each collected list item.
+ final List<String> listLines = new ArrayList<String>(); // Scratch list filled alongside listPrefixes with the wiki text of each collected list item.
+
+static final Pattern UNINDEXED_WIKI_TEXT = Pattern.compile( // Plain wiki-link text containing a match is appended to the definition but not added to the English index.
+ "(first|second|third)-person (singular|plural)|" +
+ "present tense|" +
+ "imperative"
+ );
+
+ /**
+  * Turns one top-level list item of a foreign-language entry into an
+  * (english, foreignText) pair and indexes it.
+  *
+  * The item at the tokenizer's current position is collected together with
+  * any following, more deeply nested items whose prefix extends this item's
+  * prefix; newlines and comments in between are skipped. Only the first
+  * collected line is parsed into English text; the nested lines are
+  * currently just printed. Items whose own prefix is longer than one
+  * character are reported and ignored.
+  *
+  * @param foreignText text stored on the foreign side of the pair
+  * @param title page title, added to the other-language index
+  * @param forms additional word forms the entry is indexed under
+  * @param wikiTokenizer tokenizer positioned on the list item to handle
+  */
+ private void handleForeignListItem(final String foreignText, String title, final Collection<String> forms, final WikiTokenizer wikiTokenizer) {
+
+   final String prefix = wikiTokenizer.listItemPrefix();
+   if (prefix.length() > 1) {
+     System.err.println("Prefix too long: " + wikiTokenizer.token());
+     return;
+   }
+
+   listPrefixes.clear();
+   listLines.clear();
+   listPrefixes.add(prefix);
+   listLines.add(wikiTokenizer.listItemWikiText());
+   // The null check has to guard every other test. Without the outer
+   // parentheses, && binds tighter than ||, so the comment/list-item tests
+   // were still evaluated after the tokenizer had run out of input.
+   while (wikiTokenizer.nextToken() != null &&
+       (wikiTokenizer.isNewline() ||
+        wikiTokenizer.isComment() ||
+        (wikiTokenizer.isListItem() &&
+         wikiTokenizer.listItemPrefix().length() > prefix.length() &&
+         wikiTokenizer.listItemPrefix().startsWith(prefix)))) {
+     if (wikiTokenizer.isListItem()) {
+       listPrefixes.add(wikiTokenizer.listItemPrefix());
+       listLines.add(wikiTokenizer.listItemWikiText());
+     }
+   }
+   // NOTE(review): this advances one more token before rewinding; confirm
+   // that returnToLineStart() really restores the token that ended the loop.
+   if (wikiTokenizer.nextToken() != null) {
+     wikiTokenizer.returnToLineStart();
+   }
+   System.out.println("list lines: " + listLines);
+   System.out.println("list prefixes: " + listPrefixes);
+
+   final PairEntry pairEntry = new PairEntry();
+   final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+
+   final StringBuilder englishBuilder = new StringBuilder();
+
+   final String mainLine = listLines.get(0);
+
+   final WikiTokenizer englishTokenizer = new WikiTokenizer(mainLine);
+   while (englishTokenizer.nextToken() != null) {
+     // TODO handle form of....
+     if (englishTokenizer.isPlainText()) {
+       englishBuilder.append(englishTokenizer.token());
+       enIndexBuilder.addEntryWithString(indexedEntry, englishTokenizer.token(), EntryTypeName.WIKTIONARY_ENGLISH_DEF);
+     } else if (englishTokenizer.isWikiLink()) {
+       final String text = englishTokenizer.wikiLinkText();
+       final String link = englishTokenizer.wikiLinkDest();
+       if (link != null) {
+         if (link.contains("#English")) {
+           englishBuilder.append(text);
+           enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+         } else if (link.contains("#") && this.langPattern.matcher(link).find()) {
+           englishBuilder.append(text);
+           otherIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_OTHER_LANG);
+         } else {
+           System.err.println("Special link: " + englishTokenizer.token());
+           // TODO: something here...
+         }
+       } else {
+         // link == null
+         englishBuilder.append(text);
+         if (!UNINDEXED_WIKI_TEXT.matcher(text).find()) {
+           enIndexBuilder.addEntryWithString(indexedEntry, text, EntryTypeName.WIKTIONARY_ENGLISH_DEF_WIKI_LINK);
+         }
+       }
+     } else if (englishTokenizer.isFunction()) {
+       final String name = englishTokenizer.functionName();
+       if (name.contains(" conjugation of ") ||
+           name.contains(" form of ") ||
+           name.contains(" feminine of ") ||
+           name.contains(" plural of ")) {
+         // Ignore these in the index, they're really annoying....
+         englishBuilder.append(englishTokenizer.token());
+       } else {
+         System.err.println("Unexpected function: " + englishTokenizer.token());
+       }
+     } else {
+       if (englishTokenizer.isComment() || englishTokenizer.isMarkup()) {
+       } else {
+         System.err.println("Unexpected definition text: " + englishTokenizer.token());
+       }
+     }
+   }
+   // An item that yields no English text produces no pair and no index entries for title/forms.
+   final String english = trim(englishBuilder.toString());
+   if (english.length() > 0) {
+     final Pair pair = new Pair(english, trim(foreignText), this.swap);
+     pairEntry.pairs.add(pair);
+     otherIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+     for (final String form : forms) {
+       otherIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_FORM_SINGLE, EntryTypeName.WIKTIONARY_FORM_MULTI);
+     }
}
}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-import java.util.List;
-import java.util.Map;
-
-
-public interface WikiCallback {
-
- void onComment(final String text);
-
- void onFormatBold(final boolean boldOn);
- void onFormatItalic(final boolean italicOn);
-
- void onWikiLink(final String[] args);
-
- void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs);
-
- // Will never contain a newline unless it's in a <pre>
- void onText(final String text);
-
- // Only at start of line.
- void onHeadingStart(final int depth);
- void onHeadingEnd(final int depth);
-
-
- void onNewLine();
- void onNewParagraph();
-
- void onListItemStart(final String header, final int[] section);
- void onListItemEnd(final String header, final int[] section);
-
- // Errors
- void onUnterminated(final String start, String rest);
- void onInvalidHeaderEnd(String rest);
-
-}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class WikiFunction {
-
- public int start;
- public int end;
- public String name = "";
- public final List<String> args = new ArrayList<String>();;
- public final Map<String,String> namedArgs = new LinkedHashMap<String, String>();
-
- private static final Pattern functionEvent = Pattern.compile("\\{\\{|\\[\\[|\\}\\}|\\]\\]|=|\\|");
-
- public static WikiFunction getFunction(String line) {
- final int start = line.indexOf("{{");
- if (start == -1) {
- return null;
- }
- final WikiFunction result = new WikiFunction();
- result.start = start;
-
- final Matcher matcher = functionEvent.matcher(line);
- int depth = 1;
- int end = start + 2;
- int lastPipe = end;
- int lastEquals = -1;
- while (end < line.length() && matcher.find(end)) {
- end = matcher.end();
- if (matcher.group().equals("{{") || matcher.group().equals("[[")) {
- ++depth;
- } else if (matcher.group().equals("}}") || matcher.group().equals("]]")) {
- --depth;
- if (depth == 0) {
- break;
- }
- } else if (matcher.group().equals("|") && depth == 1) {
- if (lastEquals != -1) {
- result.namedArgs.put(line.substring(lastPipe, lastEquals), line.substring(lastEquals + 1, matcher.start()));
- } else {
- result.args.add(line.substring(lastPipe, matcher.start()));
- }
- lastPipe = matcher.end();
- lastEquals = -1;
- } else if (matcher.group().equals("=") && depth == 1) {
- lastEquals = matcher.start();
- }
- }
- if (depth > 0) {
- System.err.println("Invalid function: " + line);
- return null;
- }
-
- if (lastEquals != -1) {
- result.namedArgs.put(line.substring(lastPipe, lastEquals), line.substring(lastEquals + 1, matcher.start()));
- } else {
- result.args.add(line.substring(lastPipe, matcher.start()));
- }
- result.end = matcher.end();
- if (result.args.size() > 0) {
- result.name = result.args.remove(0);
- } else {
- System.err.println("Funnction unnamed: " + line);
- }
-
- return result;
- }
-
- public String getArg(final int pos) {
- return (pos < args.size()) ? args.get(pos) : null;
- }
-
- public String getNamedArg(final String name) {
- return namedArgs.get(name);
- }
-
- public String replaceWith(final String line, final String sub) {
- return line.substring(0, start) + sub + line.substring(end);
- }
-
-
-
-}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-public class WikiHeading {
- public final int depth;
- public final String name;
- public final String prefix;
-
- public WikiHeading(int depth, String name, String prefix) {
- this.depth = depth;
- this.name = name;
- this.prefix = prefix;
- }
-
- public static WikiHeading getHeading(String line) {
- line = line.trim();
- if (!line.startsWith("=")) {
- return null;
- }
- int i = 0;
- for (; i < line.length() && line.charAt(i) == '='; ++i) {
- }
- final String prefix = line.substring(0, i);
- if (!line.substring(i).endsWith(prefix) || line.charAt(line.length() - i - 1) == '=') {
- System.err.println("Invalid heading: " + line);
- return null;
- }
- return new WikiHeading(i, line.substring(i, line.length() - i).trim(), prefix);
- }
-
-}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class WikiLineReader {
-
- private final List<String> lineStack = new ArrayList<String>();
-
- private final String wikiText;
- private int lineStart = 0;
-
- private static final Pattern wikiLineEvent = Pattern.compile("$|\\{\\{|\\[\\[|\\}\\}|\\]\\]|<!--|<pre>|<math>", Pattern.MULTILINE);
-
- private static final Pattern whitespace = Pattern.compile("\\s+");
-
- public WikiLineReader(final String wikiText) {
- this.wikiText = wikiText;
- }
-
- public String readLine() {
- if (stuffedLine != null) {
- final String line = stuffedLine;
- stuffedLine = null;
- return line;
- }
- while (lineStart < wikiText.length() &&
- Character.isWhitespace(wikiText.charAt(lineStart)) &&
- wikiText.charAt(lineStart) != '\n') {
- ++lineStart;
- }
- if (lineStart >= wikiText.length()) {
- return null;
- }
-
- int lineEnd = lineStart;
- lineStack.clear();
- int firstNewline = -1;
- final Matcher matcher = wikiLineEvent.matcher(wikiText);
- while (lineEnd < wikiText.length()) {
- if (!matcher.find(lineEnd)) {
- lineEnd = wikiText.length();
- break;
- }
- lineEnd = matcher.end();
- if (lineEnd == wikiText.length()) {
- break;
- }
- if (matcher.group().equals("")) {
- assert (wikiText.charAt(matcher.start()) == '\n'): "Invalid: " + wikiText.substring(matcher.start());
- ++lineEnd;
- if (lineStack.size() == 0) {
- break;
- } else {
- if (firstNewline == -1) {
- firstNewline = matcher.end();
- }
- }
- }
-
- if (matcher.group().equals("[[") || matcher.group().equals("{{")) {
- lineStack.add(matcher.group());
- } else if (matcher.group().equals("}}") || matcher.group().equals("]]")) {
- if (lineStack.size() > 0) {
- final String removed = lineStack.remove(lineStack.size() - 1);
- if (removed.equals("{{") && !matcher.group().equals("}}")) {
- System.err.println("Unmatched {{ error: " + wikiText.substring(lineStart));
- }
- if (removed.equals("[[") && !matcher.group().equals("]]")) {
- System.err.println("Unmatched [[ error: " + wikiText.substring(lineStart));
- }
- } else {
- System.err.println("Pop too many error: " + wikiText.substring(lineStart).replaceAll("\n", "\\n"));
- }
- } else if (matcher.group().equals("<!--")) {
- lineEnd = safeIndexOf(wikiText, lineEnd, "-->", "\n");
- } else if (matcher.group().equals("<pre>")) {
- lineEnd = safeIndexOf(wikiText, lineEnd, "</pre>", "\n");
- } else if (matcher.group().equals("<math>")) {
- lineEnd = safeIndexOf(wikiText, lineEnd, "</math>", "\n");
- }
- }
- if (lineStack.size() > 0 && firstNewline != -1) {
- lineEnd = firstNewline + 1;
- }
- final String result = wikiText.substring(lineStart, lineEnd);
- lineStart = lineEnd;
- return cleanUpLine(result);
- }
-
-
- static int safeIndexOf(final String s, final int start, final String target, final String backup) {
- int close = s.indexOf(target, start);
- if (close != -1) {
- return close + target.length();
- }
- close = s.indexOf(backup, start);
- if (close != -1) {
- return close + backup.length();
- }
- return s.length();
- }
-
- public static String cleanUpLine(String line) {
- int pos;
- while ((pos = line.indexOf("<!--")) != -1) {
- int end = line.indexOf("-->");
- if (end != -1) {
- line = line.substring(0, pos) + line.substring(end + 3);
- }
- }
- final Matcher matcher = whitespace.matcher(line);
- line = matcher.replaceAll(" ");
- line = line.trim();
- return line;
- }
-
- String stuffedLine = null;
- public void stuffLine(final String line) {
- assert stuffedLine == null;
- stuffedLine = line;
- }
-
-
-
-}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-import java.util.Arrays;
-
-import junit.framework.TestCase;
-
-public class WikiLineReaderTest extends TestCase {
-
- public void testSimple() {
- final String wikiText =
- "Hi" + "\n" +
- "Hello thad you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
- "hi <!--" + "\n" +
- "multi-line" + "\n" +
- "# comment -->" + "\n" +
- "" + "\n" +
- "asdf\n" +
- "# {{template_in_list}}" + "\n" +
- "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list
- "here's [[some blah|some]] wikitext." + "\n" +
- "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
- "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
- "== Header 2 ==" + "\n" +
- "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
- "{{unterminated}" + "\n" +
- "[[unterminated]" + "\n" +
- "=== {{header-template}} ===" + "\n";
-
- final String[] expected = new String[] {
- "Hi",
- "Hello thad you're '''pretty''' cool '''''over''''' there.",
- "hi",
- "",
- "asdf",
- "# {{template_in_list}}",
- "[[wikitext]]:[[wikitext]]",
- "here's [[some blah|some]] wikitext.",
- "here's a {{template|this has an = sign|blah=2|blah2=3| blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text.",
- "== Header 2 ==",
- "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
- "{{unterminated}",
- "[[unterminated]",
- "=== {{header-template}} ===",
- };
-
- final WikiLineReader wikiLineReader = new WikiLineReader(wikiText);
- for (int i = 0; i < expected.length; ++i) {
- assertEquals(expected[i], wikiLineReader.readLine());
- }
- final String end = wikiLineReader.readLine();
- if (end != null) {
- System.out.println(end);
- }
- assertNull(end);
- }
-
- public void testWikiHeading() {
- assertNull(WikiHeading.getHeading(""));
- assertNull(WikiHeading.getHeading("="));
- assertNull(WikiHeading.getHeading("=="));
- assertNull(WikiHeading.getHeading("=a"));
- assertNull(WikiHeading.getHeading("=a=="));
- assertNull(WikiHeading.getHeading("===a=="));
- assertNull(WikiHeading.getHeading("===a===="));
- assertNull(WikiHeading.getHeading("a="));
- assertEquals("a", WikiHeading.getHeading("=a=").name);
- assertEquals(1, WikiHeading.getHeading("=a=").depth);
- assertEquals("aa", WikiHeading.getHeading("==aa==").name);
- assertEquals(2, WikiHeading.getHeading("==aa==").depth);
- }
-
-
- public void testWikiFunction() {
- assertNull(WikiFunction.getFunction(""));
- assertNull(WikiFunction.getFunction("[[asdf]]"));
- assertNull(WikiFunction.getFunction("asd [[asdf]]asdf "));
- assertEquals("a", WikiFunction.getFunction("{{a}}").name);
- assertEquals("a", WikiFunction.getFunction("{{a|b}}").name);
- assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name);
- assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name);
- assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name);
- assertEquals(Arrays.asList("a", "b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args);
- assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b"));
- assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer"));
- }
-
-}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import com.hughes.util.StringUtil;
-
-public class WikiParser {
-
- private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|(==+)\\s*$|<!--|<pre>", Pattern.MULTILINE);
- private static final Pattern listStart = Pattern.compile("^[*#;:]+");
- private static final Pattern pipeSplit = Pattern.compile("\\s*\\|\\s*");
- private static final Pattern whitespace = Pattern.compile("\\s+");
- private static final Pattern headerStart = Pattern.compile("^==+");
-
-
- static void parse(final String wikiText, final WikiCallback callback) {
-
- boolean boldOn = false;
- boolean italicOn = false;
- int insideHeaderDepth = -1;
- String lastListItem = null;
-
- final List<String> positionalArgs = new ArrayList<String>();
- final Map<String, String> namedArgs = new LinkedHashMap<String, String>();
-
- String rest = wikiText;
- while (rest.length() > 0) {
- final Matcher matcher = markup.matcher(rest);
- if (matcher.find()) {
- final int nextMarkupPos = matcher.start();
- if (nextMarkupPos != 0) {
- String text = rest.substring(0, nextMarkupPos);
- whitespace.matcher(text).replaceAll(" ");
- callback.onText(text);
- rest = rest.substring(nextMarkupPos);
- }
-
- if (rest.equals("")) {
- continue;
- } else if (rest.startsWith("\n")) {
- rest = rest.substring(1);
-
- if (insideHeaderDepth != -1) {
- throw new RuntimeException("barf");
- }
- if (lastListItem != null) {
- callback.onListItemEnd(lastListItem, null);
- }
-
- final Matcher headerMatcher = headerStart.matcher(rest);
- if (headerMatcher.find()) {
- lastListItem = null;
- insideHeaderDepth = headerMatcher.group().length();
- callback.onHeadingStart(insideHeaderDepth);
- rest = rest.substring(headerMatcher.group().length());
- continue;
- }
-
- final Matcher listStartMatcher = listStart.matcher(rest);
- if (listStartMatcher.find()) {
- lastListItem = listStartMatcher.group();
- callback.onListItemStart(lastListItem, null);
- rest = rest.substring(lastListItem.length());
- continue;
- } else if (lastListItem != null) {
- callback.onNewParagraph();
- lastListItem = null;
- }
-
- if (rest.startsWith("\n")) {
- callback.onNewParagraph();
- continue;
- }
- callback.onNewLine();
- } else if (rest.startsWith("'''")) {
- boldOn = !boldOn;
- callback.onFormatBold(boldOn);
- rest = rest.substring(3);
- } else if (rest.startsWith("''")) {
- italicOn = !italicOn;
- callback.onFormatItalic(italicOn);
- rest = rest.substring(2);
- } else if (rest.startsWith("{{")) {
- int end = StringUtil.nestedIndexOf(rest, 2, "{{", "}}");
- if (end == -1) {
- callback.onUnterminated("{{", rest);
- end = StringUtil.safeIndexOf(rest, "\n") - 2;
- }
- final String template = rest.substring(2, end).trim();
- final List<String> templateArray = new ArrayList<String>();
- contextSensitivePipeSplit(template, templateArray);
- positionalArgs.clear();
- namedArgs.clear();
- for (int i = 0; i < templateArray.size(); ++i) {
-
- int equalPos = -1;
- do {
- equalPos = templateArray.get(i).indexOf('=', equalPos + 1);
- } while (equalPos > 1 && templateArray.get(i).charAt(equalPos - 1) == ' ');
-
- if (equalPos == -1) {
- positionalArgs.add(templateArray.get(i));
- } else {
- namedArgs.put(templateArray.get(i).substring(0, equalPos), templateArray.get(i).substring(equalPos + 1));
- }
- }
- callback.onTemplate(positionalArgs, namedArgs);
- rest = rest.substring(end + 2);
- } else if (rest.startsWith("[[")) {
- int end = rest.indexOf("]]");
- if (end == -1) {
- callback.onUnterminated("[[", rest);
- end = StringUtil.safeIndexOf(rest, "\n") - 2;
- }
- final String wikiLink = rest.substring(2, end);
- final String[] args = pipeSplit.split(wikiLink);
- callback.onWikiLink(args);
- rest = rest.substring(end + 2);
- } else if (rest.startsWith("=")) {
- final String match = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
- if (insideHeaderDepth == -1) {
- } else {
- if (match.length() != insideHeaderDepth) {
- callback.onInvalidHeaderEnd(rest);
- return;
- }
- callback.onHeadingEnd(insideHeaderDepth);
- insideHeaderDepth = -1;
- }
- rest = rest.substring(match.length());
- } else if (rest.startsWith("<!--")) {
- int end = rest.indexOf("-->");
- if (end == -1) {
- callback.onUnterminated("<!--", rest);
- end = StringUtil.safeIndexOf(rest, "\n") - 3;
- }
- callback.onComment(rest.substring(4, end));
- rest = rest.substring(end + 3);
- } else if (rest.startsWith("<pre>")) {
- int end = rest.indexOf("</pre>");
- if (end == -1) {
- callback.onUnterminated("<pre>", rest);
- end = StringUtil.safeIndexOf(rest, "\n") - 6;
- }
- callback.onText(rest.substring(5, end));
- rest = rest.substring(end + 6);
- } else {
- throw new RuntimeException("barf: " + rest);
- }
- } // matcher.find()
- }
- }
-
- private static void contextSensitivePipeSplit(String template, final List<String> result) {
- int depth = 0;
- int lastStart = 0;
- for (int i = 1; i < template.length(); ) {
- if (template.charAt(i) == '|' && depth == 0) {
- final String s = template.substring(lastStart, i);
- result.add(s.trim());
- ++i;
- lastStart = i;
- } else if (template.startsWith("[[", i) || template.startsWith("{{", i)) {
- ++depth;
- i += 2;
- } else if (template.startsWith("]]", i) || template.startsWith("}}", i)) {
- --depth;
- if (depth < 0) {
- throw new RuntimeException("too many closings: " + template);
- }
- i += 2;
- } else {
- ++i;
- }
- }
- result.add(template.substring(lastStart).trim());
- }
-
- // ------------------------------------------------------------------------
-
- public static String simpleParse(final String wikiText) {
- final StringBuilderCallback callback = new StringBuilderCallback();
- parse(wikiText, callback);
- return callback.builder.toString();
- }
-
- static final class StringBuilderCallback implements WikiCallback {
-
- final StringBuilder builder = new StringBuilder();
-
- @Override
- public void onComment(String text) {
- }
-
- @Override
- public void onFormatBold(boolean boldOn) {
- }
-
- @Override
- public void onFormatItalic(boolean italicOn) {
- }
-
- @Override
- public void onWikiLink(String[] args) {
- builder.append(args[args.length - 1]);
- }
-
- @Override
- public void onTemplate(List<String> positionalArgs,
- Map<String, String> namedArgs) {
- builder.append("{{").append(positionalArgs).append(namedArgs).append("}}");
- }
-
- @Override
- public void onText(String text) {
- builder.append(text);
- }
-
- @Override
- public void onHeadingStart(int depth) {
- }
-
- @Override
- public void onHeadingEnd(int depth) {
- }
-
- @Override
- public void onNewLine() {
- }
-
- @Override
- public void onNewParagraph() {
- }
-
- @Override
- public void onListItemStart(String header, int[] section) {
- }
-
- @Override
- public void onListItemEnd(String header, int[] section) {
- }
-
- @Override
- public void onUnterminated(String start, String rest) {
- System.err.printf("onUnterminated: %s, %s\n", start, rest);
- }
-
- @Override
- public void onInvalidHeaderEnd(String rest) {
- throw new RuntimeException(rest);
- }
-
- }
-
-
-}
+++ /dev/null
-package com.hughes.android.dictionary.parser;
-
-import java.util.List;
-import java.util.Map;
-
-import junit.framework.TestCase;
-
-public class WikiParserTest extends TestCase {
-
- public void testSimple() {
- final String text =
- "Hi" + "\n" +
- "Hello ''thad'' you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
- "hi <!--" + "\n" +
- "multi-line" + "\n" +
- "# comment -->" + "\n" +
- "" + "\n" +
- "asdf\n" +
- "# li" + "\n" +
- "# li2" + "\n" +
- "# {{template_in_list}}" + "\n" +
- "## li2.2" + "\n" +
- "Hi again." + "\n" +
- "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list
- "here's [[some blah|some]] wikitext." + "\n" +
- "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
- "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
- "== Header 2 ==" + "\n" +
- "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
- "{{unterminated}" + "\n" +
-// "==== Header 4 ====" + "\n" +
-// "===== Header 5 =====" + "\n" +
- "=== {{header-template}} ===" + "\n";
-
- final String expected = "Hi Hello <i>thad</i> you're \n" +
- "comment: not \n" +
- " <b>pretty</b> cool <b><i>over</b></i> there. hi \n" +
- "comment:\n" +
- "multi-line\n" +
- "# comment \n" +
- "\n" +
- "\n" +
- " asdf\n" +
- "LIST (#) li\n" +
- "LIST (#) li2\n" +
- "LIST (#) \n" +
- "template:[template_in_list]{}\n" +
- "\n" +
- "LIST (##) li2.2\n" +
- "\n" +
- " Hi again. [[wikitext]]:[[wikitext]] here's [[some]] wikitext. here's a \n" +
- "template:[template, this has an = sign]{blah=2, blah2=3, blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}\n" +
- " and some more text.\n" +
- "HEADER Header 2 \n" +
- " \n" +
- "template:[some-func]{blah={{nested-func|n2}}, blah2=asdf}\n" +
- " \n" +
- "template:[unterminate]{}" + "\n" +
- "\n" +
- "HEADER \n" +
- "template:[header-template]{}\n" +
- " \n" +
- " ";
- final PrintWikiCallback callback = new PrintWikiCallback();
- WikiParser.parse(text, callback);
- assertEquals(expected, callback.builder.toString());
-
- }
-
-
- static final class PrintWikiCallback implements WikiCallback {
- final StringBuilder builder = new StringBuilder();
-
- @Override
- public void onComment(String text) {
- builder.append("\ncomment:").append(text).append("\n");
- }
-
- @Override
- public void onFormatBold(boolean boldOn) {
- builder.append(boldOn ? "<b>" : "</b>");
- }
-
- @Override
- public void onFormatItalic(boolean italicOn) {
- builder.append(italicOn ? "<i>" : "</i>");
- }
-
- @Override
- public void onWikiLink(String[] args) {
- builder.append("[[").append(args[args.length - 1]).append("]]");
- }
-
- @Override
- public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
- builder.append("\ntemplate:").append(positionalArgs).append(namedArgs).append("\n");
- }
-
- @Override
- public void onText(String text) {
- builder.append(text);
- }
-
- @Override
- public void onHeadingStart(int depth) {
- builder.append("\nHEADER");
- for (int i = 0; i < depth; ++i) {
- builder.append(" ");
- }
- }
-
- @Override
- public void onHeadingEnd(int depth) {
- builder.append("\n");
- }
-
- @Override
- public void onNewLine() {
- builder.append(" ");
- }
-
- @Override
- public void onNewParagraph() {
- builder.append("\n\n");
- }
-
- @Override
- public void onListItemStart(String header, int[] section) {
- builder.append("\n").append("LIST (").append(header).append(")");
- }
-
- @Override
- public void onListItemEnd(String header, int[] section) {
- //builder.append("\n");
- }
-
- @Override
- public void onUnterminated(String start, String rest) {
- //throw new RuntimeException("bad");
- }
-
- @Override
- public void onInvalidHeaderEnd(String rest) {
- throw new RuntimeException("bad");
- }
-
- }
-
-
-
-}
package com.hughes.android.dictionary.parser;
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class WikiTokenizer {
//private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
- private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|<!--|''|$)", Pattern.MULTILINE);
+ private static final Pattern wikiTokenEvent = Pattern.compile("(" +
+ "\\{\\{|\\}\\}|" +
+ "\\[\\[|\\]\\]|" +
+ "\\||" + // Need the | because we might have to find unescaped pipes
+ "=|" + // Need the = because we might have to find unescaped =
+ "<!--|" +
+ "''|" +
+ "$)", Pattern.MULTILINE);
private static final String listChars = "*#:;";
- final String wikiText;
- final Matcher matcher;
+ final String wikiText;
+ final Matcher matcher;
- boolean justReturnedNewline = true;
- int end = 0;
- int start = -1;
+ boolean justReturnedNewline = true;
+ int lastLineStart = 0;
+ int end = 0;
+ int start = -1;
+
+ final List<String> errors = new ArrayList<String>();
+ final List<String> tokenStack = new ArrayList<String>();
+
+
+ // Per-token state. Everything below is reset by clear(), which nextToken() calls first.
+ // Raw title text of the current heading token; null when the token is not a heading.
+ private String headingWikiText;
+ // Length of the leading run of '=' on the current heading; -1 when not a heading.
+ private int headingDepth;
+ // Offset just past the list-marker characters of a list-item token; -1 when not a list item.
+ private int listPrefixEnd;
+ // Classification flags for the current token; nextToken() sets at most the matching one.
+ private boolean isPlainText;
+ private boolean isMarkup;
+ private boolean isComment;
+ private boolean isFunction;
+ private boolean isWikiLink;
+ // Offset of the first argument boundary found by escapedFindEnd() outside any nested
+ // [[..]]/{{..}}: the first such '|', or the closing "}}" of a function without pipes. -1 if none.
+ private int firstUnescapedPipePos;
+
+ // Offset of the most recent such argument boundary; -1 if none.
+ private int lastUnescapedPipePos;
+ // Offset of the most recent '=' seen outside any nested [[..]]/{{..}}; -1 if none.
+ private int lastUnescapedEqualsPos;
+ // Arguments of the current {{function}} token, filled in by addFunctionArg().
+ private final List<String> positionArgs = new ArrayList<String>();
+ private final Map<String,String> namedArgs = new LinkedHashMap<String,String>();
+
- public String header;
- public int headerDepth;
-
- final List<String> tokenStack = new ArrayList<String>();
-
public WikiTokenizer(final String wikiText) {
this.wikiText = wikiText;
this.matcher = wikiTokenEvent.matcher(wikiText);
}
private void clear() {
- header = null;
- headerDepth = 0;
+ errors.clear();
tokenStack.clear();
+
+ headingWikiText = null;
+ headingDepth = -1;
+ listPrefixEnd = -1;
+ isPlainText = false;
+ isMarkup = false;
+ isComment = false;
+ isFunction = false;
+ isWikiLink = false;
+
+ firstUnescapedPipePos = -1;
+ lastUnescapedPipePos = -1;
+ lastUnescapedEqualsPos = -1;
+ positionArgs.clear();
+ namedArgs.clear();
+ }
+
+ /**
+ * Returns the {@code justReturnedNewline} flag. The flag starts out true, is set again by
+ * returnToLineStart(), and is cleared when nextToken() begins its line-start handling.
+ * NOTE(review): presumably nextToken() also sets it after returning a newline token; that
+ * code is not visible in this hunk -- confirm.
+ */
+ public boolean isNewline() {
+ return justReturnedNewline;
+ }
+
+ /**
+ * Rewinds the tokenizer to {@code lastLineStart}, the offset nextToken() recorded the last
+ * time it was entered with {@code justReturnedNewline} set. The flag is set again here, so
+ * the next nextToken() call re-applies its line-start handling (headings, list items).
+ */
+ public void returnToLineStart() {
+ end = start = lastLineStart;
+ justReturnedNewline = true;
+ }
+
+ /** Returns true when the current token is a heading, i.e. nextToken() stored a heading title. */
+ public boolean isHeading() {
+ return headingWikiText != null;
+ }
+
+ /**
+ * Returns the raw (untrimmed) wiki text of the heading title, without its leading '=' run.
+ * Only meaningful when isHeading() is true; this is checked with an assert only.
+ */
+ public String headingWikiText() {
+ assert isHeading();
+ return headingWikiText;
+ }
+
+ /**
+ * Returns the number of '=' characters that open the current heading.
+ * Only meaningful when isHeading() is true (assert-checked); clear() resets the value to -1.
+ */
+ public int headingDepth() {
+ assert isHeading();
+ return headingDepth;
+ }
+
+ /** Returns true when the current token is a quote-markup token: {@code '''} or {@code ''}. */
+ public boolean isMarkup() {
+ return isMarkup;
+ }
+
+ /** Returns true when the current token starts with {@code <!--}, i.e. is a comment token. */
+ public boolean isComment() {
+ return isComment;
+ }
+
+ /** Returns true when the current token is a list item (a line starting with one of "*#:;"). */
+ public boolean isListItem() {
+ return listPrefixEnd != -1;
+ }
+
+ /**
+ * Returns the run of list-marker characters that starts the current list item.
+ * Only meaningful when isListItem() is true (assert-checked).
+ */
+ public String listItemPrefix() {
+ assert isListItem();
+ return wikiText.substring(start, listPrefixEnd);
+ }
+
+ /**
+ * Returns the wiki text of the current list item after its marker prefix, up to the token end.
+ * Only meaningful when isListItem() is true (assert-checked).
+ */
+ public String listItemWikiText() {
+ assert isListItem();
+ return wikiText.substring(listPrefixEnd, end);
+ }
+
+ /**
+ * Returns true when the current token is a "{{...}}" function/template; nextToken() sets this
+ * only if no errors were recorded while scanning for the closing "}}".
+ */
+ public boolean isFunction() {
+ return isFunction;
+ }
+
+ /**
+ * Returns the function name: the text between the opening "{{" and the first recorded
+ * argument boundary. If no boundary was recorded, everything between "{{" and the token's
+ * last two characters is returned. Only meaningful when isFunction() is true (assert-checked).
+ */
+ public String functionName() {
+ assert isFunction();
+ // "{{.."
+ if (firstUnescapedPipePos != -1) {
+ return wikiText.substring(start + 2, firstUnescapedPipePos);
+ }
+ return wikiText.substring(start + 2, end - 2);
+ }
+
+ /**
+ * Returns the positional arguments of the current function token. This is the tokenizer's
+ * own list, not a copy: clear() empties it at the start of the next nextToken() call.
+ */
+ public List<String> functionPositionArgs() {
+ return positionArgs;
+ }
+
+ /**
+ * Returns the key=value arguments of the current function token, in insertion order. This is
+ * the tokenizer's own map, not a copy: clear() empties it on the next nextToken() call.
+ */
+ public Map<String, String> functionNamedArgs() {
+ return namedArgs;
+ }
+
+ /**
+ * Returns true when the current token is plain text: either a lone '|' or '=' character, or
+ * a run of text ending at the next position matched by the token-event pattern.
+ */
+ public boolean isPlainText() {
+ return isPlainText;
+ }
+
+ /**
+ * Returns true when the current token is a "[[...]]" wiki link; nextToken() sets this only
+ * if no errors were recorded while scanning for the closing "]]".
+ */
+ public boolean isWikiLink() {
+ return isWikiLink;
+ }
+
+ /**
+ * Returns the display text of the current wiki link: the text after the last pipe that is
+ * not nested inside another [[..]]/{{..}}, or the whole link body when there is no such pipe.
+ * Only meaningful when isWikiLink() is true (assert-checked).
+ */
+ public String wikiLinkText() {
+ assert isWikiLink();
+ // "[[.."
+ if (lastUnescapedPipePos != -1) {
+ return wikiText.substring(lastUnescapedPipePos + 1, end - 2);
+ }
+ return wikiText.substring(start + 2, end - 2);
 }
+ /**
+ * Returns the link destination: the text between "[[" and the first pipe that is not nested
+ * inside another [[..]]/{{..}}. Returns null when the link has no such pipe.
+ * Only meaningful when isWikiLink() is true (assert-checked).
+ */
+ public String wikiLinkDest() {
+ assert isWikiLink();
+ // "[[.."
+ if (firstUnescapedPipePos != -1) {
+ return wikiText.substring(start + 2, firstUnescapedPipePos);
+ }
+ return null;
+ }
+
+ /** Returns true when the wiki text, read from the start of the current token, begins with {@code prefix}. */
+ public boolean remainderStartsWith(final String prefix) {
+ return wikiText.startsWith(prefix, start);
+ }
+
+ /**
+ * Consumes tokens up to the end of the current line and widens the current token to cover
+ * them: nextToken() is called until it returns null or isNewline() reports true, then
+ * {@code start} is restored to the value it had on entry.
+ * The classification flags afterwards reflect only the last nextToken() call.
+ */
+ public void nextLine() {
+ final int oldStart = start;
+ while(nextToken() != null && !isNewline()) {}
+ if (isNewline()) {
+ // NOTE(review): presumably steps back over the newline so token() excludes it -- confirm.
+ --end;
+ }
+ start = oldStart;
+ }
+
public WikiTokenizer nextToken() {
this.clear();
start = end;
+ if (justReturnedNewline) {
+ lastLineStart = start;
+ }
+
+ try {
+
final int len = wikiText.length();
if (start >= len) {
return null;
return this;
}
- if (justReturnedNewline) {
+ if (justReturnedNewline) {
+ justReturnedNewline = false;
+
final char firstChar = wikiText.charAt(end);
if (firstChar == '=') {
final int headerStart = end;
+ // Skip ===...
while (++end < len && wikiText.charAt(end) == '=') {}
final int headerTitleStart = end;
- while (++end < len && wikiText.charAt(end) != '=' && wikiText.charAt(end) != '\n') {}
+ headingDepth = headerTitleStart - headerStart;
+ // Skip non-=...
+ if (end < len) {
+ final int nextNewline = safeIndexOf(wikiText, end, "\n", "\n");
+ final int closingEquals = escapedFindEnd(end, "=");
+ if (wikiText.charAt(closingEquals - 1) == '=') {
+ end = closingEquals - 1;
+ } else {
+ end = nextNewline;
+ }
+ }
final int headerTitleEnd = end;
- while (++end < len && wikiText.charAt(end) == '=') {}
+ headingWikiText = wikiText.substring(headerTitleStart, headerTitleEnd);
+ // Skip ===...
+ while (end < len && ++end < len && wikiText.charAt(end) == '=') {}
final int headerEnd = end;
-
+ if (headerEnd - headerTitleEnd != headingDepth) {
+ errors.add("Mismatched header depth: " + token());
+ }
return this;
}
if (listChars.indexOf(firstChar) != -1) {
while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
- end = escapedFind(start, "\n");
+ listPrefixEnd = end;
+ end = escapedFindEnd(start, "\n");
return this;
}
}
- justReturnedNewline = false;
if (wikiText.startsWith("'''", start)) {
+ isMarkup = true;
end = start + 3;
return this;
}
if (wikiText.startsWith("''", start)) {
+ isMarkup = true;
end = start + 2;
return this;
}
if (wikiText.startsWith("[[", start)) {
- end = escapedFind(start + 2, "]]");
+ end = escapedFindEnd(start + 2, "]]");
+ isWikiLink = errors.isEmpty();
return this;
}
- if (wikiText.startsWith("{{", start)) {
- end = escapedFind(start + 2, "}}");
+ if (wikiText.startsWith("{{", start)) {
+ end = escapedFindEnd(start + 2, "}}");
+ isFunction = errors.isEmpty();
return this;
}
}
if (wikiText.startsWith("<!--", start)) {
+ isComment = true;
end = safeIndexOf(wikiText, start, "-->", "\n");
return this;
}
if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
- System.err.println("Close without open!");
+ errors.add("Close without open!");
end += 2;
return this;
}
+ if (wikiText.charAt(start) == '|' || wikiText.charAt(start) == '=') {
+ isPlainText = true;
+ ++end;
+ return this;
+ }
+
if (this.matcher.find(start)) {
end = this.matcher.start(1);
+ isPlainText = true;
if (end == start) {
- System.err.println(this.matcher.group());
+ errors.add("Empty group: " + this.matcher.group());
assert false;
}
return this;
end = wikiText.length();
return this;
+ } finally {
+ if (!errors.isEmpty()) {
+ System.err.println("Errors: " + errors + ", token=" + token());
+ }
+ }
+
}
public String token() {
- return wikiText.substring(start, end);
+ final String token = wikiText.substring(start, end);
+ assert token.equals("\n") || !token.endsWith("\n") : token;
+ return token;
}
- private int escapedFind(final int start, final String toFind) {
+ private int escapedFindEnd(final int start, final String toFind) {
assert tokenStack.isEmpty();
+ final boolean insideFunction = toFind.equals("}}");
+
int end = start;
while (end < wikiText.length()) {
if (matcher.find(end)) {
final String matchText = matcher.group();
final int matchStart = matcher.start();
+ assert matcher.end() > end || matchText.length() == 0: "Group=" + matcher.group();
if (matchText.length() == 0) {
assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
if (tokenStack.isEmpty() && toFind.equals("\n")) {
++end;
} else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
// The normal return....
+ if (insideFunction) {
+ addFunctionArg(insideFunction, matchStart);
+ }
return matcher.end();
} else if (matchText.equals("[[") || matchText.equals("{{")) {
tokenStack.add(matchText);
if (tokenStack.size() > 0) {
final String removed = tokenStack.remove(tokenStack.size() - 1);
if (removed.equals("{{") && !matcher.group().equals("}}")) {
- System.err.println("Unmatched {{ error: " + wikiText.substring(start));
+ errors.add("Unmatched {{ error: " + wikiText.substring(start));
return safeIndexOf(wikiText, start, "\n", "\n");
} else if (removed.equals("[[") && !matcher.group().equals("]]")) {
- System.err.println("Unmatched [[ error: " + wikiText.substring(start));
+ errors.add("Unmatched [[ error: " + wikiText.substring(start));
return safeIndexOf(wikiText, start, "\n", "\n");
}
} else {
- System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n"));
+ errors.add("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\\\n"));
// If we were looking for a newline
return safeIndexOf(wikiText, start, "\n", "\n");
}
+ } else if (matchText.equals("|")) {
+ if (tokenStack.isEmpty()) {
+ addFunctionArg(insideFunction, matchStart);
+ }
+ } else if (matchText.equals("=")) {
+ if (tokenStack.isEmpty()) {
+ lastUnescapedEqualsPos = matchStart;
+ }
+ // Do nothing. These can match spuriously, and if it's not the thing
+ // we're looking for, keep on going.
} else if (matchText.equals("<!--")) {
end = wikiText.indexOf("-->");
if (end == -1) {
- System.err.println("Unmatched <!-- error: " + wikiText.substring(start));
+ errors.add("Unmatched <!-- error: " + wikiText.substring(start));
+ return safeIndexOf(wikiText, start, "\n", "\n");
}
+ } else if (matchText.equals("''")) {
+ // Don't care.
} else {
assert false : "Match text='" + matchText + "'";
throw new IllegalStateException();
}
} else {
// Hmmm, we didn't find the closing symbol we were looking for...
- System.err.println("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+ errors.add("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
return safeIndexOf(wikiText, start, "\n", "\n");
}
- // Inside the while loop.
+ // Inside the while loop. Just go forward.
end = Math.max(end, matcher.end());
}
return end;
}
+ /**
+ * Records an argument boundary found outside any nested [[..]]/{{..}}. escapedFindEnd()
+ * calls this for each such '|' and, when scanning a function, for its closing "}}".
+ *
+ * The first boundary only records positions; the text before it is the function name or
+ * link destination. Each later boundary inside a function stores the text since the previous
+ * boundary: as a named argument if a top-level '=' was seen after the previous boundary,
+ * otherwise as a positional argument. For wiki links only the positions are tracked.
+ *
+ * @param insideFunction true when the enclosing token is a "{{...}}" function
+ * @param matchStart offset in wikiText of the boundary being recorded
+ */
+ private void addFunctionArg(final boolean insideFunction, final int matchStart) {
+ if (firstUnescapedPipePos == -1) {
+ firstUnescapedPipePos = lastUnescapedPipePos = matchStart;
+ } else if (insideFunction) {
+ // An '=' located after the previous boundary splits this argument into key and value.
+ if (lastUnescapedEqualsPos > lastUnescapedPipePos) {
+ final String key = wikiText.substring(lastUnescapedPipePos + 1, lastUnescapedEqualsPos);
+ final String value = wikiText.substring(lastUnescapedEqualsPos + 1, matchStart);
+ namedArgs.put(key, value);
+ } else {
+ final String value = wikiText.substring(lastUnescapedPipePos + 1, matchStart);
+ positionArgs.add(value);
+ }
+ }
+ lastUnescapedPipePos = matchStart;
+ }
+
static int safeIndexOf(final String s, final int start, final String target, final String backup) {
int close = s.indexOf(target, start);
if (close != -1) {
- return close + target.length();
+ // Don't step over a \n.
+ return close + (target.equals("\n") ? 0 : target.length());
}
close = s.indexOf(backup, start);
if (close != -1) {
- return close + backup.length();
+ return close + (backup.equals("\n") ? 0 : backup.length());
}
return s.length();
}
+ /**
+ * Flattens wiki text into a string by tokenizing it and keeping, in order: plain-text tokens
+ * verbatim, wiki links as their display text, a "\n" for tokens where isNewline() is true,
+ * and function tokens verbatim (not expanded). Tokens matching none of these cases are
+ * dropped.
+ *
+ * @param sense the wiki text to flatten
+ * @return the flattened text
+ */
+ public static String toPlainText(String sense) {
+ final WikiTokenizer wikiTokenizer = new WikiTokenizer(sense);
+ final StringBuilder builder = new StringBuilder();
+ while (wikiTokenizer.nextToken() != null) {
+ if (wikiTokenizer.isPlainText()) {
+ builder.append(wikiTokenizer.token());
+ } else if (wikiTokenizer.isWikiLink()) {
+ builder.append(wikiTokenizer.wikiLinkText());
+ } else if (wikiTokenizer.isNewline()) {
+ builder.append("\n");
+ } else if (wikiTokenizer.isFunction()) {
+ builder.append(wikiTokenizer.token());
+ }
+ }
+ return builder.toString();
+ }
+
}
import junit.framework.TestCase;
public class WikiTokenizerTest extends TestCase {
+
+ // Checks tokenizing of [[...]] links: the whole link is one token, and
+ // wikiLinkText()/wikiLinkDest() expose its display text and destination.
+ public void testWikiLink() {
+ String wikiText;
+
+ // Link without a pipe: the text is the link body and there is no destination.
+ wikiText = "[[abc]]";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+ assertEquals(null, new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+
+ // One pipe: destination before it, display text after it.
+ wikiText = "[[abc|def]]";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
+ assertEquals("def", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+
+ // Several pipes: the text is what follows the last top-level pipe; the pipe
+ // inside the nested {{...}} does not split it.
+ wikiText = "[[abc|def|ghi{{a|=2}}p]]";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isWikiLink());
+ assertEquals("ghi{{a|=2}}p", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkDest());
+
+ // Two adjacent links come back as two separate tokens.
+ wikiText = "[[abc]][[def]]";
+ assertEquals("[[abc]]", new WikiTokenizer(wikiText).nextToken().token());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().wikiLinkText());
+ assertEquals("def", new WikiTokenizer(wikiText).nextToken().nextToken().wikiLinkText());
+
+ }
+
+ // Checks that a line starting with "* " is returned as a single token,
+ // with the quote markup inside it left untouched.
+ public void testWikiList() {
+ String wikiText;
+
+ wikiText = "* This is ''bold''' asdf.";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ }
+
+ // Checks parsing of {{...}} functions: the function name, the positional
+ // arguments and the named (key=value) arguments.
+ public void testFunction() {
+ String wikiText;
+
+ // Name only: both argument collections are empty.
+ wikiText = "{{abc}}";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+ assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionPositionArgs().size());
+ assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+
+ // A single positional argument.
+ wikiText = "{{abc|def}}";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+ assertEquals(Arrays.asList("def"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+ assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+
+ // A pipe inside a nested [[...]] must not split the argument.
+ wikiText = "{{abc|d[[|]]ef|ghi}}";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+ assertEquals(Arrays.asList("d[[|]]ef", "ghi"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+ assertEquals(0, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+
+ // Mixed named and positional arguments; nested functions are kept verbatim
+ // as argument values, and their own pipes and '=' are not interpreted.
+ wikiText = "{{abc|arg1=101|ghi|arg2=202|arg3={{n1|n2=7|n3}}|{{d}}}}";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isFunction());
+ assertEquals("abc", new WikiTokenizer(wikiText).nextToken().functionName());
+ assertEquals(Arrays.asList("ghi", "{{d}}"), new WikiTokenizer(wikiText).nextToken().functionPositionArgs());
+ assertEquals(3, new WikiTokenizer(wikiText).nextToken().functionNamedArgs().size());
+ assertEquals("101", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg1"));
+ assertEquals("202", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg2"));
+ assertEquals("{{n1|n2=7|n3}}", new WikiTokenizer(wikiText).nextToken().functionNamedArgs().get("arg3"));
+
+
+ }
+ // Checks returnToLineStart(): after it is called, the next token is again
+ // the first token of the current line. This is asserted after the line's
+ // content token, after its trailing newline token, and at end of input.
+ public void testReturn() {
+ String wikiText;
+
+ wikiText = "hello\n=Heading=\nhello2";
+
+ final WikiTokenizer tokenizer = new WikiTokenizer(wikiText);
+
+ // First line: rewind after the text token, then again after its newline.
+ assertEquals("hello", tokenizer.nextToken().token());
+ tokenizer.returnToLineStart();
+ assertEquals("hello", tokenizer.nextToken().token());
+ assertEquals("\n", tokenizer.nextToken().token());
+ tokenizer.returnToLineStart();
+ assertEquals("hello", tokenizer.nextToken().token());
+ assertEquals("\n", tokenizer.nextToken().token());
+
+ // Second line (a heading): same two rewinds.
+ assertEquals("=Heading=", tokenizer.nextToken().token());
+ tokenizer.returnToLineStart();
+ assertEquals("=Heading=", tokenizer.nextToken().token());
+ assertEquals("\n", tokenizer.nextToken().token());
+ tokenizer.returnToLineStart();
+ assertEquals("=Heading=", tokenizer.nextToken().token());
+ assertEquals("\n", tokenizer.nextToken().token());
+
+ // Last line has no trailing newline: nextToken() returns null at end of
+ // input, and rewinding from there still yields the last line again.
+ assertEquals("hello2", tokenizer.nextToken().token());
+ assertEquals(null, tokenizer.nextToken());
+ tokenizer.returnToLineStart();
+ assertEquals("hello2", tokenizer.nextToken().token());
+ assertEquals(null, tokenizer.nextToken());
+
+
+ }
+
+ // Checks heading tokens: depth, inner wiki text, and the number of entries
+ // in the tokenizer's errors list for malformed headings.
+ public void testWikiHeading() {
+ String wikiText;
+
+ // Only '=' characters: a depth-2 heading with empty text and one error.
+ wikiText = "==";
+ assertEquals("==", new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+ assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
+ assertEquals("", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+ assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+
+ // Missing closing '=': still a heading, with one error.
+ wikiText = "=a";
+ assertEquals("=a", new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+ assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
+ assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+ assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+ // Unbalanced '=' counts: the depth follows the opening run, with one error.
+ wikiText = "=a==";
+ assertEquals("=a==", new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+ assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
+ assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+ assertEquals(1, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+ // A line that does not start with '=' is not a heading; the first token is "a".
+ wikiText = "a=";
+ assertEquals("a", new WikiTokenizer(wikiText).nextToken().token());
+ assertFalse(new WikiTokenizer(wikiText).nextToken().isHeading());
+
+ // Well-formed depth-1 heading: no errors.
+ wikiText = "=a=";
+ assertEquals("=a=", new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+ assertEquals(1, new WikiTokenizer(wikiText).nextToken().headingDepth());
+ assertEquals("a", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+ assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
+
+ // '=' and '|' inside nested [[...]] / {{...}} stay part of the heading
+ // text and do not produce errors.
+ wikiText = "==aa[[|=]] {{|={{=}} }}==";
+ assertEquals(wikiText, new WikiTokenizer(wikiText).nextToken().token());
+ assertTrue(new WikiTokenizer(wikiText).nextToken().isHeading());
+ assertEquals(2, new WikiTokenizer(wikiText).nextToken().headingDepth());
+ assertEquals("aa[[|=]] {{|={{=}} }}", new WikiTokenizer(wikiText).nextToken().headingWikiText());
+ assertEquals(0, new WikiTokenizer(wikiText).nextToken().errors.size());
+ }
+
+
+
public void testSimple() {
final String wikiText =
"Hi" + "\n" +
- "Hello thad you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
+ "Hello =thad| you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
"hi <!--" + "\n" +
"multi-line" + "\n" +
"# comment -->" + "\n" +
final String[] expectedTokens = new String[] {
"Hi",
"\n",
- "Hello thad you're ",
+ "Hello ",
+ "=",
+ "thad",
+ "|",
+ " you're ",
"<!-- not -->",
" ",
"'''",
"\n",
"{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
"\n",
- "{{mismatched]]\n",
- "[[mismatched}}\n",
+ "{{mismatched]]",
+ "\n",
+ "[[mismatched}}",
+ "\n",
"{extraterminated",
"}}",
"\n",
assertEquals(Arrays.asList(expectedTokens), actualTokens);
}
- public void testWikiHeading() {
- assertNull(WikiHeading.getHeading(""));
- assertNull(WikiHeading.getHeading("="));
- assertNull(WikiHeading.getHeading("=="));
- assertNull(WikiHeading.getHeading("=a"));
- assertNull(WikiHeading.getHeading("=a=="));
- assertNull(WikiHeading.getHeading("===a=="));
- assertNull(WikiHeading.getHeading("===a===="));
- assertNull(WikiHeading.getHeading("a="));
- assertEquals("a", WikiHeading.getHeading("=a=").name);
- assertEquals(1, WikiHeading.getHeading("=a=").depth);
- assertEquals("aa", WikiHeading.getHeading("==aa==").name);
- assertEquals(2, WikiHeading.getHeading("==aa==").depth);
- }
-
-
- public void testWikiFunction() {
- assertNull(WikiFunction.getFunction(""));
- assertNull(WikiFunction.getFunction("[[asdf]]"));
- assertNull(WikiFunction.getFunction("asd [[asdf]]asdf "));
- assertEquals("a", WikiFunction.getFunction("{{a}}").name);
- assertEquals("a", WikiFunction.getFunction("{{a|b}}").name);
- assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name);
- assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name);
- assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name);
- assertEquals(Arrays.asList("b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args);
- assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b"));
- assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer"));
- }
-
}