if (enIndex < 0 || enIndex >= 2) {
fatalError("Must be 1 or 2: " + prefix + "EnIndex");
}
- new EnWiktionaryXmlParser(dictionaryBuilder, langPattern, langCodePattern, enIndex).parse(file, Integer.parseInt(pageLimit));
+ new EnWiktionaryXmlParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex),
+ langPattern, langCodePattern, enIndex != 0).parse(file, Integer.parseInt(pageLimit));
} else {
fatalError("Invalid or missing input format: " + inputFormat);
}
public static void main(final String[] args) throws Exception {
- DictionaryBuilder.main(new String[] {
- "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic",
- "--lang1=DE",
- "--lang2=EN",
- "--dictInfo=@dictInputs/de-en_chemnitz.info",
-
- "--input1=dictInputs/de-en_chemnitz.txt",
- "--input1Name=chemnitz",
- "--input1Charset=UTF8",
- "--input1Format=chemnitz",
- });
-
Lang[] langs1 = new Lang[] {
new Lang("^English$", "EN"),
- new Lang("^German$", "DE"),
+ //new Lang("^German$", "DE"),
};
Lang[] langs2 = new Lang[] {
new Lang("^Italian$", "IT"),
} // langs2
} // langs1
+ DictionaryBuilder.main(new String[] {
+ "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic",
+ "--lang1=DE",
+ "--lang2=EN",
+ "--dictInfo=@dictInputs/de-en_chemnitz.info",
+
+ "--input1=dictInputs/de-en_chemnitz.txt",
+ "--input1Name=chemnitz",
+ "--input1Charset=UTF8",
+ "--input1Format=chemnitz",
+ });
+
DictionaryBuilder.main(new String[] {
"--dictOut=dictOutputs/de-en_all.quickdic",
"--lang1=DE",
"--lang2=EN",
"--dictInfo=SomeWikiData",
+ /*
"--input3=wikiSplit/english.data",
- "--input3Name=enwiktionary.italian",
+ "--input3Name=enwiktionary.english",
"--input3Format=enwiktionary",
"--input3LangPattern=Italian",
"--input3LangCodePattern=it",
"--input3EnIndex=2",
"--input3PageLimit=1000",
-
+*/
"--input4=wikiSplit/italian.data",
"--input4Name=enwiktionary.italian",
"--input4Format=enwiktionary",
import java.util.TreeMap;
import com.hughes.android.dictionary.engine.Index.IndexEntry;
+import com.hughes.android.dictionary.parser.DictFileParser;
public class IndexBuilder {
// System.out.println("TOKEN: " + tokenData.token);
for (final Map.Entry<EntryTypeName, List<IndexedEntry>> typeToEntry : tokenData.typeToEntries.entrySet()) {
for (final IndexedEntry entryData : typeToEntry.getValue()) {
+ if (entryData.index() == -1) {
+ entryData.addToDictionary(dictionaryBuilder.dictionary);
+ assert entryData.index() >= 0;
+ }
if (tokenEntryDatas.add(entryData)) {
rows.add(new PairEntry.Row(entryData.index(), rows.size(), index));
++numRows;
return entries;
}
- public void addEntryWithTokens(final IndexedEntry entryData, final Set<String> tokens,
+ public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set<String> tokens,
final EntryTypeName entryTypeName) {
for (final String token : tokens) {
- getOrCreateEntries(token, entryTypeName).add(entryData);
+ getOrCreateEntries(token, entryTypeName).add(indexedEntry);
}
}
-
+ public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
+ final EntryTypeName singleTokenEntryTypeName, final EntryTypeName multiTokenEntryTypeName) {
+ final Set<String> tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR);
+ addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? singleTokenEntryTypeName : multiTokenEntryTypeName);
+ }
+
/**
 * Convenience overload: indexes untokenizedString with the same entry type
 * whether it tokenizes to one token or to many.
 */
public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString,
    final EntryTypeName entryTypeName) {
  addEntryWithString(indexedEntry, untokenizedString, entryTypeName, entryTypeName);
}
}
--- /dev/null
+/**
+ *
+ */
+package com.hughes.android.dictionary.engine;
+
+import com.hughes.util.IndexedObject;
+
+public class IndexedEntry extends IndexedObject {
+ public IndexedEntry(final AbstractEntry entry) {
+ super(-1);
+ this.entry = entry;
+ }
+ AbstractEntry entry;
+
+ public void addToDictionary(Dictionary dictionary) {
+ assert index == -1;
+ index = entry.addToDictionary(dictionary);
+ }
+}
\ No newline at end of file
-package com.hughes.android.dictionary;
+package com.hughes.android.dictionary.engine;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}");
static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+");
- static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
+ public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+");
static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$");
import java.util.regex.Pattern;
import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.IndexedEntry;
+import com.hughes.android.dictionary.engine.PairEntry;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
public class EnWiktionaryXmlParser {
"Particle|Interjection|Pronominal adverb" +
"Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
- final DictionaryBuilder dictBuilder;
-
- final IndexBuilder[] indexBuilders;
+ final IndexBuilder enIndexBuilder;
+ final IndexBuilder otherIndexBuilder;
final Pattern langPattern;
final Pattern langCodePattern;
- final int enIndexBuilder;
+ final boolean swap;
- public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern langPattern, final Pattern langCodePattern, final int enIndexBuilder) {
- this.dictBuilder = dictBuilder;
- this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
+ public EnWiktionaryXmlParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) {
+ this.enIndexBuilder = enIndexBuilder;
+ this.otherIndexBuilder = otherIndexBuilder;
this.langPattern = langPattern;
this.langCodePattern = langCodePattern;
- this.enIndexBuilder = enIndexBuilder;
+ this.swap = swap;
}
if (heading.replaceAll("=", "").equals("English")) {
doEnglishWord(title, text);
} else {
- //doForeignWord(title, text);
+ doForeignWord(title, text);
}
} // endPage()
while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
if (wikiFunction.name.equals("trans-top")) {
sense = null;
- if (wikiFunction.args.size() >= 2) {
- sense = wikiFunction.args.get(1);
+ if (wikiFunction.args.size() >= 1) {
+ sense = wikiFunction.args.get(0);
//System.out.println("Sense: " + sense);
}
} else if (wikiFunction.name.equals("trans-bottom")) {
if (colonIndex == -1) {
continue;
}
+
final String lang = line.substring(0, colonIndex);
if (!this.langPattern.matcher(lang).find()) {
continue;
}
- String rest = line.substring(colonIndex + 1);
- final StringBuilder lineText = new StringBuilder();
+ String rest = line.substring(colonIndex + 1).trim();
+ doTranslationLine(line, title, sense, rest);
- boolean ttbc = false;
- WikiFunction wikiFunction;
- while ((wikiFunction = WikiFunction.getFunction(line)) != null) {
- if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
- if (wikiFunction.args.size() < 2) {
- System.err.println("{{t}} with too few args: " + line + ", title=" + title);
- continue;
- }
- final String langCode = wikiFunction.getArg(0);
- if (this.langCodePattern.matcher(langCode).matches()) {
- final String word = wikiFunction.getArg(1);
- final String gender = wikiFunction.getArg(2);
- final String transliteration = wikiFunction.getNamedArg("tr");
- }
- } else if (wikiFunction.name.equals("qualifier")) {
- String qualifier = wikiFunction.getArg(0);
- } else if (encodings.contains(wikiFunction.name)) {
- rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0));
- wikiFunction = null;
- } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
- String gender = wikiFunction.name;
- for (int i = 0; i < wikiFunction.args.size(); ++i) {
- gender += "|" + wikiFunction.getArg(i);
- }
- rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}");
- wikiFunction = null;
- } else if (wikiFunction.name.equals("g")) {
- rest = wikiFunction.replaceWith(rest, "{g}");
- wikiFunction = null;
- } else if (wikiFunction.name.equals("l")) {
- // encodes text in various langs.
- rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(1));
- // TODO: transliteration
- wikiFunction = null;
- } else if (wikiFunction.name.equals("term")) {
- // cross-reference to another dictionary
- rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0));
- // TODO: transliteration
- wikiFunction = null;
- } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
- // TODO: put this text aside to use it.
- rest = wikiFunction.replaceWith(rest, "[" + wikiFunction.getArg(0) + "]");
- wikiFunction = null;
- } else if (wikiFunction.name.equals("ttbc")) {
- ttbc = true;
- } else if (wikiFunction.name.equals("trreq")) {
- } else if (wikiFunction.name.equals("not used")) {
- rest = wikiFunction.replaceWith(rest, "[not used]");
- wikiFunction = null;
- } else if (wikiFunction.name.equals("t-image")) {
- // American sign language
- } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
- rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}");
- wikiFunction = null;
- } else {
- System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
- }
- if (wikiFunction != null) {
- rest = wikiFunction.replaceWith(rest, "");
- }
- }
} else if (line.equals("")) {
} else if (line.startsWith(":")) {
} else if (line.startsWith("[[") && line.endsWith("]]")) {
}
+ private void doTranslationLine(final String line, final String title, final String sense, String rest) {
+
+ // Good chance we'll actually file this one...
+ final PairEntry pairEntry = new PairEntry();
+ final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
+
+ final StringBuilder otherText = new StringBuilder();
+
+ WikiFunction wikiFunction;
+ while ((wikiFunction = WikiFunction.getFunction(rest)) != null) {
+ if (wikiFunction.start > 0) {
+ String plainText = rest.substring(0, wikiFunction.start);
+ otherText.append("").append(plainText);
+ otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
+ }
+ rest = rest.substring(wikiFunction.end);
+
+ if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) {
+ if (wikiFunction.args.size() < 2) {
+ System.err.println("{{t}} with too few args: " + line + ", title=" + title);
+ continue;
+ }
+ final String langCode = wikiFunction.getArg(0);
+ if (this.langCodePattern.matcher(langCode).matches()) {
+ final String word = wikiFunction.getArg(1);
+ final String gender = wikiFunction.getArg(2);
+ final String transliteration = wikiFunction.getNamedArg("tr");
+ if (otherText.length() > 0) {
+ otherText.append("");
+ }
+ otherText.append(word);
+ otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+ if (gender != null) {
+ otherText.append(String.format(" {%s}", gender));
+ }
+ if (transliteration != null) {
+ otherText.append(String.format(" (tr. %s)", transliteration));
+ otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION);
+ }
+ }
+ } else if (wikiFunction.name.equals("qualifier")) {
+ String qualifier = wikiFunction.getArg(0);
+ if (!wikiFunction.namedArgs.isEmpty() || wikiFunction.args.size() > 1) {
+ System.err.println("weird qualifier: " + line);
+ }
+ otherText.append("(").append(qualifier).append(")");
+ } else if (encodings.contains(wikiFunction.name)) {
+ otherText.append("").append(wikiFunction.getArg(0));
+ otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+ } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) {
+ otherText.append("{");
+ otherText.append(wikiFunction.name);
+ for (int i = 0; i < wikiFunction.args.size(); ++i) {
+ otherText.append("|").append(wikiFunction.getArg(i));
+ }
+ otherText.append("}");
+ } else if (wikiFunction.name.equals("g")) {
+ otherText.append("{g}");
+ } else if (wikiFunction.name.equals("l")) {
+ // encodes text in various langs.
+ // lang is arg 0.
+ otherText.append("").append(wikiFunction.getArg(1));
+ otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(1), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+ // TODO: transliteration
+ } else if (wikiFunction.name.equals("term")) {
+ // cross-reference to another dictionary
+ otherText.append("").append(wikiFunction.getArg(0));
+ otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+ // TODO: transliteration
+ } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) {
+ // TODO: put this text aside to use it.
+ otherText.append("[").append(wikiFunction.getArg(0)).append("]");
+ otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT);
+ } else if (wikiFunction.name.equals("ttbc")) {
+ } else if (wikiFunction.name.equals("trreq")) {
+ } else if (wikiFunction.name.equals("not used")) {
+ otherText.append("(not used)");
+ } else if (wikiFunction.name.equals("t-image")) {
+ // American sign language
+ } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) {
+ otherText.append("{UNK. FUNC.: ").append(wikiFunction.name).append("}");
+ } else {
+ System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title);
+ }
+ }
+ String plainText = rest;
+ otherText.append("").append(plainText);
+ otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT);
+
+ StringBuilder englishText = new StringBuilder();
+
+ englishText.append(title);
+ if (sense != null) {
+ englishText.append(" (").append(sense).append(")");
+ enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE);
+ }
+ if (pos != null) {
+ englishText.append(" (").append(pos.toLowerCase()).append(")");
+ }
+ enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+
+ final Pair pair = new Pair(englishText.toString(), WikiParser.simpleParse(otherText.toString()), swap);
+ pairEntry.pairs.add(pair);
+ assert (pairsAdded.add(pair.toString()));
+ if (pair.toString().equals("libero {m} :: free (adjective)")) {
+ System.out.println();
+ }
+
+ }
+
+ Set<String> pairsAdded = new LinkedHashSet<String>();
+
// -------------------------------------------------------------------------
private void doForeignWord(String title, String text) {
while ((line = wikiLineReader.readLine()) != null) {
final WikiHeading wikiHeading = WikiHeading.getHeading(line);
if (wikiHeading != null) {
-
if (wikiHeading.name.equals("Translations")) {
System.err.println("Translations not in English section: " + title);
} else if (wikiHeading.name.equals("Pronunciation")) {
//doPronunciation(wikiLineReader);
} else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) {
-
+ doPartOfSpeech(title, wikiHeading, wikiLineReader);
+ }
+ }
+ }
+ }
+
+
+ private void doPartOfSpeech(String title, final WikiHeading posHeading, WikiLineReader wikiLineReader) {
+ String line;
+ System.out.println("***" + title);
+ System.out.println(posHeading.name);
+ while ((line = wikiLineReader.readLine()) != null) {
+ WikiHeading heading = WikiHeading.getHeading(line);
+ if (heading != null) {
+ if (heading.depth <= posHeading.depth) {
+ wikiLineReader.stuffLine(line);
+ return;
}
}
+ System.out.println(line);
+
+
}
}
--- /dev/null
+package com.hughes.android.dictionary.parser;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.parser.WikiWord.FormOf;
+import com.hughes.android.dictionary.parser.WikiWord.Translation;
+import com.hughes.util.ListUtil;
+import com.hughes.util.StringUtil;
+
+public class EnWiktionaryXmlParserOld extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+
+ static final Pattern partOfSpeechHeader = Pattern.compile(
+ "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+ "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+ "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+ "Ligature|Idiom|Phrase|" +
+ // These are @deprecated:
+ "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+ "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" +
+ // These are extras I found:
+ "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" +
+ "Particle|Interjection|Pronominal adverb" +
+ "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
+
+ static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
+
+ final DictionaryBuilder dictBuilder;
+
+ final IndexBuilder[] indexBuilders;
+ final Pattern[] langPatterns;
+ final int enIndexBuilder;
+
+ StringBuilder titleBuilder;
+ StringBuilder textBuilder;
+ StringBuilder currentBuilder = null;
+
/** Soft assertion with an empty message; see assertTrue(boolean, String). */
static void assertTrue(final boolean condition) {
  assertTrue(condition, "");
}
+
+ static void assertTrue(final boolean condition, final String message) {
+ if (!condition) {
+ System.err.println("Assertion failed, message: " + message);
+ new RuntimeException().printStackTrace(System.err);
+ }
+ }
+
+ public EnWiktionaryXmlParserOld(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) {
+ assertTrue(langPatterns.length == 2);
+ this.dictBuilder = dictBuilder;
+ this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]);
+ this.langPatterns = langPatterns;
+ this.enIndexBuilder = enIndexBuilder;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes attributes) {
+ currentBuilder = null;
+ if ("page".equals(qName)) {
+ titleBuilder = new StringBuilder();
+
+ // Start with "\n" to better match certain strings.
+ textBuilder = new StringBuilder("\n");
+ } else if ("title".equals(qName)) {
+ currentBuilder = titleBuilder;
+ } else if ("text".equals(qName)) {
+ currentBuilder = textBuilder;
+ }
+ }
+
/** SAX callback: appends character data to the active buffer, if any. */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
  if (currentBuilder != null) {
    currentBuilder.append(ch, start, length);
  }
}
+
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ currentBuilder = null;
+ if ("page".equals(qName)) {
+ endPage();
+ }
+ }
+
+
/**
 * Parses the given Wiktionary XML dump, streaming pages through this SAX
 * handler (endPage() fires once per &lt;page&gt;).
 */
public void parse(final File file) throws ParserConfigurationException,
    SAXException, IOException {
  final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
  parser.parse(file, this);
}
+
+ int pageCount = 0;
+ private void endPage() {
+ title = titleBuilder.toString();
+ ++pageCount;
+ if (pageCount % 1000 == 0) {
+ System.out.println("pageCount=" + pageCount);
+ }
+ if (title.startsWith("Wiktionary:") ||
+ title.startsWith("Template:") ||
+ title.startsWith("Appendix:") ||
+ title.startsWith("Category:") ||
+ title.startsWith("Index:") ||
+ title.startsWith("MediaWiki:") ||
+ title.startsWith("TransWiki:") ||
+ title.startsWith("Citations:") ||
+ title.startsWith("Concordance:") ||
+ title.startsWith("Help:")) {
+ return;
+ }
+ currentDepth = 0;
+ words.clear();
+ currentHeading = null;
+ insidePartOfSpeech = false;
+// System.err.println("Working on page: " + title);
+ try {
+ WikiParser.parse(textBuilder.toString(), this);
+ } catch (Throwable e) {
+ System.err.println("Failure on page: " + title);
+ e.printStackTrace(System.err);
+ }
+
+ for (final WikiWord word : words) {
+ word.wikiWordToQuickDic(dictBuilder, enIndexBuilder);
+ } // WikiWord
+
+ } // endPage()
+
+
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+
+ /**
+ * Two things can happen:
+ *
+ * We can be in a ==German== section. There we will see English definitions.
+ * Each POS should get its own QuickDic entry. Pretty much everything goes
+ * in.
+ *
+ * Or we can be in an ==English== section with English definitions
+ * and maybe see translations for languages we care about.
+ *
+ * In either case, we need to differentiate the subsections (Noun, Verb, etc.)
+ * into separate QuickDic entries, but that's tricky--how do we know when we
+ * found a subsection? Just ignore anything containing pronunciation and
+ * etymology?
+ *
+ * How do we decide when to seal the deal on an entry?
+ *
+ * Would be nice if the parser told us about leaving sections....
+ *
+ *
+ */
+
+ String title;
+ String currentHeading;
+ int currentDepth;
+ final List<WikiWord> words = new ArrayList<WikiWord>();
+ WikiWord currentWord;
+ WikiWord.PartOfSpeech currentPartOfSpeech;
+ WikiWord.TranslationSense currentTranslationSense;
+ boolean insidePartOfSpeech;
+
+ StringBuilder wikiBuilder = null;
+
+ @Override
+ public void onWikiLink(String[] args) {
+ if (wikiBuilder == null) {
+ return;
+ }
+ wikiBuilder.append(args[args.length - 1]);
+ }
+
+ // ttbc: translations to be checked.
+ static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
+ "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
+ "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
+ "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx"));
+ static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g"));
+ static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf", "pf.", "indeclinable"));
+ static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
+
/**
 * Wiki callback for a {{template}}.  Dispatches on the template name and on
 * parser state (inside a word, inside a part of speech, inside a
 * translation section), either appending rendered text to wikiBuilder or
 * updating the current word/part-of-speech/translation-sense state.
 *
 * @param positionalArgs arg 0 is the template name; 1.. are its positional args
 * @param namedArgs      the template's name=value args; mutated — consumed
 *                       keys are removed
 */
@Override
public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
  if (positionalArgs.isEmpty()) {
    // This happens very rarely with special templates.
    return;
  }
  final String name = positionalArgs.get(0);

  // Housekeeping args that never affect our output.
  namedArgs.remove("lang");
  namedArgs.remove("nocat");
  namedArgs.remove("nocap");
  namedArgs.remove("sc");

  // Pronunciation
  if (currentWord != null) {
    if (name.equals("a")) {
      // accent tag: start a new pronunciation bucket keyed by accent name.
      currentWord.currentPronunciation = new StringBuilder();
      currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
      return;
    }

    if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) {
      namedArgs.remove("lang");
      // Some dumps pass pronunciations as numbered named args ("1", "2", ...);
      // fold them back into the positional list, giving up after a gap past 10.
      for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) {
        final String pron = namedArgs.remove("" + i);
        if (pron != null) {
          positionalArgs.add(pron);
        } else {
          if (i > 10) {
            break;
          }
        }
      }
      if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) {
        System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString());
      }
      if (currentWord.currentPronunciation == null) {
        // No preceding {{a}} accent tag: use the unnamed accent bucket.
        currentWord.currentPronunciation = new StringBuilder();
        currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
      }
      if (currentWord.currentPronunciation.length() > 0) {
        currentWord.currentPronunciation.append("; ");
      }
      for (int i = 1; i < positionalArgs.size(); ++i) {
        if (i > 1) {
          currentWord.currentPronunciation.append(",");
        }
        // NOTE(review): get(1) inside a loop over i looks like a bug (every
        // item would repeat the first pronunciation) — confirm; may have
        // been intended as get(i).
        final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
        currentWord.currentPronunciation.append(pron).append("");
      }
      currentWord.currentPronunciation.append(" (").append(name).append(")");
      return;
    }

    if (name.equals("qualifier")) {
      //assertTrue(positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString());
      if (wikiBuilder == null) {
        return;
      }
      wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
      return;
    }

    if (name.equals("...")) {
      // Skipping any elided text for brevity.
      wikiBuilder.append("...");
      return;
    }

    if (passThroughTemplates.contains(name)) {
      assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs);
      wikiBuilder.append(name);
      return;
    }

    if (ignoreTemplates.contains(name)) {
      return;
    }

    if ("Pronunciation".equals(currentHeading)) {
      System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs);
      return;
    }
  } // Pronunciation

  // Part of speech
  if (insidePartOfSpeech) {

    // form of: e.g. {{form of|plural|word}} — record as a FormOf link.
    if (name.equals("form of")) {
      namedArgs.remove("sc");
      if (positionalArgs.size() < 3 || positionalArgs.size() > 4) {
        System.err.println("Invalid form of.");
      }
      final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3);
      final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1));
      currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token));
      return;
    }

    // The fallback plan: append the template!
    if (wikiBuilder != null) {
      wikiBuilder.append("{");
      boolean first = true;
      for (final String arg : positionalArgs) {
        if (!first) {
          wikiBuilder.append(", ");
        }
        first = false;
        wikiBuilder.append(arg);
      }
      // This one isn't so useful.
      for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
        if (!first) {
          wikiBuilder.append(", ");
        }
        first = false;
        wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue());
      }
      wikiBuilder.append("}");
    }

    //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs);
    return;
  } // Part of speech


  // Translations: {{trans-top|sense}} opens a new translation sense.
  if (name.equals("trans-top")) {
    assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs + title);

    if (currentPartOfSpeech == null) {
      assertTrue(currentWord != null && !currentWord.partsOfSpeech.isEmpty(), title);
      System.err.println("Assuming last part of speech for non-nested translation section: " + title);
      currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech);
    }

    currentTranslationSense = new WikiWord.TranslationSense();
    currentPartOfSpeech.translationSenses.add(currentTranslationSense);
    if (positionalArgs.size() > 1) {
      currentTranslationSense.sense = positionalArgs.get(1);
    }
    return;
  } // Translations

  // Everything below renders into wikiBuilder; bail if we have no target.
  if (wikiBuilder == null) {
    return;
  }
  if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
    assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs.toString());
    // NOTE(review): renders as "{args...name}" (name appended after the
    // args, before the brace) — confirm this ordering is intended.
    wikiBuilder.append("{");
    for (int i = 1; i < positionalArgs.size(); ++i) {
      wikiBuilder.append(i > 1 ? "," : "");
      wikiBuilder.append(positionalArgs.get(i));
    }
    wikiBuilder.append(name).append("}");

  } else if (name.equals("p")) {
    assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty());
    wikiBuilder.append("pl.");

  } else if (name.equals("s")) {
    assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"), title);
    wikiBuilder.append("sg.");

  } else if (grammarTemplates.contains(name)) {
    assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
    wikiBuilder.append(name).append(".");

  } else if (name.equals("l")) {
    // This template is designed to generate a link to a specific language-section on the target page.
    wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));

  } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
    // Translation: word at arg 2, genders at 3.., transliteration in tr=.
    if (positionalArgs.size() > 2) {
      wikiBuilder.append(positionalArgs.get(2));
    }
    for (int i = 3; i < positionalArgs.size(); ++i) {
      wikiBuilder.append(i == 3 ? " {" : ",");
      wikiBuilder.append(positionalArgs.get(i));
      wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : "");
    }
    final String transliteration = namedArgs.remove("tr");
    if (transliteration != null) {
      wikiBuilder.append(" (").append(transliteration).append(")");
    }

  } else if (name.equals("trreq")) {
    wikiBuilder.append("{{trreq}}");

  } else if (name.equals("qualifier")) {
    //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
    wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");

  } else if (useRemainingArgTemplates.contains(name)) {
    // Script/encoding wrappers: keep all remaining args, comma-separated.
    for (int i = 1; i < positionalArgs.size(); ++i) {
      if (i != 1) {
        wikiBuilder.append(", ");
      }
      wikiBuilder.append(positionalArgs.get(i));
    }
  } else if (ignoreTemplates.contains(name)) {
    // Do nothing.

  } else if (name.equals("initialism")) {
    assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
    wikiBuilder.append("Initialism");
  } else if (name.equals("abbreviation")) {
    assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
    wikiBuilder.append("Abbreviation");
  } else if (name.equals("acronym")) {
    assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
    wikiBuilder.append("Acronym");
  } else {
    if (currentTranslationSense != null) {
      System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs);
    }
  }
}
+
/** Wiki callback: plain text is appended to the active builder, if any. */
@Override
public void onText(String text) {
  if (wikiBuilder != null) {
    wikiBuilder.append(text);
    return;
  }
}
+
/**
 * Wiki callback: a heading begins.  Start capturing its text into
 * wikiBuilder and close any part-of-speech/word whose section this heading's
 * depth terminates.
 */
@Override
public void onHeadingStart(int depth) {
  wikiBuilder = new StringBuilder();
  currentDepth = depth;
  // A heading at or above the part-of-speech depth ends that subsection.
  if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
    currentPartOfSpeech = null;
    insidePartOfSpeech = false;
  }
  // Likewise for the enclosing language word.
  if (currentWord != null && depth <= currentWord.depth) {
    currentWord = null;
  }

  currentHeading = null;
}
+
/**
 * Wiki callback: a heading's text is complete.  Decides whether it opens a
 * new language section (WikiWord), a part-of-speech subsection, or a
 * Translations subsection, and updates the parser state accordingly.
 */
@Override
public void onHeadingEnd(int depth) {
  final String name = wikiBuilder.toString().trim();
  wikiBuilder = null;
  currentTranslationSense = null;
  currentHeading = name;

  // A language heading (English, Translingual, or either configured
  // language pattern) starts a new word entry for this page.
  final boolean lang0 = langPatterns[0].matcher(name).matches();
  final boolean lang1 = langPatterns[1].matcher(name).matches();
  if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) {
    currentWord = new WikiWord(title, depth);
    if (lang0 && lang1) {
      System.err.println("Word is indexed in both index1 and index2: " + title);
    }
    currentWord.language = name;
    // -1 means English/Translingual-only (not indexed by either pattern).
    currentWord.index = lang0 ? 0 : (lang1 ? 1 : -1);
    words.add(currentWord);
    return;
  }

  if (currentWord == null) {
    return;
  }

  if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) {
    currentPartOfSpeech = null;
  }

  insidePartOfSpeech = false;
  if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) {
    currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
    currentWord.partsOfSpeech.add(currentPartOfSpeech);
    insidePartOfSpeech = true;
    return;
  }

  // Translations only make sense under an English word's part of speech.
  if (name.equals("Translations")) {
    if (currentWord == null ||
        !currentWord.language.equals("English") ||
        currentPartOfSpeech == null) {
      System.err.println("Unexpected Translations section: " + title);
      return;
    }
    currentTranslationSense = new WikiWord.TranslationSense();
  }

}
+
+  // Begins capturing a list item's text; a new list item also ends any
+  // in-progress pronunciation accumulation on the current word.
+  @Override
+  public void onListItemStart(String header, int[] section) {
+    wikiBuilder = new StringBuilder();
+    if (currentWord != null) {
+      currentWord.currentPronunciation = null;
+    }
+  }
+
+
+  // Finishes a list item and routes it by its wiki list-header prefix
+  // (combinations of # * : ;): inside a part-of-speech section the prefix
+  // selects definition vs. quote source vs. example vs. English gloss;
+  // inside a Translations section the item is parsed as "Language: text".
+  @Override
+  public void onListItemEnd(String header, int[] section) {
+    String item = wikiBuilder.toString().trim();
+    if (item.length() == 0) {
+      // NOTE(review): this early return leaves wikiBuilder non-null, unlike
+      // every other exit path -- confirm that is intended.
+      return;
+    }
+    item = WikiParser.simpleParse(item);
+    wikiBuilder = null;
+    
+    // Part of speech
+    if (insidePartOfSpeech) {
+      assert currentPartOfSpeech != null : title + item;
+      if (header.equals("#") || 
+          header.equals("##") || 
+          header.equals("###") || 
+          header.equals("####") || 
+          header.equals(":#") || 
+          header.equals("::") ||
+          header.equals(":::*")) {
+        // Definition.
+        // :: should append, probably.
+        currentPartOfSpeech.newMeaning().meaning = item;
+        
+      // Source
+      } else if (header.equals("#*") ||
+          header.equals("##*") ||
+          header.equals("###*")) {
+        currentPartOfSpeech.lastMeaning().newExample().source = item;
+        
+      // Example
+      // NOTE(review): "#*:" is listed twice in this chain, and "#**" appears
+      // both here and in the Skip list below (the Skip occurrence is
+      // unreachable) -- worth deduplicating.
+      } else if (header.equals("#:") || 
+          header.equals("#*:") ||
+          header.equals("#:*") ||
+          header.equals("##:") ||
+          header.equals("##*:") ||
+          header.equals("#:*:") ||
+          header.equals("#:*#") ||
+          header.equals("#*:") ||
+          header.equals("*:") || 
+          header.equals("#:::") ||
+          header.equals("#**") ||
+          header.equals("#*:::") ||
+          header.equals("#:#") ||
+          header.equals(":::") ||
+          header.equals("##:*") ||
+          header.equals("###*:")) {
+        StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item);
+        
+      // Example in English
+      } else if (header.equals("#::") ||
+          header.equals("#*::") ||
+          header.equals("#:**") ||
+          header.equals("#*#") ||
+          header.equals("##*::")) {
+        StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item);
+        
+      // Skip
+      } else if (header.equals("*") ||
+          header.equals("**") ||
+          header.equals("***") ||
+          header.equals("*#") ||
+          header.equals(":") ||
+          header.equals("::*") ||
+          header.equals("#**") ||
+          header.equals(":*") ||
+          header.equals("#*:*") ||
+          header.equals("#*:**") || 
+          header.equals("#*:#") || 
+          header.equals("#*:*:") || 
+          header.equals("#*:*") || 
+          header.equals(";")) {
+        // might have: * {{seeCites}}
+        // * [[w:Arabic numerals|Arabic numerals]]: 2
+        //assert item.trim().length() == 0;
+        System.err.println("Skipping meaning: " + header + " " + item);
+      } else {
+        // Special-cased page with known-odd markup: silently skipped.
+        if (title.equals("Yellowknife")) {
+          return;
+        }
+        System.err.println("Busted heading: " + title + " "+ header + " " + item);
+      }
+      return;
+    }
+    // Part of speech
+    
+    // Translation
+    if (currentTranslationSense != null) {
+      // NOTE(review): this literal looks garbled -- presumably it was meant to
+      // match {{trreq}} ("translation requested") markers.  Verify.
+      if (item.indexOf("{{[trreq]{}}}") != -1) {
+        return;
+      }
+
+      // Lazily attach the sense the first time a real translation shows up.
+      if (currentPartOfSpeech.translationSenses.isEmpty()) {
+        currentPartOfSpeech.translationSenses.add(currentTranslationSense);
+      }
+
+      // Items look like "Language: translation text".
+      final int colonPos = item.indexOf(':');
+      if (colonPos == -1) {
+        System.err.println("Invalid translation: title=" + title + ",  item=" + item);
+        return;
+      }
+      final String lang = item.substring(0, colonPos);
+      final String trans = item.substring(colonPos + 1).trim();
+      for (int i = 0; i < 2; ++i) {
+        if (langPatterns[i].matcher(lang).find()) {
+          currentTranslationSense.translations.get(i).add(new Translation(lang, trans));
+        }
+      }
+    }  // Translation
+  }
+
+  // Newlines carry no meaning for this consumer.
+  @Override
+  public void onNewLine() {
+  }
+
+  // Paragraph breaks carry no meaning for this consumer.
+  @Override
+  public void onNewParagraph() {
+  }
+
+ // ----------------------------------------------------------------------
+
+  // HTML comments in the wikitext are ignored.
+  @Override
+  public void onComment(String text) {
+  }
+
+  // Bold formatting is ignored; only plain text is indexed.
+  @Override
+  public void onFormatBold(boolean boldOn) {
+  }
+
+  // Italic formatting is ignored; only plain text is indexed.
+  @Override
+  public void onFormatItalic(boolean italicOn) {
+  }
+
+  // Logs unterminated markup (e.g. an unclosed {{ or [[) and carries on.
+  @Override
+  public void onUnterminated(String start, String rest) {
+    System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest);
+  }
+  // A malformed heading terminator is treated as fatal for the page.
+  @Override
+  public void onInvalidHeaderEnd(String rest) {
+    throw new RuntimeException(rest);
+  }
+
+}
public class WikiHeading {
public final int depth;
public final String name;
+ public final String prefix;
- public WikiHeading(int depth, String name) {
+ public WikiHeading(int depth, String name, String prefix) {
this.depth = depth;
this.name = name;
+ this.prefix = prefix;
}
public static WikiHeading getHeading(String line) {
System.err.println("Invalid heading: " + line);
return null;
}
- return new WikiHeading(i, line.substring(i, line.length() - i).trim());
+ return new WikiHeading(i, line.substring(i, line.length() - i).trim(), prefix);
}
}
--- /dev/null
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+// Low-level streaming tokenizer for MediaWiki wikitext.  Each call to
+// nextToken() advances a [start, end) window over the input; token() returns
+// the current slice.  Recognized tokens: newlines, headings (= ... =), whole
+// list-item lines (* # : ;), bold/italic quote runs, [[links]],
+// {{templates}}, <pre>/<math>/<!-- --> spans, and plain-text runs.
+public final class WikiTokenizer {
+
+  //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE);
+  // Structural events that open/close nesting or end a plain-text run
+  // ($ matches end-of-line under MULTILINE).
+  private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|<!--|''|$)", Pattern.MULTILINE);
+  // Characters that introduce a list item at the start of a line.
+  private static final String listChars = "*#:;";
+  
+  
+  final String wikiText;
+  final Matcher matcher;
+
+  // True when the previous token ended a line, so the next token may be a
+  // heading or a list item (both are only valid at line start).
+  boolean justReturnedNewline = true;
+  int end = 0;
+  int start = -1;
+
+  // NOTE(review): never assigned anywhere in this class -- the heading branch
+  // of nextToken() computes offsets but discards them.  Looks unfinished;
+  // TODO confirm.
+  public String header;
+  public int headerDepth;
+  
+  // Open "[[" / "{{" markers awaiting their close, used by escapedFind().
+  final List<String> tokenStack = new ArrayList<String>();
+
+  public WikiTokenizer(final String wikiText) {
+    this.wikiText = wikiText;
+    this.matcher = wikiTokenEvent.matcher(wikiText);
+  }
+  
+  // Resets per-token state before scanning the next token.
+  private void clear() {
+    header = null;
+    headerDepth = 0;
+    tokenStack.clear();
+  }
+
+
+  // Advances to the next token; returns this (for chaining) or null at end
+  // of input.
+  public WikiTokenizer nextToken() {
+    this.clear();
+    
+    start = end;
+    
+    final int len = wikiText.length();
+    if (start >= len) {
+      return null;
+    }
+    
+    // Eat a newline if we're looking at one:
+    final boolean atNewline = wikiText.charAt(end) == '\n';
+    if (atNewline) {
+      justReturnedNewline = true;
+      ++end;
+      return this;
+    }
+    
+    if (justReturnedNewline) {
+      final char firstChar = wikiText.charAt(end);
+      if (firstChar == '=') {
+        // Heading: leading '='s, the title, then trailing '='s.
+        // NOTE(review): the four offsets below are computed but unused, and
+        // header/headerDepth are never set here -- TODO confirm intent.
+        final int headerStart = end;
+        while (++end < len && wikiText.charAt(end) == '=') {}
+        final int headerTitleStart = end;
+        while (++end < len && wikiText.charAt(end) != '=' && wikiText.charAt(end) != '\n') {}
+        final int headerTitleEnd = end;
+        while (++end < len && wikiText.charAt(end) == '=') {}
+        final int headerEnd = end;
+        
+        return this;
+      }
+      if (listChars.indexOf(firstChar) != -1) {
+        // List item: the whole line (to a nesting-aware end-of-line) is one token.
+        while (++end < len && listChars.indexOf(wikiText.charAt(end)) != -1) {}
+        end = escapedFind(start, "\n");
+        return this;
+      }
+    }
+    justReturnedNewline = false;
+
+    if (wikiText.startsWith("'''", start)) {
+      end = start + 3;
+      return this;
+    }
+    
+    if (wikiText.startsWith("''", start)) {
+      end = start + 2;
+      return this;
+    }
+
+    if (wikiText.startsWith("[[", start)) {
+      end = escapedFind(start + 2, "]]");
+      return this;
+    }
+
+    if (wikiText.startsWith("{{", start)) {
+      end = escapedFind(start + 2, "}}");
+      return this;
+    }
+
+    if (wikiText.startsWith("<pre>", start)) {
+      end = safeIndexOf(wikiText, start, "</pre>", "\n");
+      return this;
+    }
+
+    if (wikiText.startsWith("<math>", start)) {
+      end = safeIndexOf(wikiText, start, "</math>", "\n");
+      return this;
+    }
+
+    if (wikiText.startsWith("<!--", start)) {
+      end = safeIndexOf(wikiText, start, "-->", "\n");
+      return this;
+    }
+
+    // Stray closers with no matching opener: emit them as a 2-char token.
+    if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) {
+      System.err.println("Close without open!");
+      end += 2;
+      return this;
+    }
+
+    
+    // Plain text: runs up to the next structural event (or end of line).
+    if (this.matcher.find(start)) {
+      end = this.matcher.start(1);
+      if (end == start) {
+        System.err.println(this.matcher.group());
+        assert false;
+      }
+      return this;
+    }
+    
+    end = wikiText.length();
+    return this;
+    
+  }
+  
+  // The text of the current token.
+  public String token() {
+    return wikiText.substring(start, end);
+  }
+  
+  // Scans forward for 'toFind' at nesting level zero, honoring nested
+  // [[...]] / {{...}} pairs via tokenStack.  On malformed markup it logs and
+  // falls back to the end of the current line.  Returns an index just past
+  // the match.
+  private int escapedFind(final int start, final String toFind) {
+    assert tokenStack.isEmpty();
+
+    int end = start;
+    while (end < wikiText.length()) {
+      if (matcher.find(end)) {
+        final String matchText = matcher.group();
+        final int matchStart = matcher.start();
+        
+        if (matchText.length() == 0) {
+          // Zero-width match is the $ (end-of-line) alternative.
+          assert matchStart == wikiText.length() || wikiText.charAt(matchStart) == '\n';
+          if (tokenStack.isEmpty() && toFind.equals("\n")) {
+            return matchStart;
+          }
+          ++end;
+        } else if (tokenStack.isEmpty() && matchText.equals(toFind)) {
+          // The normal return....
+          return matcher.end();
+        } else if (matchText.equals("[[") || matchText.equals("{{")) {
+          tokenStack.add(matchText);
+        } else if (matchText.equals("]]") || matchText.equals("}}")) {
+          if (tokenStack.size() > 0) {
+            final String removed = tokenStack.remove(tokenStack.size() - 1);
+            if (removed.equals("{{") && !matcher.group().equals("}}")) {
+              System.err.println("Unmatched {{ error: " + wikiText.substring(start));
+              return safeIndexOf(wikiText, start, "\n", "\n");
+            } else if (removed.equals("[[") && !matcher.group().equals("]]")) {
+              System.err.println("Unmatched [[ error: " + wikiText.substring(start));
+              return safeIndexOf(wikiText, start, "\n", "\n");
+            }
+          } else {
+            System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n"));
+            // If we were looking for a newline
+            return safeIndexOf(wikiText, start, "\n", "\n");
+          }
+        } else if (matchText.equals("<!--")) {
+          // NOTE(review): searches from offset 0, not from 'end' -- an earlier
+          // "-->" in the text would move 'end' backwards.  Probably should be
+          // wikiText.indexOf("-->", end).  TODO confirm.
+          end = wikiText.indexOf("-->");
+          if (end == -1) {
+            System.err.println("Unmatched <!-- error: " + wikiText.substring(start));
+          }
+        } else {
+          assert false : "Match text='" + matchText + "'";
+          throw new IllegalStateException();
+        }
+      } else {
+        // Hmmm, we didn't find the closing symbol we were looking for...
+        System.err.println("Couldn't find: " + toFind + ", "+ wikiText.substring(start));
+        return safeIndexOf(wikiText, start, "\n", "\n");
+      }
+
+      // Inside the while loop.
+      end = Math.max(end, matcher.end());
+    }
+    return end;
+  }
+  
+  // Index just past 'target' at/after 'start'; failing that, just past
+  // 'backup'; failing both, the end of the string.
+  static int safeIndexOf(final String s, final int start, final String target, final String backup) {
+    int close = s.indexOf(target, start);
+    if (close != -1) {
+      return close + target.length();
+    }
+    close = s.indexOf(backup, start);
+    if (close != -1) {
+      return close + backup.length();
+    }
+    return s.length();
+  }
+  
+}
--- /dev/null
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+// Unit tests for WikiTokenizer, plus the WikiHeading / WikiFunction helper
+// classes (which are defined elsewhere in this package).
+public class WikiTokenizerTest extends TestCase {
+  
+  // Tokenizes a representative wikitext sample and checks the exact token
+  // stream, including error recovery for mismatched {{ ]] pairs.
+  public void testSimple() {
+    final String wikiText =
+      "Hi" + "\n" +
+      "Hello thad you're <!-- not --> '''pretty''' cool '''''over''''' there." + "\n" +
+      "hi <!--" + "\n" +
+      "multi-line" + "\n" +
+      "# comment -->" + "\n" +
+      "" + "\n" +
+      "asdf\n" +
+      "{{template_not_in_list}}" + "\n" +
+      "# {{template_in_list}}" + "\n" +
+      "[[wikitext]]:[[wikitext]]" + "\n" +  // don't want this to trigger a list
+      ": but this is a list!" + "\n" +
+      "*:* and so is this :::" + "\n" +
+      "here's [[some blah|some]] wikitext." + "\n" +
+      "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
+      "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
+      "== Header 2 ==" + "\n" +
+      "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
+      "{{mismatched]]" + "\n" +
+      "[[mismatched}}" + "\n" +
+      "{extraterminated}}" + "\n" +
+      "[extraterminated]]" + "\n" +
+      "=== {{header-template}} ===" + "\n";
+    
+    // Expected token stream, in order, for the sample above.
+    final String[] expectedTokens = new String[] {
+      "Hi",
+      "\n",
+      "Hello thad you're ",
+      "<!-- not -->",
+      " ",
+      "'''",
+      "pretty",
+      "'''",
+      " cool ",
+      "'''",
+      "''",
+      "over",
+      "'''",
+      "''",
+      " there.",
+      "\n",
+      "hi ",
+      "<!--\nmulti-line\n# comment -->",
+      "\n",
+      "\n",
+      "asdf",
+      "\n",
+      "{{template_not_in_list}}",
+      "\n",
+      "# {{template_in_list}}",
+      "\n",
+      "[[wikitext]]",
+      ":",
+      "[[wikitext]]",
+      "\n",
+      ": but this is a list!",
+      "\n",
+      "*:* and so is this :::",
+      "\n",
+      "here's ",
+      "[[some blah|some]]",
+      " wikitext.",
+      "\n",
+      "here's a ",
+      "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}",
+      " and some more text.",
+      "\n",
+      "== Header 2 ==",
+      "\n",
+      "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}",
+      "\n",
+      "{{mismatched]]\n",
+      "[[mismatched}}\n",
+      "{extraterminated",
+      "}}",
+      "\n",
+      "[extraterminated",
+      "]]",
+      "\n",
+      "=== {{header-template}} ===",
+      "\n",
+      };
+    
+    final List<String> actualTokens = new ArrayList<String>();
+    
+    final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText);
+    WikiTokenizer token;
+    int i = 0;
+    while ((token = wikiTokenizer.nextToken()) != null) {
+      actualTokens.add(token.token());
+      // Debug dump of the actual token stream (handy for updating expectations).
+      System.out.println("\"" + token.token().replace("\n", "\\n") + "\",");
+      assertEquals(expectedTokens[i++], token.token());
+    }
+    assertEquals(Arrays.asList(expectedTokens), actualTokens);
+  }
+  
+  // Exercises WikiHeading.getHeading: null for malformed headings, and
+  // name/depth extraction for balanced ones.
+  public void testWikiHeading() {
+    assertNull(WikiHeading.getHeading(""));
+    assertNull(WikiHeading.getHeading("="));
+    assertNull(WikiHeading.getHeading("=="));
+    assertNull(WikiHeading.getHeading("=a"));
+    assertNull(WikiHeading.getHeading("=a=="));
+    assertNull(WikiHeading.getHeading("===a=="));
+    assertNull(WikiHeading.getHeading("===a===="));
+    assertNull(WikiHeading.getHeading("a="));
+    assertEquals("a", WikiHeading.getHeading("=a=").name);
+    assertEquals(1, WikiHeading.getHeading("=a=").depth);
+    assertEquals("aa", WikiHeading.getHeading("==aa==").name);
+    assertEquals(2, WikiHeading.getHeading("==aa==").depth);
+  }
+
+  
+  // Exercises WikiFunction.getFunction: name, positional args, and named
+  // args, including nested [[...]] / {{...}} inside argument values.
+  public void testWikiFunction() {
+    assertNull(WikiFunction.getFunction(""));
+    assertNull(WikiFunction.getFunction("[[asdf]]"));
+    assertNull(WikiFunction.getFunction("asd [[asdf]]asdf "));
+    assertEquals("a", WikiFunction.getFunction("{{a}}").name);
+    assertEquals("a", WikiFunction.getFunction("{{a|b}}").name);
+    assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name);
+    assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name);
+    assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name);
+    assertEquals(Arrays.asList("b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args);
+    assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b"));
+    assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer"));
+  }
+  
+}
--- /dev/null
+package com.hughes.android.dictionary.parser;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.hughes.android.dictionary.engine.DictionaryBuilder;
+import com.hughes.android.dictionary.engine.IndexedEntry;
+import com.hughes.android.dictionary.engine.EntryTypeName;
+import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.engine.PairEntry;
+import com.hughes.android.dictionary.engine.PairEntry.Pair;
+import com.hughes.util.ListUtil;
+
+// One Wiktionary word section: a single language's entry under one page
+// title, accumulated by the parser and later flushed into a
+// DictionaryBuilder via wikiWordToQuickDic().
+public class WikiWord {
+  // Heading depth at which this word's language section started.
+  final int depth;
+  
+  final String title;
+  String language;
+
+  // Which index (0 or 1) this word's language belongs to, or -1 if the
+  // language matched neither pattern (e.g. "Translingual"); -1 suppresses
+  // per-index output below.
+  int index;
+  
+  // Accent label ("" for unspecified) -> accumulated pronunciation text.
+  final Map<String, StringBuilder> accentToPronunciation = new LinkedHashMap<String, StringBuilder>();
+  StringBuilder currentPronunciation = null;
+
+  final List<PartOfSpeech> partsOfSpeech = new ArrayList<WikiWord.PartOfSpeech>();
+  
+  public WikiWord(final String title, int depth) {
+    // Titles repeat heavily across the dump, so intern to save memory.
+    this.title = title.intern();
+    this.depth = depth;
+  }
+
+  // One part-of-speech section (Noun, Verb, ...) with its meanings,
+  // translation senses, and form-of references.
+  static class PartOfSpeech {
+    final int depth;
+    final String name;
+
+    final List<Meaning> meanings = new ArrayList<WikiWord.Meaning>();
+    
+    final List<TranslationSense> translationSenses = new ArrayList<WikiWord.TranslationSense>();
+    
+    final List<FormOf> formOfs = new ArrayList<WikiWord.FormOf>();
+    
+    public PartOfSpeech(final int depth, String name) {
+      this.depth = depth;
+      this.name = name.intern();
+    }
+
+    public Meaning newMeaning() {
+      final Meaning meaning = new Meaning();
+      meanings.add(meaning);
+      return meaning;
+    }
+    
+    // Last meaning, creating an empty one if none exists yet.
+    public Meaning lastMeaning() {
+      return meanings.isEmpty() ? newMeaning() : ListUtil.getLast(meanings);
+    }
+  }
+  
+  // One sense within a Translations section; translations are kept per
+  // index (slot 0 and slot 1, matching the two dictionary indices).
+  static class TranslationSense {
+    String sense;
+    List<List<Translation>> translations = new ArrayList<List<Translation>>();
+    {
+      translations.add(new ArrayList<Translation>());
+      translations.add(new ArrayList<Translation>());
+    }
+  }
+  
+  // A single "Language: text" translation line.
+  static class Translation {
+    String language;
+    String text;
+    
+    public Translation(final String language, final String text) {
+      this.language = language;
+      this.text = text;
+    }
+    
+    @Override
+    public String toString() {
+      return language + ": " + text;
+    }
+  }
+  
+  // A reference like "plural of X": grammarForm describes the relation,
+  // target is the referenced headword.
+  static class FormOf {
+    final String grammarForm;
+    final String target;
+    
+    public FormOf(final String grammarForm, final String token) {
+      this.grammarForm = grammarForm;
+      this.target = token;
+    }
+  }
+  
+  // One numbered definition plus its example sentences.
+  static class Meaning {
+    String meaning;
+    final List<Example> examples = new ArrayList<WikiWord.Example>();
+    
+    public Example newExample() {
+      final Example example = new Example();
+      this.examples.add(example);
+      return example;
+    }
+    
+    // Last example, creating an empty one if none exists yet.
+    public Example lastExample() {
+      return examples.isEmpty() ? newExample() : ListUtil.getLast(examples);
+    }
+  }
+  
+  // An example sentence, optionally with a citation source and an English
+  // rendering.
+  static class Example {
+    String source;
+    final StringBuilder example = new StringBuilder();
+    final StringBuilder exampleInEnglish = new StringBuilder();
+  }
+  
+  // -------------------------------------------------------------------------
+  
+  // Flushes this word into the dictionary: each part of speech, then one
+  // pronunciation entry.  enIndexBuilder is the index number holding
+  // English, or -1 if neither index is English.
+  void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) {
+    //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation);
+    if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) {
+      System.err.println("Word with no POS: " + title);
+    }
+    for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) {
+      partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
+    }  // PartOfSpeech
+
+    // Pronunciation.
+    if (index != -1) {
+      final PairEntry pronEntry = new PairEntry();
+      for (final Map.Entry<String, StringBuilder> accentToPron : accentToPronunciation.entrySet()) {
+        String accent = accentToPron.getKey();
+        if (accent.length() > 0) {
+          accent = accent + ": ";
+        }
+        // The swap flag (index != 0) puts the pronunciation on this word's
+        // side of the pair.
+        pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0));
+      }
+      if (pronEntry.pairs.size() > 0) {
+        final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pronEntry);
+        dictBuilder.dictionary.pairEntries.add(pronEntry);
+        final Set<String> tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
+        dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION);
+      }
+    }
+  }
+
+  
+  // Matches a template-name prefix like "{name," so template names are not
+  // indexed as meaning tokens.  NOTE(review): assumes simpleParse renders
+  // templates in "{name, args}" form -- verify against WikiParser.
+  static final Pattern templateName = Pattern.compile("\\{[^,]*,");
+  // Emits one part of speech: form-of cross references, then (for words in
+  // the non-English index) meaning/example pair entries, then translations.
+  private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder,
+      final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
+    //System.out.println("  pos: " + partOfSpeech.name);
+    
+    // NOTE(review): this loop only contains commented-out debugging and has
+    // no observable effect.
+    for (final WikiWord.Meaning meaning : partOfSpeech.meanings) {
+      //System.out.println("  meaning: " + meaning.meaning);
+      for (final WikiWord.Example example : meaning.examples) {
+        if (example.example.length() > 0) {
+          //System.out.println("    example: " + example.example);
+        }
+        if (example.exampleInEnglish.length() > 0) {
+          //System.out.println("    exampleInEnglish: " + example.exampleInEnglish);
+        }
+      }
+    }
+    
+    // Form-of references: one pair entry each, filed under the target token.
+    if (index != -1) {
+      final boolean formOfSwap = index != 0;
+      for (final FormOf formOf : partOfSpeech.formOfs) {
+        final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap);
+        final PairEntry pairEntry = new PairEntry();
+        pairEntry.pairs.add(pair);
+        final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+        dictBuilder.dictionary.pairEntries.add(pairEntry);
+        
+        // File under title token.
+        final Set<String> tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR);
+        dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF);
+      }
+    }
+    
+    
+    // Meanings are emitted only when an English index exists and this word
+    // belongs to the other (non-English) index.
+    if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) {
+      final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")";
+      final boolean swap = enIndexBuilder == 1;
+      
+      // Meanings.
+      for (final Meaning meaning : partOfSpeech.meanings) {
+        final PairEntry pairEntry = new PairEntry();
+        final List<Pair> pairs = pairEntry.pairs;
+        
+        final List<Set<String>> exampleTokens = new ArrayList<Set<String>>();
+        exampleTokens.add(new LinkedHashSet<String>());
+        exampleTokens.add(new LinkedHashSet<String>());
+        
+        if (meaning.meaning != null && meaning.meaning.length() > 0) {
+          final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap);
+          pairs.add(meaningPair);
+        } else {
+          System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name);
+        }
+        
+        // Examples
+        for (final Example example : meaning.examples) {
+          // Heuristic: if the example has no separate English line but
+          // contains an em-dash, treat the part after the dash as the
+          // English rendering (mutates example.example in place).
+          final int dashIndex = example.example.indexOf("—");
+          if (example.exampleInEnglish.length() == 0 && dashIndex != -1) {
+            System.out.println("Splitting example: title=" + title + ", "+ example.example);
+            example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim());
+            example.example.delete(dashIndex, example.example.length());
+          }
+          
+          if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) {
+            final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap);
+            pairs.add(pair);
+            
+            for (int i = 0; i < 2; ++i) {
+              exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR));
+            }
+          }
+        }
+
+        // Create EntryData with the PairEntry.
+        final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+        dictBuilder.dictionary.pairEntries.add(pairEntry);
+
+        // File under title token.
+        final Set<String> titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR);
+        dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD);
+        
+        // File under the meaning tokens (English):
+        if (meaning.meaning != null) {
+          // If the meaning contains any templates, strip out the template name
+          // so we don't index it.
+          final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll("");
+          final Set<String> meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR);
+          dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD);
+        }
+        
+        // File under other tokens that we saw.
+        for (int i = 0; i < 2; ++i) {
+          dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS);
+        }
+       
+      
+      }  // Meanings.
+      
+    }
+    
+    translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech);
+  }
+
+
+  // Emits one QuickDic pair entry per translation sense, combining all of a
+  // sense's translations for each index onto that index's side of the pair.
+  private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder,
+      final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) {
+    if (!partOfSpeech.translationSenses.isEmpty()) {
+      if (!language.equals("English")) {
+        System.err.println("Translation sections not in English.");
+      }
+      
+      final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)";
+      
+      for (final TranslationSense translationSense : partOfSpeech.translationSenses) {
+        //System.out.println("    sense: " + translationSense.sense);
+        if (translationSense.sense == null) {
+          //System.err.println("    null sense: " + title);
+        }
+        String englishSense = String.format(englishBase, translationSense.sense != null ? (": " + translationSense.sense) : "");
+        
+        // Per-index display text and per-index token->type accumulations.
+        final StringBuilder[] sideBuilders = new StringBuilder[2];
+        final List<Map<EntryTypeName, List<String>>> sideTokens = new ArrayList<Map<EntryTypeName,List<String>>>();
+        for (int i = 0; i < 2; ++i) {
+          sideBuilders[i] = new StringBuilder();
+          sideTokens.add(new LinkedHashMap<EntryTypeName, List<String>>());
+        }
+        
+        if (enIndexBuilder != -1) {
+          sideBuilders[enIndexBuilder].append(englishSense);
+          addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD);
+        }
+        
+        // Get the entries from the translation section.
+        for (int i = 0; i < 2; ++i) {
+          //System.out.println("      lang: " + i);
+          for (final Translation translation : translationSense.translations.get(i)) {
+            //System.out.println("      translation: " + translation);
+            sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : "");
+            // Prefix the language name only when this side mixes languages.
+            if (translationSense.translations.get(i).size() > 1) {
+              sideBuilders[i].append(translation.language).append(": ");
+            }
+            sideBuilders[i].append(translation.text);
+            
+            // TODO: Don't index {m}, {f}
+            // TODO: Don't even show: (1), (1-2), etc.
+            addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD);
+          }
+        }
+        
+        // Construct the Translations-based QuickDic entry for this TranslationSense.
+        if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) {
+          final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString());
+          final PairEntry pairEntry = new PairEntry();
+          pairEntry.pairs.add(pair);
+          final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry);
+          dictBuilder.dictionary.pairEntries.add(pairEntry);
+          
+          // Add the EntryData to the indices under the correct tokens.
+          for (int i = 0; i < 2; ++i) {
+            final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i);
+            for (final Map.Entry<EntryTypeName, List<String>> entry : sideTokens.get(i).entrySet()) {
+              for (final String token : entry.getValue()) {
+                final List<IndexedEntry> entries = indexBuilder.getOrCreateEntries(token, entry.getKey());
+                entries.add(entryData);
+              }
+            }
+            
+          }
+          
+        }
+      }  // Senses
+    }  // Translations
+  }
+
+  
+  // Tokenizes 'text' and appends the tokens under 'entryTypeName', upgrading
+  // TITLE_ONE_WORD to TITLE_MULTI_WORD when the text has several tokens.
+  static void addTokens(final String text, final Map<EntryTypeName, List<String>> map,
+      EntryTypeName entryTypeName) {
+    final Set<String> tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR);
+    if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) {
+      entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD;
+    }
+    List<String> tokenList = map.get(entryTypeName);
+    if (tokenList == null) {
+      tokenList = new ArrayList<String>();
+      map.put(entryTypeName, tokenList);
+    }
+    tokenList.addAll(tokens);
+  }
+
+  
+
+}