From eeb5667c56b2074b7eeac531589c9f1bf55ba738 Mon Sep 17 00:00:00 2001 From: Thad Hughes Date: Mon, 21 Nov 2011 20:33:50 -0800 Subject: [PATCH] go --- .../dictionary/engine/DictionaryBuilder.java | 3 +- .../engine/DictionaryBuilderMain.java | 26 +- .../engine/DictionaryBuilderTest.java | 5 +- .../dictionary/engine/IndexBuilder.java | 20 +- .../dictionary/engine/IndexedEntry.java | 19 + .../{ => engine}/WiktionarySplitter.java | 2 +- .../dictionary/parser/DictFileParser.java | 2 +- .../parser/EnWiktionaryXmlParser.java | 225 ++++-- .../parser/EnWiktionaryXmlParser.java.old | 647 ++++++++++++++++++ .../dictionary/parser/WikiHeading.java | 6 +- .../dictionary/parser/WikiTokenizer.java | 206 ++++++ .../dictionary/parser/WikiTokenizerTest.java | 138 ++++ .../dictionary/parser/WikiWord.java.old | 339 +++++++++ 13 files changed, 1539 insertions(+), 99 deletions(-) create mode 100644 src/com/hughes/android/dictionary/engine/IndexedEntry.java rename src/com/hughes/android/dictionary/{ => engine}/WiktionarySplitter.java (99%) create mode 100644 src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old create mode 100644 src/com/hughes/android/dictionary/parser/WikiTokenizer.java create mode 100644 src/com/hughes/android/dictionary/parser/WikiTokenizerTest.java create mode 100644 src/com/hughes/android/dictionary/parser/WikiWord.java.old diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java index ad80994..1be2af0 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilder.java @@ -137,7 +137,8 @@ public class DictionaryBuilder { if (enIndex < 0 || enIndex >= 2) { fatalError("Must be 1 or 2: " + prefix + "EnIndex"); } - new EnWiktionaryXmlParser(dictionaryBuilder, langPattern, langCodePattern, enIndex).parse(file, Integer.parseInt(pageLimit)); + new EnWiktionaryXmlParser(dictionaryBuilder.indexBuilders.get(enIndex), dictionaryBuilder.indexBuilders.get(1-enIndex), + langPattern, langCodePattern, enIndex != 0).parse(file, Integer.parseInt(pageLimit)); } else { fatalError("Invalid or missing input format: " + inputFormat); } diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index c49305e..17190a9 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -22,22 +22,10 @@ public class DictionaryBuilderMain extends TestCase { public static void main(final String[] args) throws Exception { - DictionaryBuilder.main(new String[] { - "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic", - "--lang1=DE", - "--lang2=EN", - "--dictInfo=@dictInputs/de-en_chemnitz.info", - - "--input1=dictInputs/de-en_chemnitz.txt", - "--input1Name=chemnitz", - "--input1Charset=UTF8", - "--input1Format=chemnitz", - }); - Lang[] langs1 = new Lang[] { new Lang("^English$", "EN"), - new Lang("^German$", "DE"), + //new Lang("^German$", "DE"), }; Lang[] langs2 = new Lang[] { new Lang("^Italian$", "IT"), @@ -131,6 +119,18 @@ public class DictionaryBuilderMain extends TestCase { } // langs2 } // langs1 + DictionaryBuilder.main(new String[] { + "--dictOut=dictOutputs/DE-EN_chemnitz.quickdic", + "--lang1=DE", + "--lang2=EN", + "--dictInfo=@dictInputs/de-en_chemnitz.info", + + "--input1=dictInputs/de-en_chemnitz.txt", + "--input1Name=chemnitz", + "--input1Charset=UTF8", 
+ "--input1Format=chemnitz", + }); + DictionaryBuilder.main(new String[] { "--dictOut=dictOutputs/de-en_all.quickdic", "--lang1=DE", diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java index a564b93..693db6c 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java @@ -20,14 +20,15 @@ public class DictionaryBuilderTest extends TestCase { "--lang2=EN", "--dictInfo=SomeWikiData", + /* "--input3=wikiSplit/english.data", - "--input3Name=enwiktionary.italian", + "--input3Name=enwiktionary.english", "--input3Format=enwiktionary", "--input3LangPattern=Italian", "--input3LangCodePattern=it", "--input3EnIndex=2", "--input3PageLimit=1000", - +*/ "--input4=wikiSplit/italian.data", "--input4Name=enwiktionary.italian", "--input4Format=enwiktionary", diff --git a/src/com/hughes/android/dictionary/engine/IndexBuilder.java b/src/com/hughes/android/dictionary/engine/IndexBuilder.java index 172be90..cab3318 100644 --- a/src/com/hughes/android/dictionary/engine/IndexBuilder.java +++ b/src/com/hughes/android/dictionary/engine/IndexBuilder.java @@ -12,6 +12,7 @@ import java.util.SortedMap; import java.util.TreeMap; import com.hughes.android.dictionary.engine.Index.IndexEntry; +import com.hughes.android.dictionary.parser.DictFileParser; public class IndexBuilder { @@ -40,6 +41,10 @@ public class IndexBuilder { // System.out.println("TOKEN: " + tokenData.token); for (final Map.Entry> typeToEntry : tokenData.typeToEntries.entrySet()) { for (final IndexedEntry entryData : typeToEntry.getValue()) { + if (entryData.index() == -1) { + entryData.addToDictionary(dictionaryBuilder.dictionary); + assert entryData.index() >= 0; + } if (tokenEntryDatas.add(entryData)) { rows.add(new PairEntry.Row(entryData.index(), rows.size(), index)); ++numRows; @@ -97,12 +102,21 @@ public class IndexBuilder { return entries; } - public void addEntryWithTokens(final IndexedEntry entryData, final Set tokens, + public void addEntryWithTokens(final IndexedEntry indexedEntry, final Set tokens, final EntryTypeName entryTypeName) { for (final String token : tokens) { - getOrCreateEntries(token, entryTypeName).add(entryData); + getOrCreateEntries(token, entryTypeName).add(indexedEntry); } } - + public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName singleTokenEntryTypeName, final EntryTypeName multiTokenEntryTypeName) { + final Set tokens = DictFileParser.tokenize(untokenizedString, DictFileParser.NON_CHAR); + addEntryWithTokens(indexedEntry, tokens, tokens.size() == 1 ? 
singleTokenEntryTypeName : multiTokenEntryTypeName); + } + + public void addEntryWithString(final IndexedEntry indexedEntry, final String untokenizedString, + final EntryTypeName entryTypeName) { + addEntryWithString(indexedEntry, untokenizedString, entryTypeName, entryTypeName); + } } diff --git a/src/com/hughes/android/dictionary/engine/IndexedEntry.java b/src/com/hughes/android/dictionary/engine/IndexedEntry.java new file mode 100644 index 0000000..dedb679 --- /dev/null +++ b/src/com/hughes/android/dictionary/engine/IndexedEntry.java @@ -0,0 +1,19 @@ +/** + * + */ +package com.hughes.android.dictionary.engine; + +import com.hughes.util.IndexedObject; + +public class IndexedEntry extends IndexedObject { + public IndexedEntry(final AbstractEntry entry) { + super(-1); + this.entry = entry; + } + AbstractEntry entry; + + public void addToDictionary(Dictionary dictionary) { + assert index == -1; + index = entry.addToDictionary(dictionary); + } +} \ No newline at end of file diff --git a/src/com/hughes/android/dictionary/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java similarity index 99% rename from src/com/hughes/android/dictionary/WiktionarySplitter.java rename to src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 89173c5..685b238 100644 --- a/src/com/hughes/android/dictionary/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -1,4 +1,4 @@ -package com.hughes.android.dictionary; +package com.hughes.android.dictionary.engine; import java.io.BufferedOutputStream; import java.io.DataOutputStream; diff --git a/src/com/hughes/android/dictionary/parser/DictFileParser.java b/src/com/hughes/android/dictionary/parser/DictFileParser.java index 909a6b7..20611ae 100644 --- a/src/com/hughes/android/dictionary/parser/DictFileParser.java +++ b/src/com/hughes/android/dictionary/parser/DictFileParser.java @@ -42,7 +42,7 @@ public class DictFileParser { static final Pattern CURLY_BRACED = Pattern.compile("\\{([^}]+)\\}"); static final Pattern NON_CHAR_DASH = Pattern.compile("[^-'\\p{L}0-9]+"); - static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+"); + public static final Pattern NON_CHAR = Pattern.compile("[^\\p{L}0-9]+"); static final Pattern TRIM_PUNC = Pattern.compile("^[^\\p{L}0-9]+|[^\\p{L}0-9]+$"); diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java index cfdf0f1..95e910d 100644 --- a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java @@ -12,7 +12,11 @@ import java.util.Set; import java.util.regex.Pattern; import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.PairEntry; +import com.hughes.android.dictionary.engine.PairEntry.Pair; public class EnWiktionaryXmlParser { @@ -29,19 +33,18 @@ public class EnWiktionaryXmlParser { "Particle|Interjection|Pronominal adverb" + "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); - final DictionaryBuilder dictBuilder; - - final IndexBuilder[] indexBuilders; + final IndexBuilder enIndexBuilder; + final IndexBuilder otherIndexBuilder; final Pattern langPattern; final Pattern langCodePattern; - final int enIndexBuilder; + 
final boolean swap; - public EnWiktionaryXmlParser(final DictionaryBuilder dictBuilder, final Pattern langPattern, final Pattern langCodePattern, final int enIndexBuilder) { - this.dictBuilder = dictBuilder; - this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]); + public EnWiktionaryXmlParser(final IndexBuilder enIndexBuilder, final IndexBuilder otherIndexBuilder, final Pattern langPattern, final Pattern langCodePattern, final boolean swap) { + this.enIndexBuilder = enIndexBuilder; + this.otherIndexBuilder = otherIndexBuilder; this.langPattern = langPattern; this.langCodePattern = langCodePattern; - this.enIndexBuilder = enIndexBuilder; + this.swap = swap; } @@ -92,7 +95,7 @@ public class EnWiktionaryXmlParser { if (heading.replaceAll("=", "").equals("English")) { doEnglishWord(title, text); } else { - //doForeignWord(title, text); + doForeignWord(title, text); } } // endPage() @@ -156,8 +159,8 @@ public class EnWiktionaryXmlParser { while ((wikiFunction = WikiFunction.getFunction(line)) != null) { if (wikiFunction.name.equals("trans-top")) { sense = null; - if (wikiFunction.args.size() >= 2) { - sense = wikiFunction.args.get(1); + if (wikiFunction.args.size() >= 1) { + sense = wikiFunction.args.get(0); //System.out.println("Sense: " + sense); } } else if (wikiFunction.name.equals("trans-bottom")) { @@ -182,75 +185,15 @@ public class EnWiktionaryXmlParser { if (colonIndex == -1) { continue; } + final String lang = line.substring(0, colonIndex); if (!this.langPattern.matcher(lang).find()) { continue; } - String rest = line.substring(colonIndex + 1); - final StringBuilder lineText = new StringBuilder(); + String rest = line.substring(colonIndex + 1).trim(); + doTranslationLine(line, title, sense, rest); - boolean ttbc = false; - WikiFunction wikiFunction; - while ((wikiFunction = WikiFunction.getFunction(line)) != null) { - if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) { - if (wikiFunction.args.size() < 2) { - System.err.println("{{t}} with too few args: " + line + ", title=" + title); - continue; - } - final String langCode = wikiFunction.getArg(0); - if (this.langCodePattern.matcher(langCode).matches()) { - final String word = wikiFunction.getArg(1); - final String gender = wikiFunction.getArg(2); - final String transliteration = wikiFunction.getNamedArg("tr"); - } - } else if (wikiFunction.name.equals("qualifier")) { - String qualifier = wikiFunction.getArg(0); - } else if (encodings.contains(wikiFunction.name)) { - rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0)); - wikiFunction = null; - } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) { - String gender = wikiFunction.name; - for (int i = 0; i < wikiFunction.args.size(); ++i) { - gender += "|" + wikiFunction.getArg(i); - } - rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}"); - wikiFunction = null; - } else if (wikiFunction.name.equals("g")) { - rest = wikiFunction.replaceWith(rest, "{g}"); - wikiFunction = null; - } else if (wikiFunction.name.equals("l")) { - // encodes text in various langs. 
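// Sketch (hypothetical input) of the WikiFunction accessors this translation loop
// relies on; the behavior shown matches the cases exercised in WikiTokenizerTest:
//   WikiFunction f = WikiFunction.getFunction("x {{t+|it|libero|m|tr=libero}} y");
//   f.name               -> "t+"
//   f.getArg(0)          -> "it"       (language code)
//   f.getArg(1)          -> "libero"   (translated word)
//   f.getArg(2)          -> "m"        (gender; null when the argument is absent)
//   f.getNamedArg("tr")  -> "libero"   (transliteration; null when absent)
//   f.replaceWith(line, s) returns line with the matched {{...}} span replaced by s.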
- rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(1)); - // TODO: transliteration - wikiFunction = null; - } else if (wikiFunction.name.equals("term")) { - // cross-reference to another dictionary - rest = wikiFunction.replaceWith(rest, wikiFunction.getArg(0)); - // TODO: transliteration - wikiFunction = null; - } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) { - // TODO: put this text aside to use it. - rest = wikiFunction.replaceWith(rest, "[" + wikiFunction.getArg(0) + "]"); - wikiFunction = null; - } else if (wikiFunction.name.equals("ttbc")) { - ttbc = true; - } else if (wikiFunction.name.equals("trreq")) { - } else if (wikiFunction.name.equals("not used")) { - rest = wikiFunction.replaceWith(rest, "[not used]"); - wikiFunction = null; - } else if (wikiFunction.name.equals("t-image")) { - // American sign language - } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) { - rest = wikiFunction.replaceWith(rest, "{" + wikiFunction.name + "}"); - wikiFunction = null; - } else { - System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title); - } - if (wikiFunction != null) { - rest = wikiFunction.replaceWith(rest, ""); - } - } } else if (line.equals("")) { } else if (line.startsWith(":")) { } else if (line.startsWith("[[") && line.endsWith("]]")) { @@ -265,6 +208,118 @@ public class EnWiktionaryXmlParser { } + private void doTranslationLine(final String line, final String title, final String sense, String rest) { + + // Good chance we'll actually file this one... + final PairEntry pairEntry = new PairEntry(); + final IndexedEntry indexedEntry = new IndexedEntry(pairEntry); + + final StringBuilder otherText = new StringBuilder(); + + WikiFunction wikiFunction; + while ((wikiFunction = WikiFunction.getFunction(rest)) != null) { + if (wikiFunction.start > 0) { + String plainText = rest.substring(0, wikiFunction.start); + otherText.append("").append(plainText); + otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT); + } + rest = rest.substring(wikiFunction.end); + + if (wikiFunction.name.equals("t") || wikiFunction.name.equals("t+") || wikiFunction.name.equals("t-") || wikiFunction.name.equals("tø")) { + if (wikiFunction.args.size() < 2) { + System.err.println("{{t}} with too few args: " + line + ", title=" + title); + continue; + } + final String langCode = wikiFunction.getArg(0); + if (this.langCodePattern.matcher(langCode).matches()) { + final String word = wikiFunction.getArg(1); + final String gender = wikiFunction.getArg(2); + final String transliteration = wikiFunction.getNamedArg("tr"); + if (otherText.length() > 0) { + otherText.append(""); + } + otherText.append(word); + otherIndexBuilder.addEntryWithString(indexedEntry, word, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + if (gender != null) { + otherText.append(String.format(" {%s}", gender)); + } + if (transliteration != null) { + otherText.append(String.format(" (tr. 
%s)", transliteration)); + otherIndexBuilder.addEntryWithString(indexedEntry, transliteration, EntryTypeName.WIKTIONARY_TRANSLITERATION); + } + } + } else if (wikiFunction.name.equals("qualifier")) { + String qualifier = wikiFunction.getArg(0); + if (!wikiFunction.namedArgs.isEmpty() || wikiFunction.args.size() > 1) { + System.err.println("weird qualifier: " + line); + } + otherText.append("(").append(qualifier).append(")"); + } else if (encodings.contains(wikiFunction.name)) { + otherText.append("").append(wikiFunction.getArg(0)); + otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT); + } else if (wikiFunction.name.equals("m") || wikiFunction.name.equals("f") || wikiFunction.name.equals("n")) { + otherText.append("{"); + otherText.append(wikiFunction.name); + for (int i = 0; i < wikiFunction.args.size(); ++i) { + otherText.append("|").append(wikiFunction.getArg(i)); + } + otherText.append("}"); + } else if (wikiFunction.name.equals("g")) { + otherText.append("{g}"); + } else if (wikiFunction.name.equals("l")) { + // encodes text in various langs. + // lang is arg 0. + otherText.append("").append(wikiFunction.getArg(1)); + otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(1), EntryTypeName.WIKTIONARY_OTHER_TEXT); + // TODO: transliteration + } else if (wikiFunction.name.equals("term")) { + // cross-reference to another dictionary + otherText.append("").append(wikiFunction.getArg(0)); + otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT); + // TODO: transliteration + } else if (wikiFunction.name.equals("italbrac") || wikiFunction.name.equals("gloss")) { + // TODO: put this text aside to use it. + otherText.append("[").append(wikiFunction.getArg(0)).append("]"); + otherIndexBuilder.addEntryWithString(indexedEntry, wikiFunction.getArg(0), EntryTypeName.WIKTIONARY_OTHER_TEXT); + } else if (wikiFunction.name.equals("ttbc")) { + } else if (wikiFunction.name.equals("trreq")) { + } else if (wikiFunction.name.equals("not used")) { + otherText.append("(not used)"); + } else if (wikiFunction.name.equals("t-image")) { + // American sign language + } else if (wikiFunction.args.isEmpty() && wikiFunction.namedArgs.isEmpty()) { + otherText.append("{UNK. 
FUNC.: ").append(wikiFunction.name).append("}"); + } else { + System.err.println("Unexpected t+- wikifunction: " + line + ", title=" + title); + } + } + String plainText = rest; + otherText.append("").append(plainText); + otherIndexBuilder.addEntryWithString(indexedEntry, plainText, EntryTypeName.WIKTIONARY_OTHER_TEXT); + + StringBuilder englishText = new StringBuilder(); + + englishText.append(title); + if (sense != null) { + englishText.append(" (").append(sense).append(")"); + enIndexBuilder.addEntryWithString(indexedEntry, sense, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE, EntryTypeName.WIKTIONARY_TRANSLATION_SENSE); + } + if (pos != null) { + englishText.append(" (").append(pos.toLowerCase()).append(")"); + } + enIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_SINGLE, EntryTypeName.WIKTIONARY_TITLE_MULTI); + + final Pair pair = new Pair(englishText.toString(), WikiParser.simpleParse(otherText.toString()), swap); + pairEntry.pairs.add(pair); + assert (pairsAdded.add(pair.toString())); + if (pair.toString().equals("libero {m} :: free (adjective)")) { + System.out.println(); + } + + } + + Set pairsAdded = new LinkedHashSet(); + // ------------------------------------------------------------------------- private void doForeignWord(String title, String text) { @@ -273,15 +328,33 @@ public class EnWiktionaryXmlParser { while ((line = wikiLineReader.readLine()) != null) { final WikiHeading wikiHeading = WikiHeading.getHeading(line); if (wikiHeading != null) { - if (wikiHeading.name.equals("Translations")) { System.err.println("Translations not in English section: " + title); } else if (wikiHeading.name.equals("Pronunciation")) { //doPronunciation(wikiLineReader); } else if (partOfSpeechHeader.matcher(wikiHeading.name).matches()) { - + doPartOfSpeech(title, wikiHeading, wikiLineReader); + } + } + } + } + + + private void doPartOfSpeech(String title, final WikiHeading posHeading, WikiLineReader wikiLineReader) { + String line; + System.out.println("***" + title); + System.out.println(posHeading.name); + while ((line = wikiLineReader.readLine()) != null) { + WikiHeading heading = WikiHeading.getHeading(line); + if (heading != null) { + if (heading.depth <= posHeading.depth) { + wikiLineReader.stuffLine(line); + return; } } + System.out.println(line); + + } } diff --git a/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old new file mode 100644 index 0000000..75f2121 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/EnWiktionaryXmlParser.java.old @@ -0,0 +1,647 @@ +package com.hughes.android.dictionary.parser; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.parser.WikiWord.FormOf; +import com.hughes.android.dictionary.parser.WikiWord.Translation; +import com.hughes.util.ListUtil; +import com.hughes.util.StringUtil; + +public class EnWiktionaryXmlParserOld extends org.xml.sax.helpers.DefaultHandler 
implements WikiCallback { + + static final Pattern partOfSpeechHeader = Pattern.compile( + "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" + + "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" + + "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" + + "Ligature|Idiom|Phrase|" + + // These are @deprecated: + "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" + + "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb|" + + // These are extras I found: + "Determiner|Numeral|Number|Cardinal number|Ordinal number|Proverb|" + + "Particle|Interjection|Pronominal adverb" + + "Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable"); + + static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+"); + + final DictionaryBuilder dictBuilder; + + final IndexBuilder[] indexBuilders; + final Pattern[] langPatterns; + final int enIndexBuilder; + + StringBuilder titleBuilder; + StringBuilder textBuilder; + StringBuilder currentBuilder = null; + + static void assertTrue(final boolean condition) { + assertTrue(condition, ""); + } + + static void assertTrue(final boolean condition, final String message) { + if (!condition) { + System.err.println("Assertion failed, message: " + message); + new RuntimeException().printStackTrace(System.err); + } + } + + public EnWiktionaryXmlParserOld(final DictionaryBuilder dictBuilder, final Pattern[] langPatterns, final int enIndexBuilder) { + assertTrue(langPatterns.length == 2); + this.dictBuilder = dictBuilder; + this.indexBuilders = dictBuilder.indexBuilders.toArray(new IndexBuilder[0]); + this.langPatterns = langPatterns; + this.enIndexBuilder = enIndexBuilder; + } + + @Override + public void startElement(String uri, String localName, String qName, + Attributes attributes) { + currentBuilder = null; + if ("page".equals(qName)) { + titleBuilder = new StringBuilder(); + + // Start with "\n" to better match certain strings. 
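// Sketch: the SAX pattern this handler uses, reduced to its core -- route characters()
// into whichever StringBuilder the current element selects, and flush on </page>.
// Class and field names here are illustrative only; the real logic is in the methods below.
final class PageTextCollector extends org.xml.sax.helpers.DefaultHandler {
  StringBuilder title, text, current;

  @Override public void startElement(String uri, String localName, String qName,
      org.xml.sax.Attributes attributes) {
    if ("page".equals(qName)) { title = new StringBuilder(); text = new StringBuilder("\n"); current = null; }
    else if ("title".equals(qName)) current = title;   // route characters into the title
    else if ("text".equals(qName)) current = text;     // ... or into the page body
    else current = null;
  }

  @Override public void characters(char[] ch, int start, int length) {
    if (current != null) current.append(ch, start, length);
  }

  @Override public void endElement(String uri, String localName, String qName) {
    current = null;
    // on </page>, (title, text) is handed to the wiki-markup parser
  }
}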
+ textBuilder = new StringBuilder("\n"); + } else if ("title".equals(qName)) { + currentBuilder = titleBuilder; + } else if ("text".equals(qName)) { + currentBuilder = textBuilder; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (currentBuilder != null) { + currentBuilder.append(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) + throws SAXException { + currentBuilder = null; + if ("page".equals(qName)) { + endPage(); + } + } + + + public void parse(final File file) throws ParserConfigurationException, + SAXException, IOException { + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + parser.parse(file, this); + } + + int pageCount = 0; + private void endPage() { + title = titleBuilder.toString(); + ++pageCount; + if (pageCount % 1000 == 0) { + System.out.println("pageCount=" + pageCount); + } + if (title.startsWith("Wiktionary:") || + title.startsWith("Template:") || + title.startsWith("Appendix:") || + title.startsWith("Category:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("TransWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Help:")) { + return; + } + currentDepth = 0; + words.clear(); + currentHeading = null; + insidePartOfSpeech = false; +// System.err.println("Working on page: " + title); + try { + WikiParser.parse(textBuilder.toString(), this); + } catch (Throwable e) { + System.err.println("Failure on page: " + title); + e.printStackTrace(System.err); + } + + for (final WikiWord word : words) { + word.wikiWordToQuickDic(dictBuilder, enIndexBuilder); + } // WikiWord + + } // endPage() + + + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + + /** + * Two things can happen: + * + * We can be in a ==German== section. There we will see English definitions. + * Each POS should get its own QuickDic entry. Pretty much everything goes + * in. + * + * Or we can be in an ==English== section with English definitions + * and maybe see translations for languages we care about. + * + * In either case, we need to differentiate the subsections (Noun, Verb, etc.) + * into separate QuickDic entries, but that's tricky--how do we know when we + * found a subsection? Just ignore anything containing pronunciation and + * etymology? + * + * How do we decide when to seal the deal on an entry? + * + * Would be nice if the parser told us about leaving sections.... + * + * + */ + + String title; + String currentHeading; + int currentDepth; + final List words = new ArrayList(); + WikiWord currentWord; + WikiWord.PartOfSpeech currentPartOfSpeech; + WikiWord.TranslationSense currentTranslationSense; + boolean insidePartOfSpeech; + + StringBuilder wikiBuilder = null; + + @Override + public void onWikiLink(String[] args) { + if (wikiBuilder == null) { + return; + } + wikiBuilder.append(args[args.length - 1]); + } + + // ttbc: translations to be checked. 
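// Sketch of the namespace filter endPage() applies above: only main-namespace
// article titles are parsed further. The prefix list mirrors the startsWith()
// checks in endPage(); the helper name itself is illustrative.
static final String[] SKIPPED_NAMESPACES = {
  "Wiktionary:", "Template:", "Appendix:", "Category:", "Index:",
  "MediaWiki:", "TransWiki:", "Citations:", "Concordance:", "Help:" };

static boolean isArticlePage(final String title) {
  for (final String prefix : SKIPPED_NAMESPACES) {
    if (title.startsWith(prefix)) return false;
  }
  return true;
}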
+ static final Set useRemainingArgTemplates = new LinkedHashSet(Arrays.asList( + "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo", + "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts", + "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx")); + static final Set ignoreTemplates = new LinkedHashSet(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g")); + static final Set grammarTemplates = new LinkedHashSet(Arrays.asList("impf", "pf", "pf.", "indeclinable")); + static final Set passThroughTemplates = new LinkedHashSet(Arrays.asList("zzzzzzzzzzzzzzz")); + + @Override + public void onTemplate(final List positionalArgs, final Map namedArgs) { + if (positionalArgs.isEmpty()) { + // This happens very rarely with special templates. + return; + } + final String name = positionalArgs.get(0); + + namedArgs.remove("lang"); + namedArgs.remove("nocat"); + namedArgs.remove("nocap"); + namedArgs.remove("sc"); + + // Pronunciation + if (currentWord != null) { + if (name.equals("a")) { + // accent tag + currentWord.currentPronunciation = new StringBuilder(); + currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation); + return; + } + + if (name.equals("IPA") || name.equals("SAMPA") || name.equals("X-SAMPA") || name.equals("enPR")) { + namedArgs.remove("lang"); + for (int i = 0; i < 100 && !namedArgs.isEmpty(); ++i) { + final String pron = namedArgs.remove("" + i); + if (pron != null) { + positionalArgs.add(pron); + } else { + if (i > 10) { + break; + } + } + } + if (!(positionalArgs.size() >= 2 && namedArgs.isEmpty())) { + System.err.println("Invalid pronunciation: " + positionalArgs.toString() + namedArgs.toString()); + } + if (currentWord.currentPronunciation == null) { + currentWord.currentPronunciation = new StringBuilder(); + currentWord.accentToPronunciation.put("", currentWord.currentPronunciation); + } + if (currentWord.currentPronunciation.length() > 0) { + currentWord.currentPronunciation.append("; "); + } + for (int i = 1; i < positionalArgs.size(); ++i) { + if (i > 1) { + currentWord.currentPronunciation.append(","); + } + final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll(""); + currentWord.currentPronunciation.append(pron).append(""); + } + currentWord.currentPronunciation.append(" (").append(name).append(")"); + return; + } + + if (name.equals("qualifier")) { + //assertTrue(positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString()); + if (wikiBuilder == null) { + return; + } + wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); + return; + } + + if (name.equals("...")) { + // Skipping any elided text for brevity. 
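// Sketch of the accent/pronunciation bookkeeping used by the {{a|...}} and
// {{IPA|...}} branches above: an accent template selects the StringBuilder that
// later pronunciation templates append to. The example values are hypothetical.
static void sketchPronunciation(final Map<String, StringBuilder> accentToPronunciation) {
  // {{a|UK}} -- start a pronunciation buffer for the "UK" accent.
  final StringBuilder current = new StringBuilder();
  accentToPronunciation.put("UK", current);
  // {{IPA|/ˈsʌm.wɜːd/}} -- append to the current buffer, separated by "; " if non-empty.
  if (current.length() > 0) current.append("; ");
  current.append("/ˈsʌm.wɜːd/").append(" (IPA)");
}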
+ wikiBuilder.append("..."); + return; + } + + if (passThroughTemplates.contains(name)) { + assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs); + wikiBuilder.append(name); + return; + } + + if (ignoreTemplates.contains(name)) { + return; + } + + if ("Pronunciation".equals(currentHeading)) { + System.err.println("Unhandled pronunciation template: " + positionalArgs + namedArgs); + return; + } + } // Pronunciation + + // Part of speech + if (insidePartOfSpeech) { + + // form of + if (name.equals("form of")) { + namedArgs.remove("sc"); + if (positionalArgs.size() < 3 || positionalArgs.size() > 4) { + System.err.println("Invalid form of."); + } + final String token = positionalArgs.get(positionalArgs.size() == 3 ? 2 : 3); + final String grammarForm = WikiParser.simpleParse(positionalArgs.get(1)); + currentPartOfSpeech.formOfs.add(new FormOf(grammarForm, token)); + return; + } + + // The fallback plan: append the template! + if (wikiBuilder != null) { + wikiBuilder.append("{"); + boolean first = true; + for (final String arg : positionalArgs) { + if (!first) { + wikiBuilder.append(", "); + } + first = false; + wikiBuilder.append(arg); + } + // This one isn't so useful. + for (final Map.Entry entry : namedArgs.entrySet()) { + if (!first) { + wikiBuilder.append(", "); + } + first = false; + wikiBuilder.append(entry.getKey()).append("=").append(entry.getValue()); + } + wikiBuilder.append("}"); + } + + //System.err.println("Unhandled part of speech template: " + positionalArgs + namedArgs); + return; + } // Part of speech + + + // Translations + if (name.equals("trans-top")) { + assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs + title); + + if (currentPartOfSpeech == null) { + assertTrue(currentWord != null && !currentWord.partsOfSpeech.isEmpty(), title); + System.err.println("Assuming last part of speech for non-nested translation section: " + title); + currentPartOfSpeech = ListUtil.getLast(currentWord.partsOfSpeech); + } + + currentTranslationSense = new WikiWord.TranslationSense(); + currentPartOfSpeech.translationSenses.add(currentTranslationSense); + if (positionalArgs.size() > 1) { + currentTranslationSense.sense = positionalArgs.get(1); + } + return; + } // Translations + + if (wikiBuilder == null) { + return; + } + if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) { + assertTrue(positionalArgs.size() >= 1 && namedArgs.isEmpty(), positionalArgs.toString() + namedArgs.toString()); + wikiBuilder.append("{"); + for (int i = 1; i < positionalArgs.size(); ++i) { + wikiBuilder.append(i > 1 ? "," : ""); + wikiBuilder.append(positionalArgs.get(i)); + } + wikiBuilder.append(name).append("}"); + + } else if (name.equals("p")) { + assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty()); + wikiBuilder.append("pl."); + + } else if (name.equals("s")) { + assertTrue(positionalArgs.size() == 1 && namedArgs.isEmpty() || title.equals("dobra"), title); + wikiBuilder.append("sg."); + + } else if (grammarTemplates.contains(name)) { + assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; + wikiBuilder.append(name).append("."); + + } else if (name.equals("l")) { + // This template is designed to generate a link to a specific language-section on the target page. + wikiBuilder.append(positionalArgs.size() >= 4 ? 
positionalArgs.get(3) : positionalArgs.get(2)); + + } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) { + if (positionalArgs.size() > 2) { + wikiBuilder.append(positionalArgs.get(2)); + } + for (int i = 3; i < positionalArgs.size(); ++i) { + wikiBuilder.append(i == 3 ? " {" : ","); + wikiBuilder.append(positionalArgs.get(i)); + wikiBuilder.append(i == positionalArgs.size() - 1 ? "}" : ""); + } + final String transliteration = namedArgs.remove("tr"); + if (transliteration != null) { + wikiBuilder.append(" (").append(transliteration).append(")"); + } + + } else if (name.equals("trreq")) { + wikiBuilder.append("{{trreq}}"); + + } else if (name.equals("qualifier")) { + //assert positionalArgs.size() == 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString(); + wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")"); + + } else if (useRemainingArgTemplates.contains(name)) { + for (int i = 1; i < positionalArgs.size(); ++i) { + if (i != 1) { + wikiBuilder.append(", "); + } + wikiBuilder.append(positionalArgs.get(i)); + } + } else if (ignoreTemplates.contains(name)) { + // Do nothing. + + } else if (name.equals("initialism")) { + assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; + wikiBuilder.append("Initialism"); + } else if (name.equals("abbreviation")) { + assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; + wikiBuilder.append("Abbreviation"); + } else if (name.equals("acronym")) { + assert positionalArgs.size() <= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs; + wikiBuilder.append("Acronym"); + } else { + if (currentTranslationSense != null) { + System.err.println("Unhandled template: " + positionalArgs.toString() + namedArgs); + } + } + } + + @Override + public void onText(String text) { + if (wikiBuilder != null) { + wikiBuilder.append(text); + return; + } + } + + @Override + public void onHeadingStart(int depth) { + wikiBuilder = new StringBuilder(); + currentDepth = depth; + if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { + currentPartOfSpeech = null; + insidePartOfSpeech = false; + } + if (currentWord != null && depth <= currentWord.depth) { + currentWord = null; + } + + currentHeading = null; + } + + @Override + public void onHeadingEnd(int depth) { + final String name = wikiBuilder.toString().trim(); + wikiBuilder = null; + currentTranslationSense = null; + currentHeading = name; + + final boolean lang0 = langPatterns[0].matcher(name).matches(); + final boolean lang1 = langPatterns[1].matcher(name).matches(); + if (name.equalsIgnoreCase("English") || lang0 || lang1 || name.equalsIgnoreCase("Translingual")) { + currentWord = new WikiWord(title, depth); + if (lang0 && lang1) { + System.err.println("Word is indexed in both index1 and index2: " + title); + } + currentWord.language = name; + currentWord.index = lang0 ? 0 : (lang1 ? 
1 : -1); + words.add(currentWord); + return; + } + + if (currentWord == null) { + return; + } + + if (currentPartOfSpeech != null && depth <= currentPartOfSpeech.depth) { + currentPartOfSpeech = null; + } + + insidePartOfSpeech = false; + if (currentPartOfSpeech == null && partOfSpeechHeader.matcher(name).matches()) { + currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name); + currentWord.partsOfSpeech.add(currentPartOfSpeech); + insidePartOfSpeech = true; + return; + } + + if (name.equals("Translations")) { + if (currentWord == null || + !currentWord.language.equals("English") || + currentPartOfSpeech == null) { + System.err.println("Unexpected Translations section: " + title); + return; + } + currentTranslationSense = new WikiWord.TranslationSense(); + } + + } + + @Override + public void onListItemStart(String header, int[] section) { + wikiBuilder = new StringBuilder(); + if (currentWord != null) { + currentWord.currentPronunciation = null; + } + } + + + @Override + public void onListItemEnd(String header, int[] section) { + String item = wikiBuilder.toString().trim(); + if (item.length() == 0) { + return; + } + item = WikiParser.simpleParse(item); + wikiBuilder = null; + + // Part of speech + if (insidePartOfSpeech) { + assert currentPartOfSpeech != null : title + item; + if (header.equals("#") || + header.equals("##") || + header.equals("###") || + header.equals("####") || + header.equals(":#") || + header.equals("::") || + header.equals(":::*")) { + // Definition. + // :: should append, probably. + currentPartOfSpeech.newMeaning().meaning = item; + + // Source + } else if (header.equals("#*") || + header.equals("##*") || + header.equals("###*")) { + currentPartOfSpeech.lastMeaning().newExample().source = item; + + // Example + } else if (header.equals("#:") || + header.equals("#*:") || + header.equals("#:*") || + header.equals("##:") || + header.equals("##*:") || + header.equals("#:*:") || + header.equals("#:*#") || + header.equals("#*:") || + header.equals("*:") || + header.equals("#:::") || + header.equals("#**") || + header.equals("#*:::") || + header.equals("#:#") || + header.equals(":::") || + header.equals("##:*") || + header.equals("###*:")) { + StringUtil.appendLine(currentPartOfSpeech.lastMeaning().newExample().example, item); + + // Example in English + } else if (header.equals("#::") || + header.equals("#*::") || + header.equals("#:**") || + header.equals("#*#") || + header.equals("##*::")) { + StringUtil.appendLine(currentPartOfSpeech.lastMeaning().lastExample().exampleInEnglish, item); + + // Skip + } else if (header.equals("*") || + header.equals("**") || + header.equals("***") || + header.equals("*#") || + header.equals(":") || + header.equals("::*") || + header.equals("#**") || + header.equals(":*") || + header.equals("#*:*") || + header.equals("#*:**") || + header.equals("#*:#") || + header.equals("#*:*:") || + header.equals("#*:*") || + header.equals(";")) { + // might have: * {{seeCites}} + // * [[w:Arabic numerals|Arabic numerals]]: 2 + //assert item.trim().length() == 0; + System.err.println("Skipping meaning: " + header + " " + item); + } else { + if (title.equals("Yellowknife")) { + return; + } + System.err.println("Busted heading: " + title + " "+ header + " " + item); + } + return; + } + // Part of speech + + // Translation + if (currentTranslationSense != null) { + if (item.indexOf("{{[trreq]{}}}") != -1) { + return; + } + + if (currentPartOfSpeech.translationSenses.isEmpty()) { + currentPartOfSpeech.translationSenses.add(currentTranslationSense); + } 
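// Sketch (hypothetical item) of how the colon split just below files a
// translation line: for langPatterns = { ^German$, ^Italian$ } and
// item = "Italian: [[libero]] {{m}}", lang is "Italian", the translation text
// is "[[libero]] {{m}}", and the entry lands in translations.get(1).
static String[] splitTranslationItem(final String item) {
  final int colonPos = item.indexOf(':');
  if (colonPos == -1) return null;  // malformed line; the caller logs and skips it
  return new String[] { item.substring(0, colonPos), item.substring(colonPos + 1).trim() };
}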
+ + final int colonPos = item.indexOf(':'); + if (colonPos == -1) { + System.err.println("Invalid translation: title=" + title + ", item=" + item); + return; + } + final String lang = item.substring(0, colonPos); + final String trans = item.substring(colonPos + 1).trim(); + for (int i = 0; i < 2; ++i) { + if (langPatterns[i].matcher(lang).find()) { + currentTranslationSense.translations.get(i).add(new Translation(lang, trans)); + } + } + } // Translation + } + + @Override + public void onNewLine() { + } + + @Override + public void onNewParagraph() { + } + + // ---------------------------------------------------------------------- + + @Override + public void onComment(String text) { + } + + @Override + public void onFormatBold(boolean boldOn) { + } + + @Override + public void onFormatItalic(boolean italicOn) { + } + + @Override + public void onUnterminated(String start, String rest) { + System.err.printf("OnUnterminated: %s %s %s\n", title, start, rest); + } + @Override + public void onInvalidHeaderEnd(String rest) { + throw new RuntimeException(rest); + } + +} diff --git a/src/com/hughes/android/dictionary/parser/WikiHeading.java b/src/com/hughes/android/dictionary/parser/WikiHeading.java index b8ca6f9..1b6aeee 100644 --- a/src/com/hughes/android/dictionary/parser/WikiHeading.java +++ b/src/com/hughes/android/dictionary/parser/WikiHeading.java @@ -3,10 +3,12 @@ package com.hughes.android.dictionary.parser; public class WikiHeading { public final int depth; public final String name; + public final String prefix; - public WikiHeading(int depth, String name) { + public WikiHeading(int depth, String name, String prefix) { this.depth = depth; this.name = name; + this.prefix = prefix; } public static WikiHeading getHeading(String line) { @@ -22,7 +24,7 @@ public class WikiHeading { System.err.println("Invalid heading: " + line); return null; } - return new WikiHeading(i, line.substring(i, line.length() - i).trim()); + return new WikiHeading(i, line.substring(i, line.length() - i).trim(), prefix); } } diff --git a/src/com/hughes/android/dictionary/parser/WikiTokenizer.java b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java new file mode 100644 index 0000000..d028acb --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/WikiTokenizer.java @@ -0,0 +1,206 @@ +package com.hughes.android.dictionary.parser; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public final class WikiTokenizer { + + //private static final Pattern wikiTokenEvent = Pattern.compile("($)", Pattern.MULTILINE); + private static final Pattern wikiTokenEvent = Pattern.compile("(\\{\\{|\\}\\}|\\[\\[|\\]\\]|", "\n"); + return this; + } + + if (wikiText.startsWith("}}", start) || wikiText.startsWith("]]", start)) { + System.err.println("Close without open!"); + end += 2; + return this; + } + + + if (this.matcher.find(start)) { + end = this.matcher.start(1); + if (end == start) { + System.err.println(this.matcher.group()); + assert false; + } + return this; + } + + end = wikiText.length(); + return this; + + } + + public String token() { + return wikiText.substring(start, end); + } + + private int escapedFind(final int start, final String toFind) { + assert tokenStack.isEmpty(); + + int end = start; + while (end < wikiText.length()) { + if (matcher.find(end)) { + final String matchText = matcher.group(); + final int matchStart = matcher.start(); + + if (matchText.length() == 0) { + assert matchStart == wikiText.length() || 
wikiText.charAt(matchStart) == '\n'; + if (tokenStack.isEmpty() && toFind.equals("\n")) { + return matchStart; + } + ++end; + } else if (tokenStack.isEmpty() && matchText.equals(toFind)) { + // The normal return.... + return matcher.end(); + } else if (matchText.equals("[[") || matchText.equals("{{")) { + tokenStack.add(matchText); + } else if (matchText.equals("]]") || matchText.equals("}}")) { + if (tokenStack.size() > 0) { + final String removed = tokenStack.remove(tokenStack.size() - 1); + if (removed.equals("{{") && !matcher.group().equals("}}")) { + System.err.println("Unmatched {{ error: " + wikiText.substring(start)); + return safeIndexOf(wikiText, start, "\n", "\n"); + } else if (removed.equals("[[") && !matcher.group().equals("]]")) { + System.err.println("Unmatched [[ error: " + wikiText.substring(start)); + return safeIndexOf(wikiText, start, "\n", "\n"); + } + } else { + System.err.println("Pop too many error: " + wikiText.substring(start).replaceAll("\n", "\\n")); + // If we were looking for a newline + return safeIndexOf(wikiText, start, "\n", "\n"); + } + } else if (matchText.equals(""); + if (end == -1) { + System.err.println("Unmatched '''pretty''' cool '''''over''''' there." + "\n" + + "hi " + "\n" + + "" + "\n" + + "asdf\n" + + "{{template_not_in_list}}" + "\n" + + "# {{template_in_list}}" + "\n" + + "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list + ": but this is a list!" + "\n" + + "*:* and so is this :::" + "\n" + + "here's [[some blah|some]] wikitext." + "\n" + + "here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" + + "blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" + + "== Header 2 ==" + "\n" + + "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" + + "{{mismatched]]" + "\n" + + "[[mismatched}}" + "\n" + + "{extraterminated}}" + "\n" + + "[extraterminated]]" + "\n" + + "=== {{header-template}} ===" + "\n"; + + final String[] expectedTokens = new String[] { + "Hi", + "\n", + "Hello thad you're ", + "", + " ", + "'''", + "pretty", + "'''", + " cool ", + "'''", + "''", + "over", + "'''", + "''", + " there.", + "\n", + "hi ", + "", + "\n", + "\n", + "asdf", + "\n", + "{{template_not_in_list}}", + "\n", + "# {{template_in_list}}", + "\n", + "[[wikitext]]", + ":", + "[[wikitext]]", + "\n", + ": but this is a list!", + "\n", + "*:* and so is this :::", + "\n", + "here's ", + "[[some blah|some]]", + " wikitext.", + "\n", + "here's a ", + "{{template|this has an = sign|blah=2|blah2=3|\nblah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}}", + " and some more text.", + "\n", + "== Header 2 ==", + "\n", + "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}", + "\n", + "{{mismatched]]\n", + "[[mismatched}}\n", + "{extraterminated", + "}}", + "\n", + "[extraterminated", + "]]", + "\n", + "=== {{header-template}} ===", + "\n", + }; + + final List actualTokens = new ArrayList(); + + final WikiTokenizer wikiTokenizer = new WikiTokenizer(wikiText); + WikiTokenizer token; + int i = 0; + while ((token = wikiTokenizer.nextToken()) != null) { + actualTokens.add(token.token()); + System.out.println("\"" + token.token().replace("\n", "\\n") + "\","); + assertEquals(expectedTokens[i++], token.token()); + } + assertEquals(Arrays.asList(expectedTokens), actualTokens); + } + + public void testWikiHeading() { + assertNull(WikiHeading.getHeading("")); + assertNull(WikiHeading.getHeading("=")); + assertNull(WikiHeading.getHeading("==")); + assertNull(WikiHeading.getHeading("=a")); + 
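// Sketch of the rule these assertions encode: a heading needs the same number
// of '=' on each end with non-empty text between them, and that count is its
// depth. This helper is illustrative only; WikiHeading.getHeading is the real parser.
static int headingDepth(final String line) {
  int i = 0;
  while (i < line.length() && line.charAt(i) == '=') ++i;
  if (i == 0 || i * 2 >= line.length()) return -1;           // no prefix, or no room for text
  if (!line.endsWith(line.substring(0, i))) return -1;       // too few trailing '='
  if (line.charAt(line.length() - i - 1) == '=') return -1;  // too many trailing '='
  return i;                                                  // "=a=" -> 1, "==aa==" -> 2
}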
assertNull(WikiHeading.getHeading("=a==")); + assertNull(WikiHeading.getHeading("===a==")); + assertNull(WikiHeading.getHeading("===a====")); + assertNull(WikiHeading.getHeading("a=")); + assertEquals("a", WikiHeading.getHeading("=a=").name); + assertEquals(1, WikiHeading.getHeading("=a=").depth); + assertEquals("aa", WikiHeading.getHeading("==aa==").name); + assertEquals(2, WikiHeading.getHeading("==aa==").depth); + } + + + public void testWikiFunction() { + assertNull(WikiFunction.getFunction("")); + assertNull(WikiFunction.getFunction("[[asdf]]")); + assertNull(WikiFunction.getFunction("asd [[asdf]]asdf ")); + assertEquals("a", WikiFunction.getFunction("{{a}}").name); + assertEquals("a", WikiFunction.getFunction("{{a|b}}").name); + assertEquals("a", WikiFunction.getFunction("a{{a|b}}a").name); + assertEquals("a[[a]]", WikiFunction.getFunction("a{{a[[a]]|b}}a").name); + assertEquals("a", WikiFunction.getFunction("a{{a|b[[abc|def]]|[[fgh|jkl]]|qwer}}a").name); + assertEquals(Arrays.asList("b[[abc|d=f]]", "qwer", "[[fgh|jkl]]", "qwer"), WikiFunction.getFunction("a{{a|b[[abc|d=f]]|qwer|[[fgh|jkl]]|qwer}}a").args); + assertEquals("[[abc|def]]", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("b")); + assertEquals("{{asdf}}", WikiFunction.getFunction("a{{a|b=[[abc|def]]|qwer|[[fgh|jkl]]|qwer={{asdf}}}}a").namedArgs.get("qwer")); + } + +} diff --git a/src/com/hughes/android/dictionary/parser/WikiWord.java.old b/src/com/hughes/android/dictionary/parser/WikiWord.java.old new file mode 100644 index 0000000..96f3321 --- /dev/null +++ b/src/com/hughes/android/dictionary/parser/WikiWord.java.old @@ -0,0 +1,339 @@ +package com.hughes.android.dictionary.parser; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import com.hughes.android.dictionary.engine.DictionaryBuilder; +import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.EntryTypeName; +import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.PairEntry; +import com.hughes.android.dictionary.engine.PairEntry.Pair; +import com.hughes.util.ListUtil; + +public class WikiWord { + final int depth; + + final String title; + String language; + + int index; + + final Map accentToPronunciation = new LinkedHashMap(); + StringBuilder currentPronunciation = null; + + final List partsOfSpeech = new ArrayList(); + + public WikiWord(final String title, int depth) { + this.title = title.intern(); + this.depth = depth; + } + + static class PartOfSpeech { + final int depth; + final String name; + + final List meanings = new ArrayList(); + + final List translationSenses = new ArrayList(); + + final List formOfs = new ArrayList(); + + public PartOfSpeech(final int depth, String name) { + this.depth = depth; + this.name = name.intern(); + } + + public Meaning newMeaning() { + final Meaning meaning = new Meaning(); + meanings.add(meaning); + return meaning; + } + + public Meaning lastMeaning() { + return meanings.isEmpty() ? 
newMeaning() : ListUtil.getLast(meanings); + } + } + + static class TranslationSense { + String sense; + List> translations = new ArrayList>(); + { + translations.add(new ArrayList()); + translations.add(new ArrayList()); + } + } + + static class Translation { + String language; + String text; + + public Translation(final String language, final String text) { + this.language = language; + this.text = text; + } + + @Override + public String toString() { + return language + ": " + text; + } + } + + static class FormOf { + final String grammarForm; + final String target; + + public FormOf(final String grammarForm, final String token) { + this.grammarForm = grammarForm; + this.target = token; + } + } + + static class Meaning { + String meaning; + final List examples = new ArrayList(); + + public Example newExample() { + final Example example = new Example(); + this.examples.add(example); + return example; + } + + public Example lastExample() { + return examples.isEmpty() ? newExample() : ListUtil.getLast(examples); + } + } + + static class Example { + String source; + final StringBuilder example = new StringBuilder(); + final StringBuilder exampleInEnglish = new StringBuilder(); + } + + // ------------------------------------------------------------------------- + + void wikiWordToQuickDic(final DictionaryBuilder dictBuilder, final int enIndexBuilder) { + //System.out.println("\n" + title + ", " + language + ", pron=" + accentToPronunciation); + if (partsOfSpeech.isEmpty() && title.indexOf(":") == -1 && !language.equals("Translingual")) { + System.err.println("Word with no POS: " + title); + } + for (final WikiWord.PartOfSpeech partOfSpeech : partsOfSpeech) { + partOfSpeechToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech); + } // PartOfSpeech + + // Pronunciation. 
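// Sketch of the pair text the pronunciation loop below builds, with hypothetical
// values: accent "UK" plus pronunciation "/ˈfɹiː/ (IPA)" becomes the one-sided
// row "UK: /ˈfɹiː/ (IPA)".
static String renderPronunciation(final String accent, final CharSequence pronunciation) {
  return (accent.length() > 0 ? accent + ": " : "") + pronunciation;
}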
+ if (index != -1) { + final PairEntry pronEntry = new PairEntry(); + for (final Map.Entry accentToPron : accentToPronunciation.entrySet()) { + String accent = accentToPron.getKey(); + if (accent.length() > 0) { + accent = accent + ": "; + } + pronEntry.pairs.add(new Pair(accent + accentToPron.getValue(), "", index != 0)); + } + if (pronEntry.pairs.size() > 0) { + final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pronEntry); + dictBuilder.dictionary.pairEntries.add(pronEntry); + final Set tokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR); + dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_PRONUNCIATION); + } + } + } + + + static final Pattern templateName = Pattern.compile("\\{[^,]*,"); + private void partOfSpeechToQuickDic(final DictionaryBuilder dictBuilder, + final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) { + //System.out.println(" pos: " + partOfSpeech.name); + + for (final WikiWord.Meaning meaning : partOfSpeech.meanings) { + //System.out.println(" meaning: " + meaning.meaning); + for (final WikiWord.Example example : meaning.examples) { + if (example.example.length() > 0) { + //System.out.println(" example: " + example.example); + } + if (example.exampleInEnglish.length() > 0) { + //System.out.println(" exampleInEnglish: " + example.exampleInEnglish); + } + } + } + + if (index != -1) { + final boolean formOfSwap = index != 0; + for (final FormOf formOf : partOfSpeech.formOfs) { + final Pair pair = new Pair(title + ": " + formOf.grammarForm + ": " + formOf.target, "", formOfSwap); + final PairEntry pairEntry = new PairEntry(); + pairEntry.pairs.add(pair); + final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry); + dictBuilder.dictionary.pairEntries.add(pairEntry); + + // File under title token. + final Set tokens = DictFileParser.tokenize(formOf.target, DictFileParser.NON_CHAR); + dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, tokens, EntryTypeName.WIKTIONARY_FORM_OF); + } + } + + + if (enIndexBuilder != -1 && index != -1 && enIndexBuilder != index) { + final String entryBase = title + " (" + partOfSpeech.name.toLowerCase() + ")"; + final boolean swap = enIndexBuilder == 1; + + // Meanings. 
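// Sketch of the em-dash split applied to examples further below: when an example
// line mixes both languages, the text after "—" is treated as the English side.
// Sample input is hypothetical: "Il libro è libero. — The book is free."
static String[] splitExampleOnDash(final String example) {
  final int dashIndex = example.indexOf("—");
  if (dashIndex == -1) return new String[] { example, "" };  // already monolingual
  return new String[] { example.substring(0, dashIndex).trim(),
                        example.substring(dashIndex + 1).trim() };
}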
+ for (final Meaning meaning : partOfSpeech.meanings) { + final PairEntry pairEntry = new PairEntry(); + final List pairs = pairEntry.pairs; + + final List> exampleTokens = new ArrayList>(); + exampleTokens.add(new LinkedHashSet()); + exampleTokens.add(new LinkedHashSet()); + + if (meaning.meaning != null && meaning.meaning.length() > 0) { + final Pair meaningPair = new Pair(meaning.meaning, entryBase, swap); + pairs.add(meaningPair); + } else { + System.err.println("Empty meaning: " + title + ", " + language + ", " + partOfSpeech.name); + } + + // Examples + for (final Example example : meaning.examples) { + final int dashIndex = example.example.indexOf("—"); + if (example.exampleInEnglish.length() == 0 && dashIndex != -1) { + System.out.println("Splitting example: title=" + title + ", "+ example.example); + example.exampleInEnglish.append(example.example.substring(dashIndex + 1).trim()); + example.example.delete(dashIndex, example.example.length()); + } + + if (example.example.length() > 0 && example.exampleInEnglish.length() > 0) { + final Pair pair = new Pair(example.exampleInEnglish.toString(), example.example.toString(), swap); + pairs.add(pair); + + for (int i = 0; i < 2; ++i) { + exampleTokens.get(i).addAll(DictFileParser.tokenize(pair.get(i), DictFileParser.NON_CHAR)); + } + } + } + + // Create EntryData with the PairEntry. + final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry); + dictBuilder.dictionary.pairEntries.add(pairEntry); + + // File under title token. + final Set titleTokens = DictFileParser.tokenize(title, DictFileParser.NON_CHAR); + dictBuilder.indexBuilders.get(index).addEntryWithTokens(entryData, titleTokens, titleTokens.size() == 1 ? EntryTypeName.WIKTIONARY_TITLE_ONE_WORD : EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD); + + // File under the meaning tokens (English): + if (meaning.meaning != null) { + // If the meaning contains any templates, strip out the template name + // so we don't index it. + final String meaningToIndex = templateName.matcher(meaning.meaning).replaceAll(""); + final Set meaningTokens = DictFileParser.tokenize(meaningToIndex, DictFileParser.NON_CHAR); + dictBuilder.indexBuilders.get(enIndexBuilder).addEntryWithTokens(entryData, meaningTokens, meaningTokens.size() == 1 ? EntryTypeName.WIKTIONARY_MEANING_ONE_WORD : EntryTypeName.WIKTIONARY_MEANING_MULTI_WORD); + } + + // File under other tokens that we saw. + for (int i = 0; i < 2; ++i) { + dictBuilder.indexBuilders.get(i).addEntryWithTokens(entryData, exampleTokens.get(i), EntryTypeName.WIKTIONARY_EXAMPLE_OTHER_WORDS); + } + + + } // Meanings. + + } + + translationSensesToQuickDic(dictBuilder, enIndexBuilder, partOfSpeech); + } + + + private void translationSensesToQuickDic(final DictionaryBuilder dictBuilder, + final int enIndexBuilder, final WikiWord.PartOfSpeech partOfSpeech) { + if (!partOfSpeech.translationSenses.isEmpty()) { + if (!language.equals("English")) { + System.err.println("Translation sections not in English."); + } + + final String englishBase = title + " (" + partOfSpeech.name.toLowerCase() + "%s)"; + + for (final TranslationSense translationSense : partOfSpeech.translationSenses) { + //System.out.println(" sense: " + translationSense.sense); + if (translationSense.sense == null) { + //System.err.println(" null sense: " + title); + } + String englishSense = String.format(englishBase, translationSense.sense != null ? 
(": " + translationSense.sense) : ""); + + final StringBuilder[] sideBuilders = new StringBuilder[2]; + final List>> sideTokens = new ArrayList>>(); + for (int i = 0; i < 2; ++i) { + sideBuilders[i] = new StringBuilder(); + sideTokens.add(new LinkedHashMap>()); + } + + if (enIndexBuilder != -1) { + sideBuilders[enIndexBuilder].append(englishSense); + addTokens(title, sideTokens.get(enIndexBuilder), EntryTypeName.WIKTIONARY_TITLE_ONE_WORD); + } + + // Get the entries from the translation section. + for (int i = 0; i < 2; ++i) { + //System.out.println(" lang: " + i); + for (final Translation translation : translationSense.translations.get(i)) { + //System.out.println(" translation: " + translation); + sideBuilders[i].append(sideBuilders[i].length() > 0 ? "\n" : ""); + if (translationSense.translations.get(i).size() > 1) { + sideBuilders[i].append(translation.language).append(": "); + } + sideBuilders[i].append(translation.text); + + // TODO: Don't index {m}, {f} + // TODO: Don't even show: (1), (1-2), etc. + addTokens(translation.text, sideTokens.get(i), EntryTypeName.WIKTIONARY_TRANSLATION_ONE_WORD); + } + } + + // Construct the Translations-based QuickDic entry for this TranslationSense. + if (sideBuilders[0].length() > 0 && sideBuilders[1].length() > 0) { + final Pair pair = new Pair(sideBuilders[0].toString(), sideBuilders[1].toString()); + final PairEntry pairEntry = new PairEntry(); + pairEntry.pairs.add(pair); + final IndexedEntry entryData = new IndexedEntry(dictBuilder.dictionary.pairEntries.size(), pairEntry); + dictBuilder.dictionary.pairEntries.add(pairEntry); + + // Add the EntryData to the indices under the correct tokens. + for (int i = 0; i < 2; ++i) { + final IndexBuilder indexBuilder = dictBuilder.indexBuilders.get(i); + for (final Map.Entry> entry : sideTokens.get(i).entrySet()) { + for (final String token : entry.getValue()) { + final List entries = indexBuilder.getOrCreateEntries(token, entry.getKey()); + entries.add(entryData); + } + } + + } + + } + } // Senses + } // Translations + } + + + static void addTokens(final String text, final Map> map, + EntryTypeName entryTypeName) { + final Set tokens = DictFileParser.tokenize(text, DictFileParser.NON_CHAR); + if (tokens.size() > 1 && entryTypeName == EntryTypeName.WIKTIONARY_TITLE_ONE_WORD) { + entryTypeName = EntryTypeName.WIKTIONARY_TITLE_MULTI_WORD; + } + List tokenList = map.get(entryTypeName); + if (tokenList == null) { + tokenList = new ArrayList(); + map.put(entryTypeName, tokenList); + } + tokenList.addAll(tokens); + } + + + +} -- 2.43.0