public class DictionaryBuilderTest extends TestCase {
- public void testGermanCombined() throws Exception {
- final File result = new File("testdata/de-en.quickdic");
+ /**
+ * Builds a dictionary from testdata/enwiktionary_small.xml and compares its
+ * printed form with the golden file twice: first the file written by the
+ * builder's --print option, then a fresh print made after re-opening the
+ * dictionary file that the builder wrote.
+ *
+ * @throws Exception if building, reading or comparing fails.
+ */
+ public void testWiktionaryCombined() throws Exception {
+ final File result = new File("testdata/wiktionary.quickdic");
System.out.println("Writing to: " + result);
DictionaryBuilder.main(new String[] {
"--dictOut=" + result.getAbsolutePath(),
"--lang1=DE",
"--lang2=EN",
- "--dictInfo=@testdata/de-en_dictInfo.txt",
-
-// "--input1=testdata/de-en_chemnitz_100",
-// "--input1Name=dictcc",
-// "--input1Charset=UTF8",
-// "--input1Format=chemnitz",
-//
-// "--input2=testdata/de-en_dictcc_100",
-// "--input2Name=dictcc",
-// "--input2Charset=UTF8",
-// "--input2Format=dictcc",
+ "--dictInfo=SomeWikiData",
"--input3=testdata/enwiktionary_small.xml",
"--input3Name=enwiktionary",
"--input3Format=enwiktionary",
- "--input3TranslationPattern1=[Gg]erman",
- "--input3TranslationPattern2=[Ee]glish",
+ "--input3TranslationPattern1=German|Italian|Spanish|French|Japanese|Arabic|Mandarin",
+ "--input3TranslationPattern2=English",
"--input3EnIndex=2",
+ "--print=testdata/wiktionary.test",
+ });
+
+ // Check it once:
+ assertFilesEqual("testdata/wiktionary.golden", "testdata/wiktionary.test");
+
+
+ // Check it again: re-open the dictionary file just written and print it
+ // over the same test file. Both handles are closed even if printing fails.
+ final RandomAccessFile raf = new RandomAccessFile(result.getAbsolutePath(), "r");
+ try {
+ final Dictionary dict = new Dictionary(raf);
+ final PrintStream out = new PrintStream(new File("testdata/wiktionary.test"));
+ try {
+ dict.print(out);
+ } finally {
+ out.close();
+ }
+ } finally {
+ raf.close();
+ }
+
+ assertFilesEqual("testdata/wiktionary.golden", "testdata/wiktionary.test");
+ }
+
+
+ public void testGermanCombined() throws Exception { // Builds the DE-EN dictionary from two inputs and compares the printed output with the golden file.
+ if (1==1) throw new RuntimeException(); // NOTE(review): unconditional throw -- the test always fails here and nothing below runs; confirm whether this is a deliberate temporary disable.
+ final File result = new File("testdata/de-en.quickdic");
+ System.out.println("Writing to: " + result);
+ DictionaryBuilder.main(new String[] {
+ "--dictOut=" + result.getAbsolutePath(),
+ "--lang1=DE",
+ "--lang2=EN",
+ "--dictInfo=@testdata/de-en_dictInfo.txt",
+ // Input 1: chemnitz-format file. NOTE(review): its name is given as "dictcc", same as input 2 -- confirm this is intended.
+ "--input1=testdata/de-en_chemnitz_100",
+ "--input1Name=dictcc",
+ "--input1Charset=UTF8",
+ "--input1Format=chemnitz",
+ // Input 2: dictcc-format file.
+ "--input2=testdata/de-en_dictcc_100",
+ "--input2Name=dictcc",
+ "--input2Charset=UTF8",
+ "--input2Format=dictcc",
+ // The builder prints the finished dictionary to this file, which is compared below.
"--print=testdata/de-en.test",
});
assertFilesEqual("testdata/de-en.golden", "testdata/de-en.test");
}
-
+
+
void assertFilesEqual(final String expected, final String actual) throws IOException {
final String expectedString = FileUtil.readToString(new File(expected));
assertEquals(expectedString, actualString);
}
+
}
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import com.hughes.android.dictionary.engine.DictionaryBuilder;
import com.hughes.android.dictionary.engine.IndexBuilder;
+import com.hughes.android.dictionary.parser.WikiWord.TranslationSection;
public class EnWiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler implements WikiCallback {
+
+ static final Pattern partOfSpeechHeader = Pattern.compile( // Heading names that open a part-of-speech section; onHeadingEnd applies it with matches(), i.e. to the whole heading text.
+ "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
+ "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
+ "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
+ "Ligature|Idiom|Phrase|" +
+ // These heading names are @deprecated:
+ "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
+ "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
+ // Matches "[[", "]]" and runs of two or more apostrophes; onTemplate removes these from pronunciation arguments.
+ static final Pattern wikiMarkup = Pattern.compile("\\[\\[|\\]\\]|''+");
+
final DictionaryBuilder dict;
title = titleBuilder.toString();
currentDepth = 0;
words.clear();
+ currentHeading = null;
WikiParser.parse(textBuilder.toString(), this);
+
+ for (final WikiWord word : words) {
+ System.out.println("\n" + title + ", " + word.language + ", pron=" + word.accentToPronunciation);
+ if (word.partsOfSpeech.isEmpty() && title.indexOf(":") == -1) {
+ System.err.println("Word with no POS: " + title);
+ }
+ for (final WikiWord.PartOfSpeech partOfSpeech : word.partsOfSpeech) {
+ System.out.println(" pos: " + partOfSpeech.name);
+
+ for (final TranslationSection translationSection : partOfSpeech.translationSections) {
+ System.out.println(" sense: " + translationSection.sense);
+
+ }
+ }
+ }
}
+
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+ // ------------------------------------------------------------------------
+
/**
* Two things can happen:
*
*/
String title;
+ String currentHeading;
int currentDepth;
final List<WikiWord> words = new ArrayList<WikiWord>();
WikiWord currentWord;
StringBuilder wikiBuilder = null;
- // ------------------------------------------------------------------------
-
@Override
public void onWikiLink(String[] args) {
- if (wikiBuilder != null) {
- wikiBuilder.append(args[args.length - 1]);
+ if (wikiBuilder == null) { // No text is being collected at the moment, so the link is dropped.
+ return;
}
+ wikiBuilder.append(args[args.length - 1]); // Only the last element of args is kept.
}
+
+ // Templates whose arguments after the name are emitted as-is, comma-separated (ttbc: translations to be checked).
+ static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
+ "Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
+ "sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
+ "zh-tsp", "zh-zh-p"));
+ static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("")); // Templates that produce no output; currently only the empty name.
+ static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf")); // Emitted as the template name followed by ".".
@Override
- public void onTemplate(String[][] args) {
- final String name = args[0][1];
+ public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
+ final String name = positionalArgs.get(0);
+
+ // Pronunciation
+ if (name.equals("a")) {
+ // accent tag
+ currentWord.currentPronunciation = new StringBuilder();
+ currentWord.accentToPronunciation.put(positionalArgs.get(1), currentWord.currentPronunciation);
+ return;
+ }
+ if (name.equals("IPA") || name.equals("SAMPA") || name.equals("enPR") || name.equals("rhymes")) {
+ namedArgs.remove("lang");
+ assert positionalArgs.size() >= 2 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs.toString();
+ if (currentWord.currentPronunciation == null) {
+ currentWord.currentPronunciation = new StringBuilder();
+ currentWord.accentToPronunciation.put("", currentWord.currentPronunciation);
+ }
+ currentWord.currentPronunciation.append(name).append(": ");
+ for (int i = 1; i < positionalArgs.size(); ++i) {
+ if (i > 1) {
+ currentWord.currentPronunciation.append(", ");
+ }
+ final String pron = wikiMarkup.matcher(positionalArgs.get(1)).replaceAll("");
+ currentWord.currentPronunciation.append(pron).append("");
+ }
+ return;
+ }
+ if (name.equals("audio")) {
+ return;
+ }
+ if ("Pronunciation".equals(currentHeading)) {
+ System.err.println("Unhandled template: " + name);
+ }
+
+ // Translations
+ if (name.equals("trans-top")) {
+ assert positionalArgs.size() == 2 && namedArgs.isEmpty();
+ currentTranslationSection = new WikiWord.TranslationSection();
+ currentPartOfSpeech.translationSections.add(currentTranslationSection);
+ if (positionalArgs.size() > 1) {
+ currentTranslationSection.sense = positionalArgs.get(1);
+ }
+ return;
+ }
+
+ if (wikiBuilder == null) {
+ return;
+ }
if (name == "") {
-
+ } else if (name.equals("m") || name.equals("f") || name.equals("n") || name.equals("c")) {
+ wikiBuilder.append("{").append(name).append("}");
+ } else if (name.equals("p")) {
+ wikiBuilder.append("pl.");
+ } else if (name.equals("s")) {
+ wikiBuilder.append("sg.");
+ } else if (grammarTemplates.contains(name)) {
+ wikiBuilder.append(name).append(".");
+ } else if (name.equals("l")) {
+ wikiBuilder.append(positionalArgs.size() >= 4 ? positionalArgs.get(3) : positionalArgs.get(2));
+ } else if (name.equals("t") || name.equals("t+") || name.equals("t-") || name.equals("tø")) {
+ if (positionalArgs.size() >= 2) {
+ wikiBuilder.append(positionalArgs.get(1));
+ }
+ if (positionalArgs.size() >= 3) {
+ wikiBuilder.append(" {").append(positionalArgs.get(1)).append("}");
+ }
+ final String transliteration = namedArgs.remove("tr");
+ if (transliteration != null) {
+ wikiBuilder.append(" (").append(transliteration).append(")");
+ }
+ } else if (name.equals("trreq")) {
+ wikiBuilder.append("{{trreq}}");
+ } else if (name.equals("qualifier")) {
+ wikiBuilder.append(" (").append(positionalArgs.get(1)).append(")");
+ } else if (useRemainingArgTemplates.contains(name)) {
+ for (int i = 1; i < positionalArgs.size(); ++i) {
+ if (i != 1) {
+ wikiBuilder.append(", ");
+ }
+ wikiBuilder.append(positionalArgs.get(i));
+ }
+ } else if (ignoreTemplates.contains(name)) {
+ } else if (name.equals("initialism")) {
+ wikiBuilder.append("Initialism");
} else {
- //System.out.println("Unhandled template: " + name);
+ if (currentTranslationSection != null) {
+ System.err.println("Unhandled template: " + name);
+ }
}
}
}
}
- final Pattern partOfSpeechHeader = Pattern.compile(
- "Noun|Verb|Adjective|Adverb|Pronoun|Conjunction|Interjection|" +
- "Preposition|Proper noun|Article|Prepositional phrase|Acronym|" +
- "Abbreviation|Initialism|Contraction|Prefix|Suffix|Symbol|Letter|" +
- "Ligature|Idiom|Phrase|" +
- // These are @deprecated:
- "Noun form|Verb form|Adjective form|Nominal phrase|Noun phrase|" +
- "Verb phrase|Transitive verb|Intransitive verb|Reflexive verb");
-
@Override
public void onHeadingEnd(int depth) {
final String name = wikiBuilder.toString().trim();
wikiBuilder = null;
+ currentTranslationSection = null;
+ currentHeading = name;
final boolean lang1 = langPatterns[0].matcher(name).matches();
final boolean lang2 = langPatterns[1].matcher(name).matches();
}
if (partOfSpeechHeader.matcher(name).matches()) {
- currentPartOfSpeech = new WikiWord.PartOfSpeech(depth);
+ currentPartOfSpeech = new WikiWord.PartOfSpeech(depth, name);
currentWord.partsOfSpeech.add(currentPartOfSpeech);
return;
}
}
currentTranslationSection = new WikiWord.TranslationSection();
currentPartOfSpeech.translationSections.add(currentTranslationSection);
- } else {
- currentTranslationSection = null;
}
+
+ if (name.equals("Translations")) {
+ if (currentWord == null ||
+ !currentWord.language.equals("English") ||
+ currentPartOfSpeech == null) {
+ System.out.println("Unexpected Translations section: " + title);
+ return;
+ }
+ currentTranslationSection = new WikiWord.TranslationSection();
+ currentPartOfSpeech.translationSections.add(currentTranslationSection);
+ }
+
}
@Override
public void onListItemStart(String header, int[] section) {
wikiBuilder = new StringBuilder();
+ if (currentWord != null) { // Clear the pronunciation buffer so a pronunciation template in this item creates a fresh one.
+ currentWord.currentPronunciation = null;
+ }
}
final String item = wikiBuilder.toString();
wikiBuilder = null;
+ if (item.indexOf("{{trreq}}") != -1) {
+ return;
+ }
+
if (currentTranslationSection != null) {
final int colonPos = item.indexOf(':');
if (colonPos == -1) {
- System.out.println("Invalid translation: " + item);
+ System.err.println("Invalid translation: " + item);
return;
}
final String lang = item.substring(0, colonPos);
// ----------------------------------------------------------------------
- public void onTransTrop(final String[][] args) {
- currentTranslationSection = new WikiWord.TranslationSection();
- currentPartOfSpeech.translationSections.add(currentTranslationSection);
-
- if (args.length > 1) {
- currentTranslationSection.sense = args[1][1];
- }
- }
-
-
- // ----------------------------------------------------------------------
-
@Override
public void onComment(String text) {
}
package com.hughes.android.dictionary.parser;
+import java.util.List;
+import java.util.Map;
+
public interface WikiCallback {
void onWikiLink(final String[] args);
- void onTemplate(final String[][] args);
+ void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs);
// Will never contain a newline unless it's in a <pre>
void onText(final String text);
package com.hughes.android.dictionary.parser;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WikiParser {
- private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|^[*#;:]+|^(==+)\\s*|(==+)\\s*$|<!--|<pre>", Pattern.MULTILINE);
- private static final Pattern listStart = Pattern.compile("^[*#;:]");
+ private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|(==+)\\s*$|<!--|<pre>", Pattern.MULTILINE);
+ private static final Pattern listStart = Pattern.compile("^[*#;:]+");
private static final Pattern pipeSplit = Pattern.compile("\\s*\\|\\s*");
private static final Pattern whitespace = Pattern.compile("\\s+");
+ private static final Pattern headerStart = Pattern.compile("^==+");
+
static void parse(final String wikiText, final WikiCallback callback) {
boolean italicOn = false;
int insideHeaderDepth = -1;
String lastListItem = null;
-
+
+ final List<String> positionalArgs = new ArrayList<String>();
+ final Map<String, String> namedArgs = new LinkedHashMap<String, String>();
+
String rest = wikiText;
while (rest.length() > 0) {
final Matcher matcher = markup.matcher(rest);
rest = rest.substring(nextMarkupPos);
if (rest.startsWith("\n")) {
+ rest = rest.substring(1);
+
if (insideHeaderDepth != -1) {
throw new RuntimeException("barf");
}
if (lastListItem != null) {
callback.onListItemEnd(lastListItem, null);
}
- if (!listStart.matcher(rest.substring(1)).matches()) {
+
+ final Matcher headerMatcher = headerStart.matcher(rest);
+ if (headerMatcher.find()) {
+ insideHeaderDepth = headerMatcher.group().length();
+ callback.onHeadingStart(insideHeaderDepth);
+ rest = rest.substring(headerMatcher.group().length());
+ continue;
+ }
+
+ if (listStart.matcher(rest).find()) {
+ lastListItem = matcher.group();
+ callback.onListItemStart(lastListItem, null);
+ rest = rest.substring(lastListItem.length());
+ continue;
+ } else if (lastListItem != null) {
+ callback.onNewParagraph();
lastListItem = null;
}
- if (rest.startsWith("\n\n")) {
- // TODO(thadh): eat all the newlines.
+
+ if (rest.startsWith("\n")) {
callback.onNewParagraph();
- rest = rest.substring(2);
- } else {
- callback.onNewLine();
- rest = rest.substring(1);
+ continue;
}
+ callback.onNewLine();
} else if (rest.startsWith("'''")) {
boldOn = !boldOn;
callback.onFormatBold(boldOn);
}
final String template = rest.substring(2, end).trim();
final String[] templateArray = pipeSplit.split(template);
- final String[][] templateArgs = new String[templateArray.length][];
+ positionalArgs.clear();
+ namedArgs.clear();
for (int i = 0; i < templateArray.length; ++i) {
int equalPos = templateArray[i].indexOf('=');
if (equalPos == -1) {
- templateArgs[i] = new String[] { null, templateArray[i] };
+ positionalArgs.add(templateArray[i]);
} else {
- templateArgs[i] = new String[] { templateArray[i].substring(0, equalPos), templateArray[i].substring(equalPos + 1) };
+ namedArgs.put(templateArray[i].substring(0, equalPos), templateArray[i].substring(equalPos + 1));
}
}
- callback.onTemplate(templateArgs);
+ callback.onTemplate(positionalArgs, namedArgs);
rest = rest.substring(end + 2);
} else if (rest.startsWith("[[")) {
int end = rest.indexOf("]]");
} else if (rest.startsWith("=")) {
final String match = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
if (insideHeaderDepth == -1) {
- insideHeaderDepth = match.length();
- callback.onHeadingStart(insideHeaderDepth);
} else {
if (match.length() != insideHeaderDepth) {
callback.onInvalidHeaderEnd(rest);
insideHeaderDepth = -1;
}
rest = rest.substring(match.length());
- } else if (rest.startsWith("*") || rest.startsWith("#") || rest.startsWith(";") || rest.startsWith(":")) {
- lastListItem = matcher.group();
- callback.onListItemStart(lastListItem, null);
- rest = rest.substring(lastListItem.length());
} else if (rest.startsWith("<!--")) {
int end = rest.indexOf("-->");
if (end == -1) {
package com.hughes.android.dictionary.parser;
+import java.util.List;
+import java.util.Map;
+
import junit.framework.TestCase;
public class WikiParserTest extends TestCase {
"multi-line" + "\n" +
"# comment -->" + "\n" +
"" + "\n" +
+ "asdf\n" +
"# li" + "\n" +
"# li2" + "\n" +
"## li2.2" + "\n" +
"Hi again." + "\n" +
+ "[[wikitext]]:[[wikitext]]" + "\n" + // don't want this to trigger a list
"here's [[some blah|some]] wikitext." + "\n" +
"here's a {{template|blah=2|blah2=3|" + "\n" +
"blah3=3}} and some more text." + "\n" +
"== Header 2 ==" + "\n" +
+// "==== Header 4 ====" + "\n" +
+// "===== Header 5 =====" + "\n" +
"=== {{header-template}} ===" + "\n";
final String expected = "Hi Hello <i>thad</i> you're \n" +
"# comment \n" +
"\n" +
"\n" +
+ " asdf\n" +
"# li\n" +
- " # li2\n" +
- " ## li2.2\n" +
- " Hi again. here's [[some]] wikitext. here's a \n" +
- "template:template\n" +
- " and some more text. \n" +
+ "# li2\n" +
+ "## li2.2\n" +
+ "\n" +
+ " Hi again. [[wikitext]]:[[wikitext]] here's [[some]] wikitext. here's a \n" +
+ "template:[template]{blah=2, blah2=3, blah3=3}\n" +
+ " and some more text.\n" +
"HEADER Header 2 \n" +
- " \n" +
+ "\n" +
"HEADER \n" +
- "template:header-template\n" +
+ "template:[header-template]{}\n" +
" \n" +
" ";
final PrintWikiCallback callback = new PrintWikiCallback();
}
@Override
- public void onTemplate(String[][] args) {
- builder.append("\ntemplate:").append(args[0][0]).append("\n");
+ public void onTemplate(final List<String> positionalArgs, final Map<String,String> namedArgs) {
+ builder.append("\ntemplate:").append(positionalArgs).append(namedArgs).append("\n"); // Both collections are written via their toString(), e.g. [template]{blah=2, blah2=3, blah3=3}.
}
@Override
@Override
public void onListItemStart(String header, int[] section) {
- builder.append(header);
+ builder.append("\n").append(header); // Starts a new output line, then writes the header.
}
@Override
public void onListItemEnd(String header, int[] section) {
- builder.append("\n");
+ // No output here; onListItemStart writes a newline before each item's header.
}
@Override
final int depth;
String language;
- String pronunciation;
+
+ final Map<String, StringBuilder> accentToPronunciation = new LinkedHashMap<String, StringBuilder>();
+ StringBuilder currentPronunciation = null;
boolean isLang1;
boolean isLang2;
static class PartOfSpeech {
final int depth;
+ final String name; // Name passed to the constructor; the parser passes the heading text that matched partOfSpeechHeader.
final List<Meaning> meaning = new ArrayList<WikiWord.Meaning>();
final Map<String, String> otherSections = new LinkedHashMap<String, String>();
- public PartOfSpeech(final int depth) {
+ public PartOfSpeech(final int depth, String name) {
this.depth = depth;
+ this.name = name; // Stored unchanged.
}
}