From: thadh Date: Mon, 10 Sep 2012 22:05:02 +0000 (-0700) Subject: Add some langs (Ancient Greek, Cantonese, Burmese(MY)), WholeSection X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=0a97e208339fec580ff115d3c12c1264f6b66a8a Add some langs (Ancient Greek, Cantonese, Burmese(MY)), WholeSection parser improvements, Splitter improvements. Builder uses WholeSection parser. --- diff --git a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java index 584752f..dfc4abb 100644 --- a/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java +++ b/src/com/hughes/android/dictionary/engine/DictionaryBuilderMain.java @@ -14,6 +14,13 @@ package com.hughes.android.dictionary.engine; +import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser; +import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser; +import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; + +import junit.framework.TestCase; + +import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedHashMap; @@ -22,12 +29,6 @@ import java.util.List; import java.util.Map; import java.util.Set; -import junit.framework.TestCase; - -import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser; -import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser; -import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; - public class DictionaryBuilderMain extends TestCase { static final String INPUTS = "data/inputs/"; @@ -67,6 +68,12 @@ public class DictionaryBuilderMain extends TestCase { return isoToStoplist.containsKey(iso) ? isoToStoplist.get(iso) : "empty.txt"; } + static String getOtherLang(final String[] pair, final String first) { + assert Arrays.asList(pair).contains(first); + assert pair.length == 2; + return pair[0].equals(first) ? pair[1] : pair[0]; + } + static List getMainArgs(final String[] pair) { final List result = new ArrayList(); @@ -80,12 +87,29 @@ public class DictionaryBuilderMain extends TestCase { result.add(String.format("--lang1Stoplist=%s", STOPLISTS + getStoplist(lang1))); result.add(String.format("--lang2Stoplist=%s", STOPLISTS + getStoplist(lang2))); - int i = 2; + int i = 1; + + // For a few langs, put the defs of the other language in DE/IT/FR using WholeSection. + for (final String wikitionaryLang : Arrays.asList("EN", "DE", "IT", "FR")) { + if (!Arrays.asList(pair).contains(wikitionaryLang)) { + continue; + } + final String foreignIso = getOtherLang(pair, wikitionaryLang); + final String wikiSplitFile = String.format("%s/wikiSplit/%s/%s.data", INPUTS, wikitionaryLang.toLowerCase(), foreignIso); + if (!new File(wikiSplitFile).canRead()) { + System.err.println("Can't read file: " + wikiSplitFile); + continue; + } + result.add(String.format("--input%d=%s", i, wikiSplitFile)); + result.add(String.format("--input%dName=%sWiktionary.WholeSections.%s", i, wikitionaryLang, foreignIso)); + result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME)); + result.add(String.format("--input%dTitleIndex=%d", i, Arrays.asList(pair).indexOf(foreignIso) + 1)); + ++i; + } // Deal with the pairs where one is English. if (Arrays.asList(pair).contains("EN")) { - final String foreignIso = pair[0].equals("EN") ? pair[1] : pair[0]; - + final String foreignIso = getOtherLang(pair, "EN"); String foreignRegex = WiktionaryLangs.isoCodeToEnWikiName.get(foreignIso); if (foreignIso.equals("ZH")) { @@ -93,20 +117,9 @@ public class DictionaryBuilderMain extends TestCase { foreignRegex = "Chinese|Mandarin|Cantones"; } - final int enIndex; - if (foreignIso.equals("DE")) { - // German-English is a special case since it was the first ever QuickDic! - result.add(String.format("--lang1=%s", "DE")); - result.add(String.format("--lang2=%s", "EN")); - result.add("--dictInfo=@" + INPUTS + "de-en_chemnitz_enwiktionary.info"); - - enIndex = 2; - } else { - result.add(String.format("--lang1=%s", "EN")); - result.add(String.format("--lang2=%s", foreignIso)); - result.add(String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.%s", foreignIso, getDedication(foreignIso))); - enIndex = 1; - } + result.add(String.format("--lang1=%s", "EN")); + result.add(String.format("--lang2=%s", foreignIso)); + result.add(String.format("--dictInfo=(EN)Wikitionary-based EN-%s dictionary.%s", foreignIso, getDedication(foreignIso))); result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso)); result.add(String.format("--input%dName=ENWiktionary.%s", i, foreignIso)) ; @@ -114,7 +127,7 @@ public class DictionaryBuilderMain extends TestCase { result.add(String.format("--input%dWiktionaryType=EnForeign", i)); result.add(String.format("--input%dLangPattern=%s", i, foreignRegex)); result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase())); - result.add(String.format("--input%dEnIndex=%d", i, enIndex)); + result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1)); ++i; result.add(String.format("--input%d=%swikiSplit/en/EN.data", i, INPUTS)); @@ -123,7 +136,7 @@ public class DictionaryBuilderMain extends TestCase { result.add(String.format("--input%dWiktionaryType=EnToTranslation", i)); result.add(String.format("--input%dLangPattern=%s", i, foreignRegex)); result.add(String.format("--input%dLangCodePattern=%s", i, foreignIso.toLowerCase())); - result.add(String.format("--input%dEnIndex=%d", i, enIndex)); + result.add(String.format("--input%dEnIndex=%d", i, Arrays.asList(pair).indexOf("EN") + 1)); ++i; if (foreignIso.equals("DE")) { @@ -134,12 +147,6 @@ public class DictionaryBuilderMain extends TestCase { ++i; } - result.add(String.format("--input%d=%s/wikiSplit/en/%s.data", i, INPUTS, foreignIso)); - result.add(String.format("--input%dName=%s", i, "ENWiktionary.WholeSections.%s", foreignIso)); - result.add(String.format("--input%dFormat=%s", i, WholeSectionToHtmlParser.NAME)); - result.add(String.format("--input%dTitleIndex=%d", i, 3 - enIndex)); - ++i; - } else { // Pairs without English. result.add(String.format("--lang1=%s", lang1)); @@ -152,6 +159,7 @@ public class DictionaryBuilderMain extends TestCase { result.add(String.format("--input%dLangPattern2=%s", i, lang2)); ++i; } + return result; } diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 34cf2d7..74850bf 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -56,7 +56,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { private WiktionarySplitter() { List selectors; for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { - //if (!code.equals("fr")) {continue;} + //if (code.equals("en") || code.equals("de") || code.equals("fr")) {continue;} selectors = new ArrayList(); pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { @@ -103,6 +103,63 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (++pageCount % 1000 == 0) { System.out.println("endPage: " + title + ", count=" + pageCount); } + if (title.startsWith("Wiktionary:") || + title.startsWith("Appendix:") || + title.startsWith("Help:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Glossary:") || + title.startsWith("Rhymes:") || + title.startsWith("Category:") || + title.startsWith("Wikisaurus:") || + title.startsWith("Unsupported titles/") || + title.startsWith("Transwiki:") || + title.startsWith("File:") || + title.startsWith("Thread:") || + title.startsWith("Template:") || + title.startsWith("Summary:") || + // DE + title.startsWith("Datei:") || + title.startsWith("Verzeichnis:") || + title.startsWith("Vorlage:") || + title.startsWith("Thesaurus:") || + title.startsWith("Kategorie:") || + title.startsWith("Hilfe:") || + // FR: + title.startsWith("Annexe:") || + title.startsWith("Catégori:") || + title.startsWith("Modèle:") || + title.startsWith("Thésaurus:") || + title.startsWith("Projet:") || + title.startsWith("Aide:") || + title.startsWith("Fichier:") || + title.startsWith("Wiktionnaire:") || + title.startsWith("Catégorie:") || + title.startsWith("Portail:") || + title.startsWith("utiliusateur:") || + title.startsWith("Kategorio:") || + + + + // IT + title.startsWith("Wikizionario:") || + title.startsWith("Appendice:") || + title.startsWith("Categoria:") || + title.startsWith("Aiuto:") || + title.startsWith("Portail:") || + + // sentinel + false + ) { + return; + } + if (title.contains(":")) { + if (!title.startsWith("Sign gloss:")) { + System.err.println("title with colon: " + title); + } + } String text = textBuilder.toString(); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index 87950c6..fd3f44f 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -26,7 +26,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override void parseSection(String heading, String text) { - HtmlEntry htmlEntry = new HtmlEntry(entrySource, title); + HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title)); IndexedEntry indexedEntry = new IndexedEntry(htmlEntry); final AppendAndIndexWikiCallback callback = new AppendCallback(this); diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java index 97bfce0..65f1442 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WiktionaryLangs.java @@ -30,10 +30,12 @@ public class WiktionaryLangs { isoCodeToEnWikiName.put("BN", "Bengali"); isoCodeToEnWikiName.put("BS", "Bosnian"); isoCodeToEnWikiName.put("BG", "Bulgarian"); + isoCodeToEnWikiName.put("MY", "Burmese"); + isoCodeToEnWikiName.put("yue", "Cantonese"); isoCodeToEnWikiName.put("CA", "Catalan"); isoCodeToEnWikiName.put("HR", "Croatian"); isoCodeToEnWikiName.put("CS", "Czech"); - isoCodeToEnWikiName.put("ZH", "Chinese|Mandarin|Cantonese"); + isoCodeToEnWikiName.put("ZH", "Chinese|Mandarin"); isoCodeToEnWikiName.put("DA", "Danish"); isoCodeToEnWikiName.put("NL", "Dutch"); isoCodeToEnWikiName.put("EN", "English"); @@ -43,6 +45,7 @@ public class WiktionaryLangs { isoCodeToEnWikiName.put("FR", "French"); isoCodeToEnWikiName.put("DE", "German"); isoCodeToEnWikiName.put("EL", "Greek"); + isoCodeToEnWikiName.put("grc", "Ancient Greek"); isoCodeToEnWikiName.put("haw", "Hawaiian"); isoCodeToEnWikiName.put("HE", "Hebrew"); isoCodeToEnWikiName.put("HI", "Hindi"); diff --git a/todo.txt b/todo.txt index 3587e21..b340add 100644 --- a/todo.txt +++ b/todo.txt @@ -1,11 +1,11 @@ -* HtmlEntries - - Add them to the dictionary's list. +HtmlEntry + - Add links into the HtmlEntry based on wikilinks. + - Build single EN/DE/IT/FR dictionaries based on HtmlEntry. + - "See also" link entries for cross-referencing ("form of"--strong, links to token, "mentioned in"--weaker, links to HtmlEntry). - Link to them from the appropriate places: IndexEntry (first), and individual rows (tricker, built at different times). make sure word is sticky when you change dictionaries. -get rid of Appendix:.... sections from EN.data in split. - - on small device it would be great to be able to hide the system status bar and the title bar - an history list of the searched words per dictionary with the possibility of having a rudimentary flash card game from it to memorise new words - space between clear text button and language button is to big (my screen is 320x240, Galaxy Mini) @@ -103,4 +103,6 @@ better tokenization? publish 2.0 dictionary test email dict manager +get rid of Appendix:.... sections from EN.data in split. + \ No newline at end of file