X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=361473e80ac3e0043b9191fb1098ab3a1298c42f;hb=2bd62e0aab9c5ce70506cbd1b5de7b21feee1cf4;hp=34cf2d7436d36ffc99b315f232d81cb283ac1ff5;hpb=d46f529d02bf4306a922c521d032f7620020b1e8;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 34cf2d7..361473e 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -56,7 +56,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { private WiktionarySplitter() { List selectors; for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { - //if (!code.equals("fr")) {continue;} + //if (code.equals("en") || code.equals("de") || code.equals("fr")) {continue;} selectors = new ArrayList(); pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { @@ -103,6 +103,60 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (++pageCount % 1000 == 0) { System.out.println("endPage: " + title + ", count=" + pageCount); } + if (title.startsWith("Wiktionary:") || + title.startsWith("Appendix:") || + title.startsWith("Help:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Glossary:") || + title.startsWith("Rhymes:") || + title.startsWith("Category:") || + title.startsWith("Wikisaurus:") || + title.startsWith("Unsupported titles/") || + title.startsWith("Transwiki:") || + title.startsWith("File:") || + title.startsWith("Thread:") || + title.startsWith("Template:") || + title.startsWith("Summary:") || + // DE + title.startsWith("Datei:") || + title.startsWith("Verzeichnis:") || + title.startsWith("Vorlage:") || + title.startsWith("Thesaurus:") || + title.startsWith("Kategorie:") || + title.startsWith("Hilfe:") || + // FR: + title.startsWith("Annexe:") || + title.startsWith("Catégori:") || + title.startsWith("Modèle:") || + title.startsWith("Thésaurus:") || + title.startsWith("Projet:") || + title.startsWith("Aide:") || + title.startsWith("Fichier:") || + title.startsWith("Wiktionnaire:") || + title.startsWith("Catégorie:") || + title.startsWith("Portail:") || + title.startsWith("utiliusateur:") || + title.startsWith("Kategorio:") || + // IT + title.startsWith("Wikizionario:") || + title.startsWith("Appendice:") || + title.startsWith("Categoria:") || + title.startsWith("Aiuto:") || + title.startsWith("Portail:") || + + // sentinel + false + ) { + return; + } + if (title.contains(":")) { + if (!title.startsWith("Sign gloss:")) { + System.err.println("title with colon: " + title); + } + } String text = textBuilder.toString(); @@ -214,13 +268,10 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } } - public void parse(final File file) throws ParserConfigurationException, SAXException, IOException { final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); parser.parse(file, this); } - - }