X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=361473e80ac3e0043b9191fb1098ab3a1298c42f;hb=2bd62e0aab9c5ce70506cbd1b5de7b21feee1cf4;hp=685b23867d008846dfcedf67611b45cb265c6fea;hpb=eeb5667c56b2074b7eeac531589c9f1bf55ba738;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 685b238..361473e 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -1,3 +1,17 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package com.hughes.android.dictionary.engine; import java.io.BufferedOutputStream; @@ -6,103 +20,143 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; +import org.apache.xerces.jaxp.SAXParserFactoryImpl; import org.xml.sax.Attributes; import org.xml.sax.SAXException; +import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; + public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { - - static class Section implements java.io.Serializable { - private static final long serialVersionUID = -7676549898325856822L; - final String title; - final String heading; - final String text; - - public Section(final String title, final String heading, final String text) { - this.title = title; - this.heading = heading; - this.text = text; - - //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); - } - } + // The matches the whole line, otherwise regexes don't work well on French: + // {{=uk=}} + static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + + final Map> pathToSelectors = new LinkedHashMap>(); + List currentSelectors = null; - static class Selector { - DataOutputStream out; - Pattern pattern; - - public Selector(final String filename, final String pattern) throws IOException { - this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))); - this.pattern = Pattern.compile(pattern); - } - } - - final List selectors = new ArrayList(); StringBuilder titleBuilder; StringBuilder textBuilder; StringBuilder currentBuilder = null; - public static void main(final String[] args) throws SAXException, IOException, ParserConfigurationException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + public static void main(final String[] args) throws Exception { final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); - - // Configure things. - final File file = new File(args[0]); - final List selectors = wiktionarySplitter.selectors; - for (int i = 1; i < args.length; i += 2) { - final Selector selector = new Selector(args[i], args[i+1]); - selectors.add(selector); + wiktionarySplitter.go(); + } + + private WiktionarySplitter() { + List selectors; + for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { + //if (code.equals("en") || code.equals("de") || code.equals("fr")) {continue;} + selectors = new ArrayList(); + pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); + for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { + final String dir = String.format("data/inputs/wikiSplit/%s", code); + new File(dir).mkdirs(); + selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); + } } + } - if (selectors.isEmpty()) { - selectors.addAll(Arrays.asList( - new Selector("wikiSplit/arabic.data", ".*[Ar]rabic.*"), - new Selector("wikiSplit/croation.data", ".*[Cc]roation.*"), - new Selector("wikiSplit/czech.data", ".*[Cc]zech.*"), - new Selector("wikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"), - new Selector("wikiSplit/dutch.data", ".*[Du]utch.*"), - new Selector("wikiSplit/english.data", ".*[Ee]nglish.*"), - new Selector("wikiSplit/french.data", ".*[Ff]rench.*"), - new Selector("wikiSplit/german.data", ".*[Gg]erman.*"), - new Selector("wikiSplit/greek.data", ".*[Gg]reek.*"), - new Selector("wikiSplit/hindi.data", ".*[Hh]indi.*"), - new Selector("wikiSplit/italian.data", ".*[Ii]talian.*"), - new Selector("wikiSplit/japanese.data", ".*[Jj]apanese.*"), - new Selector("wikiSplit/korean.data", ".*[Kk]orean.*"), - new Selector("wikiSplit/persian.data", ".*[Pp]ersian.*"), - new Selector("wikiSplit/portuguese.data", ".*[Pp]ortuguese.*"), - new Selector("wikiSplit/romanian.data", ".*[Rr]omanian.*"), - new Selector("wikiSplit/russian.data", ".*[Rr]ussian.*"), - new Selector("wikiSplit/spanish.data", ".*[Ss]panish.*"), - new Selector("wikiSplit/swedish.data", ".*[Ss]wedish.*"), - new Selector("wikiSplit/thai.data", ".*[Tt]hai.*"), - new Selector("wikiSplit/vietnamese.data", ".*[Vv]ietnamese.*") - )); - } - - // Do it. - parser.parse(file, wiktionarySplitter); - - // Shutdown. - for (final Selector selector : selectors) { - selector.out.close(); + private void go() throws Exception { + final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + + // Configure things. + for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { + + currentSelectors = pathToSelectorsEntry.getValue(); + + for (final Selector selector : currentSelectors) { + selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); + } + + // Do it. + try { + parser.parse(new File(pathToSelectorsEntry.getKey()), this); + } catch (Exception e) { + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); + throw e; + } + + // Shutdown. + for (final Selector selector : currentSelectors) { + selector.out.close(); + } + } } - static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE); - + String lastPageTitle = null; + int pageCount = 0; private void endPage() { final String title = titleBuilder.toString(); - System.out.println("endPage: " + title); + lastPageTitle = title; + if (++pageCount % 1000 == 0) { + System.out.println("endPage: " + title + ", count=" + pageCount); + } + if (title.startsWith("Wiktionary:") || + title.startsWith("Appendix:") || + title.startsWith("Help:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Glossary:") || + title.startsWith("Rhymes:") || + title.startsWith("Category:") || + title.startsWith("Wikisaurus:") || + title.startsWith("Unsupported titles/") || + title.startsWith("Transwiki:") || + title.startsWith("File:") || + title.startsWith("Thread:") || + title.startsWith("Template:") || + title.startsWith("Summary:") || + // DE + title.startsWith("Datei:") || + title.startsWith("Verzeichnis:") || + title.startsWith("Vorlage:") || + title.startsWith("Thesaurus:") || + title.startsWith("Kategorie:") || + title.startsWith("Hilfe:") || + // FR: + title.startsWith("Annexe:") || + title.startsWith("Catégori:") || + title.startsWith("Modèle:") || + title.startsWith("Thésaurus:") || + title.startsWith("Projet:") || + title.startsWith("Aide:") || + title.startsWith("Fichier:") || + title.startsWith("Wiktionnaire:") || + title.startsWith("Catégorie:") || + title.startsWith("Portail:") || + title.startsWith("utiliusateur:") || + title.startsWith("Kategorio:") || + // IT + title.startsWith("Wikizionario:") || + title.startsWith("Appendice:") || + title.startsWith("Categoria:") || + title.startsWith("Aiuto:") || + title.startsWith("Portail:") || + + // sentinel + false + ) { + return; + } + if (title.contains(":")) { + if (!title.startsWith("Sign gloss:")) { + System.err.println("title with colon: " + title); + } + } String text = textBuilder.toString(); @@ -115,12 +169,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { text = text.substring(startMatcher.end()); final String heading = startMatcher.group(); - for (final Selector selector : selectors) { + for (final Selector selector : currentSelectors) { if (selector.pattern.matcher(heading).find()) { // Find end. final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=]+=+", depth), Pattern.MULTILINE); + final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); final Matcher endMatcher = endPattern.matcher(text); final int end; @@ -150,6 +204,36 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } + // ----------------------------------------------------------------------- + + static class Section implements java.io.Serializable { + private static final long serialVersionUID = -7676549898325856822L; + + final String title; + final String heading; + final String text; + + public Section(final String title, final String heading, final String text) { + this.title = title; + this.heading = heading; + this.text = text; + + //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); + } + } + + static class Selector { + final String outFilename; + final Pattern pattern; + + DataOutputStream out; + + public Selector(final String filename, final String pattern) { + this.outFilename = filename; + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + } + } + // ----------------------------------------------------------------------- @Override @@ -184,10 +268,9 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } } - public void parse(final File file) throws ParserConfigurationException, SAXException, IOException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); parser.parse(file, this); }