X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=628d3567e7a1ad030a5e828599e8458088501db0;hb=1aa4de25c859304d21acfadd18cb546d1c21415b;hp=83cb043cbf456c03c66f36a26888c3759ff89dd3;hpb=a8052a74747df9244c098041dc82c745f64d51c6;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 83cb043..628d356 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -20,8 +20,8 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -32,8 +32,12 @@ import javax.xml.parsers.SAXParserFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; +import com.hughes.android.dictionary.parser.enwiktionary.EnWiktionaryLangs; + public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { + private static final String FILE_TO_SPLIT = "data/inputs/enwiktionary-pages-articles.xml"; + static class Section implements java.io.Serializable { private static final long serialVersionUID = -7676549898325856822L; @@ -56,7 +60,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { public Selector(final String filename, final String pattern) throws IOException { this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))); - this.pattern = Pattern.compile(pattern); + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); } } @@ -70,7 +74,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); // Configure things. - final File file = new File(args[0]); + final List selectors = wiktionarySplitter.selectors; for (int i = 1; i < args.length; i += 2) { final Selector selector = new Selector(args[i], args[i+1]); @@ -78,33 +82,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } if (selectors.isEmpty()) { - selectors.addAll(Arrays.asList( - new Selector("wikiSplit/arabic.data", ".*[Ar]rabic.*"), - new Selector("wikiSplit/croation.data", ".*[Cc]roation.*"), - new Selector("wikiSplit/czech.data", ".*[Cc]zech.*"), - new Selector("wikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"), - new Selector("wikiSplit/dutch.data", ".*[Du]utch.*"), - new Selector("wikiSplit/english.data", ".*[Ee]nglish.*"), - new Selector("wikiSplit/french.data", ".*[Ff]rench.*"), - new Selector("wikiSplit/german.data", ".*[Gg]erman.*"), - new Selector("wikiSplit/greek.data", ".*[Gg]reek.*"), - new Selector("wikiSplit/hindi.data", ".*[Hh]indi.*"), - new Selector("wikiSplit/italian.data", ".*[Ii]talian.*"), - new Selector("wikiSplit/japanese.data", ".*[Jj]apanese.*"), - new Selector("wikiSplit/korean.data", ".*[Kk]orean.*"), - new Selector("wikiSplit/persian.data", ".*[Pp]ersian.*"), - new Selector("wikiSplit/portuguese.data", ".*[Pp]ortuguese.*"), - new Selector("wikiSplit/romanian.data", ".*[Rr]omanian.*"), - new Selector("wikiSplit/russian.data", ".*[Rr]ussian.*"), - new Selector("wikiSplit/spanish.data", ".*[Ss]panish.*"), - new Selector("wikiSplit/swedish.data", ".*[Ss]wedish.*"), - new Selector("wikiSplit/thai.data", ".*[Tt]hai.*"), - new Selector("wikiSplit/vietnamese.data", ".*[Vv]ietnamese.*") - )); + for (final Map.Entry entry : EnWiktionaryLangs.isoCodeToWikiName.entrySet()) { + selectors.add(new Selector(String.format("data/inputs/enWikiSplit/%s.data", entry.getKey()), entry.getValue())); + } } // Do it. - parser.parse(file, wiktionarySplitter); + parser.parse(new File(FILE_TO_SPLIT), wiktionarySplitter); // Shutdown. for (final Selector selector : selectors) { @@ -114,9 +98,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE); + int pageCount = 0; private void endPage() { final String title = titleBuilder.toString(); - System.out.println("endPage: " + title); + if (++pageCount % 1000 == 0) { + System.out.println("endPage: " + title + ", count=" + pageCount); + } String text = textBuilder.toString();