X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=f5c85cf0f4f1e8cb41d500892f2bf52f64781c07;hb=7819736ae570bf597936f0dc640f60644da15fc8;hp=39addd58db7321d186503147cc6ff8bfc2e40f82;hpb=5fab504f765ff1553c98096ba85b04ffc2ef1062;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 39addd5..f5c85cf 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -56,7 +56,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { public Selector(final String filename, final String pattern) throws IOException { this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))); - this.pattern = Pattern.compile(pattern); + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); } } @@ -79,27 +79,52 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (selectors.isEmpty()) { selectors.addAll(Arrays.asList( - new Selector("../DictionaryData/inputs/enWikiSplit/arabic.data", ".*[Ar]rabic.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/croation.data", ".*[Cc]roation.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/czech.data", ".*[Cc]zech.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/mandarin.data", ".*[Mm]andarin|[Cc]hinese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/dutch.data", ".*[Du]utch.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/english.data", ".*[Ee]nglish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/french.data", ".*[Ff]rench.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/german.data", ".*[Gg]erman.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/greek.data", ".*[Gg]reek.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/hindi.data", ".*[Hh]indi.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/italian.data", ".*[Ii]talian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/japanese.data", ".*[Jj]apanese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/korean.data", ".*[Kk]orean.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/persian.data", ".*[Pp]ersian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/portuguese.data", ".*[Pp]ortuguese.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/romanian.data", ".*[Rr]omanian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/russian.data", ".*[Rr]ussian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/spanish.data", ".*[Ss]panish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/swedish.data", ".*[Ss]wedish.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/thai.data", ".*[Tt]hai.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/vietnamese.data", ".*[Vv]ietnamese.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roatian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"), + new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Cc]hinese.*|.*[Mm]andarin.*|.*Cantonese.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/DA.data", ".*[Dd]anish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Dd]utch.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]innish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/haw.data", ".*[Hh]awaiian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ebrew.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/PL.data", ".*[Pp]olish.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]krainian.*") +// new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"), +// new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*") )); } @@ -114,9 +139,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE); + int pageCount = 0; private void endPage() { final String title = titleBuilder.toString(); - System.out.println("endPage: " + title); + if (++pageCount % 1000 == 0) { + System.out.println("endPage: " + title + ", count=" + pageCount); + } String text = textBuilder.toString();