X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=097643f1af7e3762cb1d76b16e0a44c4e0f3e240;hb=863fc804a6496c920a2b2045913c45f938bb646c;hp=6839904516abd6293c9cd6f6dcedc546ed39ecc7;hpb=e479ba38bbcb261951399326623c20ffacc147d4;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 6839904..097643f 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -14,11 +14,15 @@ package com.hughes.android.dictionary.engine; +import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.InputStream; import java.io.IOException; +import java.io.OutputStream; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -30,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import org.apache.xerces.jaxp.SAXParserFactoryImpl; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -78,12 +83,25 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { currentSelectors = pathToSelectorsEntry.getValue(); for (final Selector selector : currentSelectors) { - selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); + OutputStream tmp = new FileOutputStream(selector.outFilename); + tmp = new WriteBuffer(tmp, 20 * 1024 * 1024); + selector.out = new DataOutputStream(tmp); } // Do it. try { - parser.parse(new File(pathToSelectorsEntry.getKey()), this); + File input = new File(pathToSelectorsEntry.getKey() + ".bz2"); + if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz"); + if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz"); + if (!input.exists()) { + // Fallback to uncompressed file + parser.parse(new File(pathToSelectorsEntry.getKey()), this); + } else { + InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); + InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + in = new ReadAheadBuffer(in, 20 * 1024 * 1024); + parser.parse(new BufferedInputStream(in), this); + } } catch (Exception e) { System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); throw e; @@ -99,6 +117,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { String lastPageTitle = null; int pageCount = 0; + Pattern endPatterns[] = new Pattern[100]; + + private Pattern getEndPattern(int depth) { + if (endPatterns[depth] == null) + endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + return endPatterns[depth]; + } + private void endPage() { final String title = titleBuilder.toString(); lastPageTitle = title; @@ -185,7 +211,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { if (heading.indexOf("Translingual") != -1) { // Find end. final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + final Pattern endPattern = getEndPattern(depth); final Matcher endMatcher = endPattern.matcher(text); if (endMatcher.find()) { @@ -199,7 +225,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { // Find end. final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + final Pattern endPattern = getEndPattern(depth); final Matcher endMatcher = endPattern.matcher(text); final int end;