gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
Compress WiktionarySplitter output files.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
index 6839904516abd6293c9cd6f6dcedc546ed39ecc7..0624025387b7f134242de98736ec4082e4915029 100644 (file)
 
 package com.hughes.android.dictionary.engine;
 
+import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
 import java.io.DataOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileOutputStream;
+import java.io.InputStream;
 import java.io.IOException;
+import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -30,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 
 import org.apache.xerces.jaxp.SAXParserFactoryImpl;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
@@ -78,12 +83,27 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             currentSelectors = pathToSelectorsEntry.getValue();
 
             for (final Selector selector : currentSelectors) {
-                selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
+                OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
+                tmp = new BufferedOutputStream(tmp);
+                tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
+                tmp = new WriteBuffer(tmp, 20 * 1024 * 1024);
+                selector.out = new DataOutputStream(tmp);
             }
 
             // Do it.
             try {
-                parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+                File input = new File(pathToSelectorsEntry.getKey() + ".bz2");
+                if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz");
+                if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz");
+                if (!input.exists()) {
+                    // Fallback to uncompressed file
+                    parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+                } else {
+                    InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
+                    InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
+                    in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
+                    parser.parse(new BufferedInputStream(in), this);
+                }
             } catch (Exception e) {
                 System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
                 throw e;
@@ -99,6 +119,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     String lastPageTitle = null;
     int pageCount = 0;
+    Pattern endPatterns[] = new Pattern[100]; // Cache of compiled end patterns, indexed by depth; slots are filled lazily by getEndPattern.
+
+    private Pattern getEndPattern(int depth) { // Returns the end pattern for the given depth, compiling it only on first request.
+        if (endPatterns[depth] == null) // Nothing cached for this depth yet. NOTE(review): depth >= 100 (or < 0) would index outside the array - confirm callers cannot pass that.
+            endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); // Matches a line starting with 1..depth '=' characters followed by a non-'=' character; MULTILINE makes ^ and $ apply per line.
+        return endPatterns[depth]; // Either the previously cached instance or the one compiled just above.
+    }
+
     private void endPage() {
         final String title = titleBuilder.toString();
         lastPageTitle = title;
@@ -185,7 +213,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 if (heading.indexOf("Translingual") != -1) {
                     // Find end.
                     final int depth = startMatcher.group(1).length();
-                    final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+                    final Pattern endPattern = getEndPattern(depth);
 
                     final Matcher endMatcher = endPattern.matcher(text);
                     if (endMatcher.find()) {
@@ -199,7 +227,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
                     // Find end.
                     final int depth = startMatcher.group(1).length();
-                    final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+                    final Pattern endPattern = getEndPattern(depth);
 
                     final Matcher endMatcher = endPattern.matcher(text);
                     final int end;