gitweb.fperrin.net Git - DictionaryPC.git/commitdiff
WiktionarySplitter: Support compressed inputs.
author Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Sat, 7 Oct 2017 19:48:29 +0000 (21:48 +0200)
committer Reimar Döffinger <Reimar.Doeffinger@gmx.de>
Sat, 7 Oct 2017 19:48:29 +0000 (21:48 +0200)
Unfortunately bzip2 decompression is very slow (slower
than the XML parsing in fact), so it might make sense to
re-compress the downloaded files from bzip2 to xz.
If the decompression could be done in a separate thread,
xz compression would even provide a speedup if the files
are on a slower (non-SSD) disk.

WiktionarySplitter.sh
compile.sh
data/downloadInputs.sh
run.sh
src/com/hughes/android/dictionary/engine/WiktionarySplitter.java

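diff --git a/WiktionarySplitter.sh b/WiktionarySplitter.sh
index a2a968042a5cdbda7cf7de88825b579ed436f8af..57b16cbbe95fb81056eabc58cc2251a0cfd93865 100755 (executable)
--- a/WiktionarySplitter.sh
+++ b/WiktionarySplitter.sh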
@@ -4,6 +4,7 @@ ICU4J=/usr/share/java/icu4j-49.1.jar
 test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
 XERCES=/usr/share/java/xercesImpl.jar
 test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
 test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar
 XERCES=/usr/share/java/xercesImpl.jar
 test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
+COMMONS_COMPRESS=/usr/share/java/commons-compress-1.13.jar
 JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
 test -x "$JAVA" || JAVA=java
 JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
 test -x "$JAVA" || JAVA=java
-"$JAVA" -classpath src:../Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES" com.hughes.android.dictionary.engine.WiktionarySplitter "$@"
+"$JAVA" -classpath src:../Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.WiktionarySplitter "$@"
diff --git a/compile.sh b/compile.sh
index 7a67e88bddb0088f1635e166c2b9c608cf46af73..5129fc957933edd5b265aee405ff0d93bd09c1b0 100755 (executable)
--- a/compile.sh
+++ b/compile.sh
@@ -6,6 +6,7 @@ XERCES=/usr/share/java/xercesImpl.jar
 test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
 COMMONS=/usr/share/java/commons-lang3.jar
 test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar
 test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
 COMMONS=/usr/share/java/commons-lang3.jar
 test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar
+COMMONS_COMPRESS=/usr/share/java/commons-compress-1.13.jar
 if [ ! -x ../Dictionary ] ; then
     echo "You need to clone the Dictionary repository (including subprojects) into .."
     exit 1
 if [ ! -x ../Dictionary ] ; then
     echo "You need to clone the Dictionary repository (including subprojects) into .."
     exit 1
@@ -26,4 +27,8 @@ if [ ! -r "$COMMONS" ] ; then
     echo "commons-lang needs to be installed"
     exit 1;
 fi
     echo "commons-lang needs to be installed"
     exit 1;
 fi
-javac -g ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$XERCES:$COMMONS"
+if [ ! -r "$COMMONS_COMPRESS" ] ; then
+    echo "commons-compress needs to be installed"
+    exit 1;
+fi
+javac -g ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$XERCES:$COMMONS:$COMMONS_COMPRESS"
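diff --git a/data/downloadInputs.sh b/data/downloadInputs.sh
index b5640b7778928fd1c1519fe42a65799f75fb89ad..0929f63687146349e7ae8a5511183e31eeba9acb 100755 (executable)
--- a/data/downloadInputs.sh
+++ b/data/downloadInputs.sh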
@@ -10,49 +10,42 @@ L=en
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-bunzip2 ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
 
 echo "Downloading from: http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/"
 CHEMNITZ=de-en.txt
 curl -L --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz
 
 echo "Downloading from: http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/"
 CHEMNITZ=de-en.txt
 curl -L --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz
-gunzip ${CHEMNITZ}.gz
-mv ${CHEMNITZ} inputs/de-en_chemnitz.txt
+mv ${CHEMNITZ}.gz inputs/de-en_chemnitz.txt.gz
 
 L=fr
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
 
 L=fr
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-bunzip2 --force ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
 
 L=it
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
 
 L=it
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-bunzip2 ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
 
 L=de
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
 
 L=de
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-bunzip2 ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
 
 L=es
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
 
 L=es
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-bunzip2 ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
 
 L=pt
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
 
 L=pt
 echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/"
 WIKI=${L}wiktionary-latest-pages-articles.xml
 curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2
-bunzip2 ${WIKI}.bz2
-mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml
+mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2
 
 echo "Done.  Now run WiktionarySplitter to split apart enwiktionary."
 
 
 echo "Done.  Now run WiktionarySplitter to split apart enwiktionary."
 
diff --git a/run.sh b/run.sh
index 72b36f85b36730b67de6319491e023f729569b66..40edcf26eb491e29f986e3394946d1d9b6525170 100755 (executable)
--- a/run.sh
+++ b/run.sh
@@ -5,6 +5,7 @@ XERCES=/usr/share/java/xercesImpl.jar
 test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
 COMMONS=/usr/share/java/commons-lang3.jar
 test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar
 test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar
 COMMONS=/usr/share/java/commons-lang3.jar
 test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar
+COMMONS_COMPRESS=/usr/share/java/commons-compress-1.13.jar
 JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
 test -x "$JAVA" || JAVA=java
 JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
 test -x "$JAVA" || JAVA=java
-"$JAVA" -Djava.util.logging.config.file="logging.properties" -Xmx4096m -classpath src:../Dictionary/Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS" com.hughes.android.dictionary.engine.DictionaryBuilder "$@"
+"$JAVA" -Djava.util.logging.config.file="logging.properties" -Xmx4096m -classpath src:../Dictionary/Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.DictionaryBuilder "$@"
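diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
index 6839904516abd6293c9cd6f6dcedc546ed39ecc7..290a58fccc1e38a6c36acdd44f2cb08cf42abb40 100644 (file)
--- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
+++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java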
 
 package com.hughes.android.dictionary.engine;
 
 
 package com.hughes.android.dictionary.engine;
 
+import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
 import java.io.DataOutputStream;
 import java.io.File;
 import java.io.BufferedOutputStream;
 import java.io.DataOutputStream;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.FileOutputStream;
+import java.io.InputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
@@ -30,6 +33,7 @@ import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 
 import org.apache.xerces.jaxp.SAXParserFactoryImpl;
 import javax.xml.parsers.SAXParser;
 
 import org.apache.xerces.jaxp.SAXParserFactoryImpl;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 
@@ -83,7 +87,17 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
             // Do it.
             try {
 
             // Do it.
             try {
-                parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+                File input = new File(pathToSelectorsEntry.getKey() + ".bz2");
+                if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz");
+                if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz");
+                if (!input.exists()) {
+                    // Fallback to uncompressed file
+                    parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+                } else {
+                    InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
+                    InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
+                    parser.parse(new BufferedInputStream(in), this);
+                }
             } catch (Exception e) {
                 System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
                 throw e;
             } catch (Exception e) {
                 System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
                 throw e;