From: Reimar Döffinger Date: Sat, 7 Oct 2017 19:48:29 +0000 (+0200) Subject: WiktionarySplitter: Support compressed inputs. X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=commitdiff_plain;h=ee1dbfb669462305a1c07e4d804a90af79f5d39f WiktionarySplitter: Support compressed inputs. Unfortunately bzip2 decompression is very slow (slower than the XML parsing in fact), so it might make sense to re-compress the downloaded files from bzip2 to xz. If the decompression could be done in a separate thread, xz compression would even provide a speedup if the files are on a slower (non-SSD) disk. --- diff --git a/WiktionarySplitter.sh b/WiktionarySplitter.sh index a2a9680..57b16cb 100755 --- a/WiktionarySplitter.sh +++ b/WiktionarySplitter.sh @@ -4,6 +4,7 @@ ICU4J=/usr/share/java/icu4j-49.1.jar test -r "$ICU4J" || ICU4J=/usr/share/icu4j-55/lib/icu4j.jar XERCES=/usr/share/java/xercesImpl.jar test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar +COMMONS_COMPRESS=/usr/share/java/commons-compress-1.13.jar JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java test -x "$JAVA" || JAVA=java -"$JAVA" -classpath src:../Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES" com.hughes.android.dictionary.engine.WiktionarySplitter "$@" +"$JAVA" -classpath src:../Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.WiktionarySplitter "$@" diff --git a/compile.sh b/compile.sh index 7a67e88..5129fc9 100755 --- a/compile.sh +++ b/compile.sh @@ -6,6 +6,7 @@ XERCES=/usr/share/java/xercesImpl.jar test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar COMMONS=/usr/share/java/commons-lang3.jar test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar +COMMONS_COMPRESS=/usr/share/java/commons-compress-1.13.jar if [ ! -x ../Dictionary ] ; then echo "You need to clone the Dictionary repository (including subprojects) into .." exit 1 @@ -26,4 +27,8 @@ if [ ! -r "$COMMONS" ] ; then echo "commons-lang needs to be installed" exit 1; fi -javac -g ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$XERCES:$COMMONS" +if [ ! -r "$COMMONS_COMPRESS" ] ; then + echo "commons-compress needs to be installed" + exit 1; +fi +javac -g ../Dictionary/Util/src/com/hughes/util/*.java ../Dictionary/Util/src/com/hughes/util/raf/*.java ../Dictionary/src/com/hughes/android/dictionary/DictionaryInfo.java ../Dictionary/src/com/hughes/android/dictionary/engine/*.java ../Dictionary/src/com/hughes/android/dictionary/C.java src/com/hughes/android/dictionary/*.java src/com/hughes/android/dictionary/*/*.java src/com/hughes/android/dictionary/*/*/*.java -classpath "$ICU4J:$JUNIT:$XERCES:$COMMONS:$COMMONS_COMPRESS" diff --git a/data/downloadInputs.sh b/data/downloadInputs.sh index b5640b7..0929f63 100755 --- a/data/downloadInputs.sh +++ b/data/downloadInputs.sh @@ -10,49 +10,42 @@ L=en echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" WIKI=${L}wiktionary-latest-pages-articles.xml curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2 -bunzip2 ${WIKI}.bz2 -mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2 echo "Downloading from: http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/" CHEMNITZ=de-en.txt curl -L --remote-name http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en-devel/${CHEMNITZ}.gz -gunzip ${CHEMNITZ}.gz -mv ${CHEMNITZ} inputs/de-en_chemnitz.txt +mv ${CHEMNITZ}.gz inputs/de-en_chemnitz.txt.gz L=fr echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" WIKI=${L}wiktionary-latest-pages-articles.xml curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2 -bunzip2 --force ${WIKI}.bz2 -mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2 L=it echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" WIKI=${L}wiktionary-latest-pages-articles.xml curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2 -bunzip2 ${WIKI}.bz2 -mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2 L=de echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" WIKI=${L}wiktionary-latest-pages-articles.xml curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2 -bunzip2 ${WIKI}.bz2 -mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2 L=es echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" WIKI=${L}wiktionary-latest-pages-articles.xml curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2 -bunzip2 ${WIKI}.bz2 -mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2 L=pt echo "Downloading from: http://dumps.wikimedia.org/${L}wiktionary/" WIKI=${L}wiktionary-latest-pages-articles.xml curl -L --remote-name http://dumps.wikimedia.org/${L}wiktionary/latest/${WIKI}.bz2 -bunzip2 ${WIKI}.bz2 -mv ${WIKI} inputs/${L}wiktionary-pages-articles.xml +mv ${WIKI}.bz2 inputs/${L}wiktionary-pages-articles.xml.bz2 echo "Done. Now run WiktionarySplitter to split apart enwiktionary." diff --git a/run.sh b/run.sh index 72b36f8..40edcf2 100755 --- a/run.sh +++ b/run.sh @@ -5,6 +5,7 @@ XERCES=/usr/share/java/xercesImpl.jar test -r "$XERCES" || XERCES=/usr/share/xerces-2/lib/xercesImpl.jar COMMONS=/usr/share/java/commons-lang3.jar test -r "$COMMONS" || COMMONS=/usr/share/commons-lang-3.3/lib/commons-lang.jar +COMMONS_COMPRESS=/usr/share/java/commons-compress-1.13.jar JAVA=/usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java test -x "$JAVA" || JAVA=java -"$JAVA" -Djava.util.logging.config.file="logging.properties" -Xmx4096m -classpath src:../Dictionary/Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS" com.hughes.android.dictionary.engine.DictionaryBuilder "$@" +"$JAVA" -Djava.util.logging.config.file="logging.properties" -Xmx4096m -classpath src:../Dictionary/Util/src/:../Dictionary/src/:"$ICU4J":"$XERCES":"$COMMONS":"$COMMONS_COMPRESS" com.hughes.android.dictionary.engine.DictionaryBuilder "$@" diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 6839904..290a58f 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -14,10 +14,13 @@ package com.hughes.android.dictionary.engine; +import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.InputStream; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedHashMap; @@ -30,6 +33,7 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import org.apache.xerces.jaxp.SAXParserFactoryImpl; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -83,7 +87,17 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { // Do it. try { - parser.parse(new File(pathToSelectorsEntry.getKey()), this); + File input = new File(pathToSelectorsEntry.getKey() + ".bz2"); + if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz"); + if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz"); + if (!input.exists()) { + // Fallback to uncompressed file + parser.parse(new File(pathToSelectorsEntry.getKey()), this); + } else { + InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); + InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + parser.parse(new BufferedInputStream(in), this); + } } catch (Exception e) { System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); throw e;