package com.hughes.android.dictionary.engine;
+import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.InputStream;
import java.io.IOException;
+import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import javax.xml.parsers.SAXParser;
import org.apache.xerces.jaxp.SAXParserFactoryImpl;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
currentSelectors = pathToSelectorsEntry.getValue();
for (final Selector selector : currentSelectors) {
- selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
+ OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
+ tmp = new BufferedOutputStream(tmp);
+ tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
+ tmp = new WriteBuffer(tmp, 20 * 1024 * 1024);
+ selector.out = new DataOutputStream(tmp);
}
// Do it.
try {
- parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+ File input = new File(pathToSelectorsEntry.getKey() + ".bz2");
+ if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz");
+ if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz");
+ if (!input.exists()) {
+ // Fall back to the uncompressed file.
+ parser.parse(new File(pathToSelectorsEntry.getKey()), this);
+ } else {
+ InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
+ InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
+ in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
+ parser.parse(new BufferedInputStream(in), this);
+ }
} catch (Exception e) {
System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
throw e;
String lastPageTitle = null;
int pageCount = 0;
+ Pattern endPatterns[] = new Pattern[100];
+
+ private Pattern getEndPattern(int depth) {
+ if (endPatterns[depth] == null)
+ endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+ return endPatterns[depth];
+ }
+
private void endPage() {
final String title = titleBuilder.toString();
lastPageTitle = title;
if (heading.indexOf("Translingual") != -1) {
// Find end.
final int depth = startMatcher.group(1).length();
- final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+ final Pattern endPattern = getEndPattern(depth);
final Matcher endMatcher = endPattern.matcher(text);
if (endMatcher.find()) {
// Find end.
final int depth = startMatcher.group(1).length();
- final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+ final Pattern endPattern = getEndPattern(depth);
final Matcher endMatcher = endPattern.matcher(text);
final int end;