X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FAbstractWiktionaryParser.java;h=3fe8089f3279eb5a7985533c3a4b92f2d87385c5;hb=21a62b8ba843ed718721c08956d3eff7049e636b;hp=b77c341649d5f63a0a11ef9a8f1fb1fa1d71fee1;hpb=e479ba38bbcb261951399326623c20ffacc147d4;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java index b77c341..3fe8089 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java @@ -19,6 +19,7 @@ import java.io.DataInputStream; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; +import java.io.InputStream; import java.io.IOException; import java.util.LinkedHashMap; import java.util.LinkedHashSet; @@ -30,12 +31,17 @@ import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.commons.compress.compressors.CompressorException; + import com.hughes.android.dictionary.engine.EntrySource; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.ReadAheadBuffer; import com.hughes.android.dictionary.parser.Parser; import com.hughes.android.dictionary.parser.WikiTokenizer; import com.hughes.util.EnumUtil; @@ -44,6 +50,8 @@ public abstract class AbstractWiktionaryParser implements Parser { static final Logger LOG = Logger.getLogger("WiktionaryParser"); + private static final Pattern SUPERSCRIPT = 
Pattern.compile("<sup>[0-9]*</sup>"); + final SortedMap counters = new TreeMap(); final Set pairsAdded = new LinkedHashSet(); @@ -55,11 +63,55 @@ public abstract class AbstractWiktionaryParser implements Parser { abstract void removeUselessArgs(final Map namedArgs); + private static String replaceSuperscript(String in) { + Matcher matcher; + while ((matcher = SUPERSCRIPT.matcher(in)).find()) { + String replace = ""; + String orig = matcher.group(); + for (int i = 5; i < orig.length() - 6; i++) + { + char c = 0; + switch (orig.charAt(i)) { + case '0': c = '\u2070'; break; + case '1': c = '\u00b9'; break; + case '2': c = '\u00b2'; break; + case '3': c = '\u00b3'; break; + case '4': c = '\u2074'; break; + case '5': c = '\u2075'; break; + case '6': c = '\u2076'; break; + case '7': c = '\u2077'; break; + case '8': c = '\u2078'; break; + case '9': c = '\u2079'; break; + } + if (c == 0) throw new RuntimeException(); + replace += c; + } + in = matcher.replaceFirst(replace); + } + return in; + } + @Override public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { this.entrySource = entrySource; int pageCount = 0; - final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); + File input = new File(file.getPath() + ".bz2"); + if (!input.exists()) input = new File(file.getPath() + ".gz"); + if (!input.exists()) input = new File(file.getPath() + ".xz"); + DataInputStream dis; + if (!input.exists()) { + // Fallback to uncompressed file + dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); + } else { + InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); + try { + InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + in = new ReadAheadBuffer(in, 20 * 1024 * 1024); + dis = new DataInputStream(in); + } catch (CompressorException e) { + throw new IOException(e); + } + } try { while (true) { if (pageLimit >= 0 && 
pageCount >= pageLimit) { @@ -79,7 +131,7 @@ public abstract class AbstractWiktionaryParser implements Parser { dis.readFully(bytes); final String text = new String(bytes, "UTF8"); - parseSection(heading, text); + parseSection(heading, replaceSuperscript(text)); ++pageCount; if (pageCount % 1000 == 0) {