X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FAbstractWiktionaryParser.java;h=81a676c8755909cd19badc6fe799aabd538609dd;hp=c66a0d78010ed32e325e731843af33026bb0c640;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hpb=f4144760af0445ac225ed3b5a2373723bf78e4d1 diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java index c66a0d7..81a676c 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java @@ -19,8 +19,9 @@ import java.io.DataInputStream; import java.io.EOFException; import java.io.File; import java.io.FileInputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; @@ -31,10 +32,11 @@ import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import com.hughes.android.dictionary.engine.EntrySource; import com.hughes.android.dictionary.engine.EntryTypeName; @@ -49,8 +51,10 @@ public abstract class AbstractWiktionaryParser implements Parser { static final Logger LOG = Logger.getLogger("WiktionaryParser"); - final SortedMap counters = new TreeMap(); - final Set pairsAdded = new LinkedHashSet(); + private static final Pattern SUPERSCRIPT = Pattern.compile("[0-9]*"); + + final SortedMap counters = new TreeMap<>(); + final Set pairsAdded = new LinkedHashSet<>(); public EntrySource entrySource; public String title; @@ -60,6 +64,34 @@ public abstract class AbstractWiktionaryParser implements Parser { abstract void removeUselessArgs(final Map namedArgs); + private static String replaceSuperscript(String in) { + Matcher matcher; + while ((matcher = SUPERSCRIPT.matcher(in)).find()) { + String replace = ""; + String orig = matcher.group(); + for (int i = 5; i < orig.length() - 6; i++) + { + char c = 0; + switch (orig.charAt(i)) { + case '0': c = '\u2070'; break; + case '1': c = '\u00b9'; break; + case '2': c = '\u00b2'; break; + case '3': c = '\u00b3'; break; + case '4': c = '\u2074'; break; + case '5': c = '\u2075'; break; + case '6': c = '\u2076'; break; + case '7': c = '\u2077'; break; + case '8': c = '\u2078'; break; + case '9': c = '\u2079'; break; + } + if (c == 0) throw new RuntimeException(); + replace += c; + } + in = matcher.replaceFirst(replace); + } + return in; + } + @Override public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { this.entrySource = entrySource; @@ -67,7 +99,7 @@ public abstract class AbstractWiktionaryParser implements Parser { File input = new File(file.getPath() + ".bz2"); if (!input.exists()) input = new File(file.getPath() + ".gz"); if (!input.exists()) input = new File(file.getPath() + ".xz"); - DataInputStream dis; + DataInputStream dis; if (!input.exists()) { // Fallback to uncompressed file dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); @@ -98,9 +130,9 @@ public abstract class AbstractWiktionaryParser implements Parser { final int bytesLength = dis.readInt(); final byte[] bytes = new byte[bytesLength]; dis.readFully(bytes); - final String text = new String(bytes, "UTF8"); + final String text = new String(bytes, StandardCharsets.UTF_8); - parseSection(heading, text); + parseSection(heading, replaceSuperscript(text)); ++pageCount; if (pageCount % 1000 == 0) { @@ -143,14 +175,14 @@ public abstract class AbstractWiktionaryParser implements Parser { StringBuilder builder; IndexedEntry indexedEntry; IndexBuilder indexBuilder; - final Map> functionCallbacks = new LinkedHashMap>(); + final Map> functionCallbacks = new LinkedHashMap<>(); boolean entryTypeNameSticks = false; EntryTypeName entryTypeName = null; - final Map langCodeToTCount = new LinkedHashMap(); + final Map langCodeToTCount = new LinkedHashMap<>(); - final NameAndArgs nameAndArgs = new NameAndArgs(); + final NameAndArgs nameAndArgs = new NameAndArgs<>(); public AppendAndIndexWikiCallback(final T parser) { this.parser = parser; @@ -271,17 +303,17 @@ public abstract class AbstractWiktionaryParser implements Parser { if (name != null) { appendAndIndexWikiCallback.dispatch(name, null); } - for (int i = 0; i < args.size(); ++i) { - if (args.get(i).length() > 0) { + for (String arg : args) { + if (arg.length() > 0) { appendAndIndexWikiCallback.builder.append("|"); - appendAndIndexWikiCallback.dispatch(args.get(i), null, null); + appendAndIndexWikiCallback.dispatch(arg, null, null); } } appendNamedArgs(namedArgs, appendAndIndexWikiCallback); return true; } } - static NameAndArgs NAME_AND_ARGS = new NameAndArgs(); + static NameAndArgs NAME_AND_ARGS = new NameAndArgs<>(); static void appendNamedArgs(final Map namedArgs, final AppendAndIndexWikiCallback appendAndIndexWikiCallback) {