import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+
import com.hughes.android.dictionary.engine.EntrySource;
import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
+import com.hughes.android.dictionary.engine.ReadAheadBuffer;
import com.hughes.android.dictionary.parser.Parser;
import com.hughes.android.dictionary.parser.WikiTokenizer;
import com.hughes.util.EnumUtil;
static final Logger LOG = Logger.getLogger("WiktionaryParser");
- final SortedMap<String, AtomicInteger> counters = new TreeMap<String, AtomicInteger>();
- final Set<String> pairsAdded = new LinkedHashSet<String>();
+ private static final Pattern SUPERSCRIPT = Pattern.compile("<sup>[0-9]*</sup>");
+
+ final SortedMap<String, AtomicInteger> counters = new TreeMap<>();
+ final Set<String> pairsAdded = new LinkedHashSet<>();
public EntrySource entrySource;
public String title;
abstract void removeUselessArgs(final Map<String, String> namedArgs);
+ /**
+  * Replaces every {@code <sup>digits</sup>} span matched by {@code SUPERSCRIPT}
+  * with the Unicode superscript form of its digits. A span with no digits
+  * ({@code <sup></sup>}) is simply removed.
+  *
+  * <p>Each iteration rewrites all matches in a single left-to-right pass over the
+  * text instead of rescanning from the start after every replacement. The outer
+  * loop is kept because removing an empty span can join the surrounding text into
+  * a new match; matches never overlap, so repeating until nothing matches gives
+  * the same final text as replacing the leftmost match one at a time.
+  *
+  * @param in the text to rewrite
+  * @return {@code in} itself if it contains no match, otherwise the rewritten text
+  */
+ private static String replaceSuperscript(String in) {
+     // Superscript forms of the digits 0-9, indexed by digit value.
+     final String superscripts = "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079";
+     final int openLength = "<sup>".length();
+     final int closeLength = "</sup>".length();
+     while (true) {
+         final Matcher matcher = SUPERSCRIPT.matcher(in);
+         if (!matcher.find()) {
+             return in;
+         }
+         final StringBuilder out = new StringBuilder(in.length());
+         int copiedUpTo = 0;
+         do {
+             // Copy the unmatched text that precedes this span verbatim.
+             out.append(in, copiedUpTo, matcher.start());
+             // Only the characters between the opening and closing tag are digits.
+             final int digitsEnd = matcher.end() - closeLength;
+             for (int i = matcher.start() + openLength; i < digitsEnd; i++) {
+                 final int digit = in.charAt(i) - '0';
+                 if (digit < 0 || digit > 9) {
+                     // The pattern only admits 0-9 here, so this signals a pattern/code mismatch.
+                     throw new RuntimeException("Unexpected character in superscript: " + in.charAt(i));
+                 }
+                 out.append(superscripts.charAt(digit));
+             }
+             copiedUpTo = matcher.end();
+         } while (matcher.find());
+         out.append(in, copiedUpTo, in.length());
+         in = out.toString();
+     }
+ }
+
@Override
public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
this.entrySource = entrySource;
int pageCount = 0;
- final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+ File input = new File(file.getPath() + ".bz2");
+ if (!input.exists()) input = new File(file.getPath() + ".gz");
+ if (!input.exists()) input = new File(file.getPath() + ".xz");
+ DataInputStream dis;
+ if (!input.exists()) {
+ // Fallback to uncompressed file
+ dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+ } else {
+ InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
+ try {
+ InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
+ in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
+ dis = new DataInputStream(in);
+ } catch (CompressorException e) {
+ throw new IOException(e);
+ }
+ }
try {
while (true) {
if (pageLimit >= 0 && pageCount >= pageLimit) {
final int bytesLength = dis.readInt();
final byte[] bytes = new byte[bytesLength];
dis.readFully(bytes);
- final String text = new String(bytes, "UTF8");
+ final String text = new String(bytes, StandardCharsets.UTF_8);
- parseSection(heading, text);
+ parseSection(heading, replaceSuperscript(text));
++pageCount;
if (pageCount % 1000 == 0) {
StringBuilder builder;
IndexedEntry indexedEntry;
IndexBuilder indexBuilder;
- final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<String, FunctionCallback<T>>();
+ final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<>();
boolean entryTypeNameSticks = false;
EntryTypeName entryTypeName = null;
- final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<String, AtomicInteger>();
+ final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<>();
- final NameAndArgs<T> nameAndArgs = new NameAndArgs<T>();
+ final NameAndArgs<T> nameAndArgs = new NameAndArgs<>();
public AppendAndIndexWikiCallback(final T parser) {
this.parser = parser;
if (name != null) {
appendAndIndexWikiCallback.dispatch(name, null);
}
- for (int i = 0; i < args.size(); ++i) {
- if (args.get(i).length() > 0) {
+ for (String arg : args) {
+ if (arg.length() > 0) {
appendAndIndexWikiCallback.builder.append("|");
- appendAndIndexWikiCallback.dispatch(args.get(i), null, null);
+ appendAndIndexWikiCallback.dispatch(arg, null, null);
}
}
appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
return true;
}
}
- static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<AbstractWiktionaryParser>();
+ static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<>();
static void appendNamedArgs(final Map<String, String> namedArgs,
final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {