X-Git-Url: http://gitweb.fperrin.net/?p=DictionaryPC.git;a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FAbstractWiktionaryParser.java;h=81a676c8755909cd19badc6fe799aabd538609dd;hp=c3bd2ffb1757377baf1b78d1cbe1643086afde9b;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hpb=cb8c8722c1d928396d118cf420505bda8776b308 diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java index c3bd2ff..81a676c 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java @@ -20,6 +20,8 @@ import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; @@ -30,242 +32,305 @@ import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorStreamFactory; + import com.hughes.android.dictionary.engine.EntrySource; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexedEntry; +import com.hughes.android.dictionary.engine.ReadAheadBuffer; import com.hughes.android.dictionary.parser.Parser; import com.hughes.android.dictionary.parser.WikiTokenizer; import com.hughes.util.EnumUtil; public abstract class AbstractWiktionaryParser implements Parser { - static final Logger LOG = Logger.getLogger("WiktionaryParser"); - - final SortedMap counters = new TreeMap(); - final Set pairsAdded = new LinkedHashSet(); - - EntrySource entrySource; - String title; - - - abstract void parseSection(final String heading, final String text); - - @Override - public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { - this.entrySource = entrySource; - int pageCount = 0; - final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); - try { - while (true) { - if (pageLimit >= 0 && pageCount >= pageLimit) { - return; - } - - try { - title = dis.readUTF(); - } catch (EOFException e) { - LOG.log(Level.INFO, "EOF reading split."); - dis.close(); - return; - } - final String heading = dis.readUTF(); - final int bytesLength = dis.readInt(); - final byte[] bytes = new byte[bytesLength]; - dis.readFully(bytes); - final String text = new String(bytes, "UTF8"); - - parseSection(heading, text); - - ++pageCount; - if (pageCount % 1000 == 0) { - LOG.info("pageCount=" + pageCount); - } - } - } finally { - LOG.info("***COUNTERS***"); - for (final Map.Entry entry : counters.entrySet()) { - LOG.info(entry.getKey() + ": " + entry.getValue()); - } - } - } - - static final Pattern whitespace = Pattern.compile("\\s+"); - static String trim(final String s) { - return whitespace.matcher(s).replaceAll(" ").trim(); - } - - public void incrementCount(final String string) { - AtomicInteger counter = counters.get(string); - if (counter != null) { - counter = new AtomicInteger(); - counters.put(string, counter); - } - counter.incrementAndGet(); - } - - - // ------------------------------------------------------------------------- - - static final class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { - - final T parser; - StringBuilder builder; - IndexedEntry indexedEntry; - IndexBuilder indexBuilder; - final Map> functionCallbacks = new LinkedHashMap>(); - - boolean entryTypeNameSticks = false; - EntryTypeName entryTypeName = null; - - final Map langCodeToTCount = new LinkedHashMap(); - - final NameAndArgs nameAndArgs = new NameAndArgs(); - - public AppendAndIndexWikiCallback(final T parser) { - this.parser = parser; - } - - public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) { - this.builder = builder; - this.indexedEntry = indexedEntry; - this.indexBuilder = null; - entryTypeName = null; - entryTypeNameSticks = false; - } - - public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) { - final IndexBuilder oldIndexBuilder = this.indexBuilder; - final EntryTypeName oldEntryTypeName = this.entryTypeName; - this.indexBuilder = indexBuilder; - if (!entryTypeNameSticks) { - this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName); - } - if (entryTypeName == null) this.entryTypeName = null; - WikiTokenizer.dispatch(wikiText, false, this); - this.indexBuilder = oldIndexBuilder; - this.entryTypeName = oldEntryTypeName; - } - - public void dispatch(final String wikiText, final EntryTypeName entryTypeName) { - dispatch(wikiText, this.indexBuilder, entryTypeName); - } + static final Logger LOG = Logger.getLogger("WiktionaryParser"); - @Override - public void onPlainText(final String plainText) { - // The only non-recursive callback. Just appends to the builder, and indexes. - builder.append(plainText); - if (indexBuilder != null && entryTypeName != null && indexedEntry != null) { - indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName); - } - } + private static final Pattern SUPERSCRIPT = Pattern.compile("[0-9]*"); - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - final String text = wikiTokenizer.wikiLinkText(); - @SuppressWarnings("unused") - final String link = wikiTokenizer.wikiLinkDest(); - dispatch(text, entryTypeName); - } + final SortedMap counters = new TreeMap<>(); + final Set pairsAdded = new LinkedHashSet<>(); - @Override - public void onFunction( - final WikiTokenizer wikiTokenizer, - final String name, - final List args, - final Map namedArgs) { - - FunctionCallback functionCallback = functionCallbacks.get(name); - if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) { - // Default function handling: -// namedArgs.keySet().removeAll(EnWiktionaryXmlParser.USELESS_WIKI_ARGS); - final boolean single = args.isEmpty() && namedArgs.isEmpty(); - builder.append(single ? "{" : "{{"); - - final IndexBuilder oldIndexBuilder = indexBuilder; - indexBuilder = null; - nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this); - indexBuilder = oldIndexBuilder; - - builder.append(single ? "}" : "}}"); - } - } + public EntrySource entrySource; + public String title; - @Override - public void onHtml(WikiTokenizer wikiTokenizer) { - // Unindexed for now. - builder.append(wikiTokenizer.token()); + + abstract void parseSection(final String heading, final String text); + + abstract void removeUselessArgs(final Map namedArgs); + + private static String replaceSuperscript(String in) { + Matcher matcher; + while ((matcher = SUPERSCRIPT.matcher(in)).find()) { + String replace = ""; + String orig = matcher.group(); + for (int i = 5; i < orig.length() - 6; i++) + { + char c = 0; + switch (orig.charAt(i)) { + case '0': c = '\u2070'; break; + case '1': c = '\u00b9'; break; + case '2': c = '\u00b2'; break; + case '3': c = '\u00b3'; break; + case '4': c = '\u2074'; break; + case '5': c = '\u2075'; break; + case '6': c = '\u2076'; break; + case '7': c = '\u2077'; break; + case '8': c = '\u2078'; break; + case '9': c = '\u2079'; break; + } + if (c == 0) throw new RuntimeException(); + replace += c; + } + in = matcher.replaceFirst(replace); + } + return in; } @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - // Do nothing. + public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException { + this.entrySource = entrySource; + int pageCount = 0; + File input = new File(file.getPath() + ".bz2"); + if (!input.exists()) input = new File(file.getPath() + ".gz"); + if (!input.exists()) input = new File(file.getPath() + ".xz"); + DataInputStream dis; + if (!input.exists()) { + // Fallback to uncompressed file + dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); + } else { + InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); + try { + InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + in = new ReadAheadBuffer(in, 20 * 1024 * 1024); + dis = new DataInputStream(in); + } catch (CompressorException e) { + throw new IOException(e); + } + } + try { + while (true) { + if (pageLimit >= 0 && pageCount >= pageLimit) { + return; + } + + try { + title = dis.readUTF(); + } catch (EOFException e) { + LOG.log(Level.INFO, "EOF reading split."); + dis.close(); + return; + } + final String heading = dis.readUTF(); + final int bytesLength = dis.readInt(); + final byte[] bytes = new byte[bytesLength]; + dis.readFully(bytes); + final String text = new String(bytes, StandardCharsets.UTF_8); + + parseSection(heading, replaceSuperscript(text)); + + ++pageCount; + if (pageCount % 1000 == 0) { + LOG.info("pageCount=" + pageCount); + } + } + } finally { + dis.close(); + LOG.info("***COUNTERS***"); + for (final Map.Entry entry : counters.entrySet()) { + LOG.info(entry.getKey() + ": " + entry.getValue()); + } + } } - @Override - public final void onComment(WikiTokenizer wikiTokenizer) { - // Do nothing. + static final Pattern whitespace = Pattern.compile("\\s+"); + static String trim(final String s) { + return whitespace.matcher(s).replaceAll(" ").trim(); } - @Override - public final void onNewline(WikiTokenizer wikiTokenizer) { - assert false; + public void incrementCount(final String string) { + AtomicInteger counter = counters.get(string); + if (counter == null) { + counter = new AtomicInteger(); + counters.put(string, counter); + } + counter.incrementAndGet(); } - @Override - public final void onHeading(WikiTokenizer wikiTokenizer) { - assert false; + public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) { + assert false : token + ", title=" + title; } - @Override - public final void onListItem(WikiTokenizer wikiTokenizer) { - assert false; + + // ------------------------------------------------------------------------- + + static class AppendAndIndexWikiCallback implements WikiTokenizer.Callback { + + final T parser; + StringBuilder builder; + IndexedEntry indexedEntry; + IndexBuilder indexBuilder; + final Map> functionCallbacks = new LinkedHashMap<>(); + + boolean entryTypeNameSticks = false; + EntryTypeName entryTypeName = null; + + final Map langCodeToTCount = new LinkedHashMap<>(); + + final NameAndArgs nameAndArgs = new NameAndArgs<>(); + + public AppendAndIndexWikiCallback(final T parser) { + this.parser = parser; + } + + public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) { + this.builder = builder; + this.indexedEntry = indexedEntry; + this.indexBuilder = null; + entryTypeName = null; + entryTypeNameSticks = false; + } + + public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) { + final IndexBuilder oldIndexBuilder = this.indexBuilder; + final EntryTypeName oldEntryTypeName = this.entryTypeName; + this.indexBuilder = indexBuilder; + if (!entryTypeNameSticks) { + this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName); + } + if (entryTypeName == null) this.entryTypeName = null; + WikiTokenizer.dispatch(wikiText, false, this); + this.indexBuilder = oldIndexBuilder; + this.entryTypeName = oldEntryTypeName; + } + + public String dispatch(final String wikiText, final EntryTypeName entryTypeName) { + final int start = builder.length(); + dispatch(wikiText, this.indexBuilder, entryTypeName); + return builder.substring(start); + } + + @Override + public void onPlainText(final String plainText) { + // The only non-recursive callback. Just appends to the builder, and indexes. + builder.append(plainText); + if (indexBuilder != null && entryTypeName != null && indexedEntry != null) { + indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName); + } + } + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + final String text = wikiTokenizer.wikiLinkText(); + @SuppressWarnings("unused") + final String link = wikiTokenizer.wikiLinkDest(); + dispatch(text, entryTypeName); + } + + @Override + public void onFunction( + final WikiTokenizer wikiTokenizer, + final String name, + final List args, + final Map namedArgs) { + + FunctionCallback functionCallback = functionCallbacks.get(name); + if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) { + // Default function handling: + parser.removeUselessArgs(namedArgs); + final boolean single = args.isEmpty() && namedArgs.isEmpty(); + builder.append(single ? "{" : "{{"); + + final IndexBuilder oldIndexBuilder = indexBuilder; + indexBuilder = null; + nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this); + indexBuilder = oldIndexBuilder; + + builder.append(single ? "}" : "}}"); + } + } + + @Override + public void onHtml(WikiTokenizer wikiTokenizer) { + if (wikiTokenizer.token().startsWith("")) { + // Do nothing. + return; + } + // Unindexed for now. + builder.append(wikiTokenizer.token()); + } + + @Override + public void onMarkup(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public final void onComment(WikiTokenizer wikiTokenizer) { + // Do nothing. + } + + @Override + public void onNewline(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onHeading(WikiTokenizer wikiTokenizer) { + assert false; + } + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + assert false; + } + } - } - - // -------------------------------------------------------------------- - - static final class NameAndArgs implements FunctionCallback { - @Override - public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, - final Map namedArgs, final T parser, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - - if (name != null) { - appendAndIndexWikiCallback.builder.append(name); - } - for (int i = 0; i < args.size(); ++i) { - if (args.get(i).length() > 0) { - appendAndIndexWikiCallback.builder.append("|"); - appendAndIndexWikiCallback.dispatch(args.get(i), null, null); + + // -------------------------------------------------------------------- + + static final class NameAndArgs implements FunctionCallback { + @Override + public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List args, + final Map namedArgs, final T parser, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + + if (name != null) { + appendAndIndexWikiCallback.dispatch(name, null); + } + for (String arg : args) { + if (arg.length() > 0) { + appendAndIndexWikiCallback.builder.append("|"); + appendAndIndexWikiCallback.dispatch(arg, null, null); + } + } + appendNamedArgs(namedArgs, appendAndIndexWikiCallback); + return true; } - } - appendNamedArgs(namedArgs, appendAndIndexWikiCallback); - return true; } - } - static NameAndArgs NAME_AND_ARGS = new NameAndArgs(); - - static void appendNamedArgs(final Map namedArgs, - final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { - for (final Map.Entry entry : namedArgs.entrySet()) { - appendAndIndexWikiCallback.builder.append("|"); - appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null); - appendAndIndexWikiCallback.builder.append("="); - EntryTypeName entryTypeName = null; - IndexBuilder indexBuilder = null; - // This doesn't work: we'd need to add to word-forms. + static NameAndArgs NAME_AND_ARGS = new NameAndArgs<>(); + + static void appendNamedArgs(final Map namedArgs, + final AppendAndIndexWikiCallback appendAndIndexWikiCallback) { + for (final Map.Entry entry : namedArgs.entrySet()) { + appendAndIndexWikiCallback.builder.append("|"); + appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null); + appendAndIndexWikiCallback.builder.append("="); + EntryTypeName entryTypeName = null; + IndexBuilder indexBuilder = null; + // This doesn't work: we'd need to add to word-forms. // System.out.println(entry.getKey()); // if (entry.getKey().equals("tr")) { // entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION; // indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder; // } - appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName); + appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName); + } } - } - - }