src/com/hughes/android/dictionary/parser/wiktionary/AbstractWiktionaryParser.java

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //     http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 package com.hughes.android.dictionary.parser.wiktionary;
  16
  17 import java.io.BufferedInputStream;
  18 import java.io.DataInputStream;
  19 import java.io.EOFException;
  20 import java.io.File;
  21 import java.io.FileInputStream;
  22 import java.io.IOException;
  23 import java.io.InputStream;
  24 import java.nio.charset.StandardCharsets;
  25 import java.util.LinkedHashMap;
  26 import java.util.LinkedHashSet;
  27 import java.util.List;
  28 import java.util.Map;
  29 import java.util.Set;
  30 import java.util.SortedMap;
  31 import java.util.TreeMap;
  32 import java.util.concurrent.atomic.AtomicInteger;
  33 import java.util.logging.Level;
  34 import java.util.logging.Logger;
  35 import java.util.regex.Matcher;
  36 import java.util.regex.Pattern;
  37
  38 import org.apache.commons.compress.compressors.CompressorException;
  39 import org.apache.commons.compress.compressors.CompressorStreamFactory;
  40
  41 import com.hughes.android.dictionary.engine.EntrySource;
  42 import com.hughes.android.dictionary.engine.EntryTypeName;
  43 import com.hughes.android.dictionary.engine.IndexBuilder;
  44 import com.hughes.android.dictionary.engine.IndexedEntry;
  45 import com.hughes.android.dictionary.engine.ReadAheadBuffer;
  46 import com.hughes.android.dictionary.parser.Parser;
  47 import com.hughes.android.dictionary.parser.WikiTokenizer;
  48 import com.hughes.util.EnumUtil;
  49
  50 public abstract class AbstractWiktionaryParser implements Parser {
  51
  52     static final Logger LOG = Logger.getLogger("WiktionaryParser");  // logger registered under the name "WiktionaryParser"
  53
         // Matches "<sup>", zero or more ASCII digits, then "</sup>"; consumed by replaceSuperscript.
  54     private static final Pattern SUPERSCRIPT = Pattern.compile("<sup>[0-9]*</sup>");
  55
         // Named tallies kept sorted by key; bumped by incrementCount and logged when parse(...) finishes.
  56     final SortedMap<String, AtomicInteger> counters = new TreeMap<>();
         // NOTE(review): never referenced inside this class -- presumably filled in by subclasses; confirm.
  57     final Set<String> pairsAdded = new LinkedHashSet<>();
  58
         // Entry source handed to the most recent parse(...) call.
  59     public EntrySource entrySource;
         // Title most recently read from the input by parse(...).
  60     public String title;
  61
  62
  63     abstract void parseSection(final String heading, final String text);  // called by parse(...) once per record; text has already been through replaceSuperscript
  64
         // Called by AppendAndIndexWikiCallback.onFunction before an unhandled template is rendered.
         // NOTE(review): presumably removes uninteresting entries from namedArgs in place -- confirm in subclasses.
  65     abstract void removeUselessArgs(final Map<String, String> namedArgs);
  66
  67     private static String replaceSuperscript(String in) {
  68         Matcher matcher;
  69         while ((matcher = SUPERSCRIPT.matcher(in)).find()) {
  70             String replace = "";
  71             String orig = matcher.group();
  72             for (int i = 5; i < orig.length() - 6; i++)
  73             {
  74                 char c = 0;
  75                 switch (orig.charAt(i)) {
  76                 case '0': c = '\u2070'; break;
  77                 case '1': c = '\u00b9'; break;
  78                 case '2': c = '\u00b2'; break;
  79                 case '3': c = '\u00b3'; break;
  80                 case '4': c = '\u2074'; break;
  81                 case '5': c = '\u2075'; break;
  82                 case '6': c = '\u2076'; break;
  83                 case '7': c = '\u2077'; break;
  84                 case '8': c = '\u2078'; break;
  85                 case '9': c = '\u2079'; break;
  86                 }
  87                 if (c == 0) throw new RuntimeException();
  88                 replace += c;
  89             }
  90             in = matcher.replaceFirst(replace);
  91         }
  92         return in;
  93     }
  94
  95     @Override
  96     public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
  97         this.entrySource = entrySource;
  98         int pageCount = 0;
  99         File input = new File(file.getPath() + ".bz2");
 100         if (!input.exists()) input = new File(file.getPath() + ".gz");
 101         if (!input.exists()) input = new File(file.getPath() + ".xz");
 102         DataInputStream dis;
 103         if (!input.exists()) {
 104             // Fallback to uncompressed file
 105             dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
 106         } else {
 107             InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
 108             try {
 109                 InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
 110                 in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
 111                 dis = new DataInputStream(in);
 112             } catch (CompressorException e) {
 113                 throw new IOException(e);
 114             }
 115         }
 116         try {
 117             while (true) {
 118                 if (pageLimit >= 0 && pageCount >= pageLimit) {
 119                     return;
 120                 }
 121
 122                 try {
 123                     title = dis.readUTF();
 124                 } catch (EOFException e) {
 125                     LOG.log(Level.INFO, "EOF reading split.");
 126                     dis.close();
 127                     return;
 128                 }
 129                 final String heading = dis.readUTF();
 130                 final int bytesLength = dis.readInt();
 131                 final byte[] bytes = new byte[bytesLength];
 132                 dis.readFully(bytes);
 133                 final String text = new String(bytes, StandardCharsets.UTF_8);
 134
 135                 parseSection(heading, replaceSuperscript(text));
 136
 137                 ++pageCount;
 138                 if (pageCount % 1000 == 0) {
 139                     LOG.info("pageCount=" + pageCount);
 140                 }
 141             }
 142         } finally {
 143             dis.close();
 144             LOG.info("***COUNTERS***");
 145             for (final Map.Entry<String, AtomicInteger> entry : counters.entrySet()) {
 146                 LOG.info(entry.getKey() + ": " + entry.getValue());
 147             }
 148         }
 149     }
 150
 151     static final Pattern whitespace = Pattern.compile("\\s+");
 152     static String trim(final String s) {
 153         return whitespace.matcher(s).replaceAll(" ").trim();
 154     }
 155
 156     public void incrementCount(final String string) {
 157         AtomicInteger counter = counters.get(string);
 158         if (counter == null) {
 159             counter = new AtomicInteger();
 160             counters.put(string, counter);
 161         }
 162         counter.incrementAndGet();
 163     }
 164
 165     public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) {
 166         assert false : token + ", title=" + title;
 167     }
 168
 169
 170     // -------------------------------------------------------------------------
 171
 172     static class AppendAndIndexWikiCallback<T extends AbstractWiktionaryParser> implements WikiTokenizer.Callback {
 173
 174         final T parser;
 175         StringBuilder builder;
 176         IndexedEntry indexedEntry;
 177         IndexBuilder indexBuilder;
 178         final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<>();
 179
 180         boolean entryTypeNameSticks = false;
 181         EntryTypeName entryTypeName = null;
 182
 183         final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<>();
 184
 185         final NameAndArgs<T> nameAndArgs = new NameAndArgs<>();
 186
 187         public AppendAndIndexWikiCallback(final T parser) {
 188             this.parser = parser;
 189         }
 190
 191         public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) {
 192             this.builder = builder;
 193             this.indexedEntry = indexedEntry;
 194             this.indexBuilder = null;
 195             entryTypeName = null;
 196             entryTypeNameSticks = false;
 197         }
 198
 199         public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) {
 200             final IndexBuilder oldIndexBuilder = this.indexBuilder;
 201             final EntryTypeName oldEntryTypeName = this.entryTypeName;
 202             this.indexBuilder = indexBuilder;
 203             if (!entryTypeNameSticks) {
 204                 this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName);
 205             }
 206             if (entryTypeName == null) this.entryTypeName = null;
 207             WikiTokenizer.dispatch(wikiText, false, this);
 208             this.indexBuilder = oldIndexBuilder;
 209             this.entryTypeName = oldEntryTypeName;
 210         }
 211
 212         public String dispatch(final String wikiText, final EntryTypeName entryTypeName) {
 213             final int start = builder.length();
 214             dispatch(wikiText, this.indexBuilder, entryTypeName);
 215             return builder.substring(start);
 216         }
 217
 218         @Override
 219         public void onPlainText(final String plainText) {
 220             // The only non-recursive callback.  Just appends to the builder, and indexes.
 221             builder.append(plainText);
 222             if (indexBuilder != null && entryTypeName != null && indexedEntry != null) {
 223                 indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName);
 224             }
 225         }
 226
 227         @Override
 228         public void onWikiLink(WikiTokenizer wikiTokenizer) {
 229             final String text = wikiTokenizer.wikiLinkText();
 230             @SuppressWarnings("unused")
 231             final String link = wikiTokenizer.wikiLinkDest();
 232             dispatch(text, entryTypeName);
 233         }
 234
 235         @Override
 236         public void onFunction(
 237             final WikiTokenizer wikiTokenizer,
 238             final String name,
 239             final List<String> args,
 240             final Map<String, String> namedArgs) {
 241
 242             FunctionCallback<T> functionCallback = functionCallbacks.get(name);
 243             if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) {
 244                 // Default function handling:
 245                 parser.removeUselessArgs(namedArgs);
 246                 final boolean single = args.isEmpty() && namedArgs.isEmpty();
 247                 builder.append(single ? "{" : "{{");
 248
 249                 final IndexBuilder oldIndexBuilder = indexBuilder;
 250                 indexBuilder = null;
 251                 nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this);
 252                 indexBuilder = oldIndexBuilder;
 253
 254                 builder.append(single ? "}" : "}}");
 255             }
 256         }
 257
 258         @Override
 259         public void onHtml(WikiTokenizer wikiTokenizer) {
 260             if (wikiTokenizer.token().startsWith("<ref>")) {
 261                 // Do nothing.
 262                 return;
 263             }
 264             // Unindexed for now.
 265             builder.append(wikiTokenizer.token());
 266         }
 267
 268         @Override
 269         public void onMarkup(WikiTokenizer wikiTokenizer) {
 270             // Do nothing.
 271         }
 272
 273         @Override
 274         public final void onComment(WikiTokenizer wikiTokenizer) {
 275             // Do nothing.
 276         }
 277
 278         @Override
 279         public void onNewline(WikiTokenizer wikiTokenizer) {
 280             assert false;
 281         }
 282
 283         @Override
 284         public void onHeading(WikiTokenizer wikiTokenizer) {
 285             assert false;
 286         }
 287
 288         @Override
 289         public void onListItem(WikiTokenizer wikiTokenizer) {
 290             assert false;
 291         }
 292
 293     }
 294
 295     // --------------------------------------------------------------------
 296
 297     static final class NameAndArgs<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
 298         @Override
 299         public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
 300                                       final Map<String, String> namedArgs, final T parser,
 301                                       final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
 302
 303             if (name != null) {
 304                 appendAndIndexWikiCallback.dispatch(name, null);
 305             }
 306             for (String arg : args) {
 307                 if (arg.length() > 0) {
 308                     appendAndIndexWikiCallback.builder.append("|");
 309                     appendAndIndexWikiCallback.dispatch(arg, null, null);
 310                 }
 311             }
 312             appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
 313             return true;
 314         }
 315     }
 316     static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<>();
 317
 318     static void appendNamedArgs(final Map<String, String> namedArgs,
 319                                 final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {
 320         for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
 321             appendAndIndexWikiCallback.builder.append("|");
 322             appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
 323             appendAndIndexWikiCallback.builder.append("=");
 324             EntryTypeName entryTypeName = null;
 325             IndexBuilder indexBuilder = null;
 326             // This doesn't work: we'd need to add to word-forms.
 327 //      System.out.println(entry.getKey());
 328 //      if (entry.getKey().equals("tr")) {
 329 //        entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION;
 330 //        indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder;
 331 //      }
 332             appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);
 333         }
 334     }
 335
 336 }