1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.wiktionary;
17 import java.io.BufferedInputStream;
18 import java.io.DataInputStream;
19 import java.io.EOFException;
21 import java.io.FileInputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.nio.charset.StandardCharsets;
25 import java.util.LinkedHashMap;
26 import java.util.LinkedHashSet;
27 import java.util.List;
30 import java.util.SortedMap;
31 import java.util.TreeMap;
32 import java.util.concurrent.atomic.AtomicInteger;
33 import java.util.logging.Level;
34 import java.util.logging.Logger;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
38 import org.apache.commons.compress.compressors.CompressorException;
39 import org.apache.commons.compress.compressors.CompressorStreamFactory;
41 import com.hughes.android.dictionary.engine.EntrySource;
42 import com.hughes.android.dictionary.engine.EntryTypeName;
43 import com.hughes.android.dictionary.engine.IndexBuilder;
44 import com.hughes.android.dictionary.engine.IndexedEntry;
45 import com.hughes.android.dictionary.engine.ReadAheadBuffer;
46 import com.hughes.android.dictionary.parser.Parser;
47 import com.hughes.android.dictionary.parser.WikiTokenizer;
48 import com.hughes.util.EnumUtil;
/**
 * Base class for Wiktionary parsers. {@code parse} reads (title, heading, text) records
 * from a file and hands each section to {@code parseSection}, which subclasses implement.
 */
50 public abstract class AbstractWiktionaryParser implements Parser {
// Logger shared by this class; registered under the name "WiktionaryParser".
52 static final Logger LOG = Logger.getLogger("WiktionaryParser");
// Matches a <sup>...</sup> element whose content is zero or more ASCII digits.
54 private static final Pattern SUPERSCRIPT = Pattern.compile("<sup>[0-9]*</sup>");
// Named counters, sorted by name; bumped by incrementCount() and logged at the end of parse().
56 final SortedMap<String, AtomicInteger> counters = new TreeMap<>();
// Insertion-ordered set of strings. Not read or written by the code visible here;
// presumably maintained by subclasses — TODO confirm.
57 final Set<String> pairsAdded = new LinkedHashSet<>();
// Set at the start of every parse() call to the source passed in.
59 public EntrySource entrySource;
// Handles one section. parse() passes the heading as read and the section text after
// <sup>digit</sup> elements were replaced by Unicode superscripts.
63 abstract void parseSection(final String heading, final String text);
// Invoked by the default function handling in AppendAndIndexWikiCallback.onFunction
// on the named-argument map before the function is appended to the output.
65 abstract void removeUselessArgs(final Map<String, String> namedArgs);
/**
 * Replaces each {@code <sup>digits</sup>} element in the text with the corresponding
 * Unicode superscript digits (for example the digit 2 maps to U+00B2).
 *
 * NOTE(review): the declarations of {@code matcher}, {@code replace} and {@code c}, and
 * the return statement, are not visible in this view. Presumably {@code replace}
 * accumulates the mapped characters and the rewritten text is returned — confirm.
 *
 * @param in text that may contain {@code <sup>} elements
 */
67 private static String replaceSuperscript(String in) {
// A fresh matcher is created on every pass because `in` is reassigned after each replacement.
69 while ((matcher = SUPERSCRIPT.matcher(in)).find()) {
71 String orig = matcher.group();
// Walk only the digits: skip the 5-character "<sup>" prefix and the 6-character "</sup>" suffix.
72 for (int i = 5; i < orig.length() - 6; i++)
75 switch (orig.charAt(i)) {
// 1, 2 and 3 live in the Latin-1 Supplement block; the other digits are in U+2070..U+2079.
76 case '0': c = '\u2070'; break;
77 case '1': c = '\u00b9'; break;
78 case '2': c = '\u00b2'; break;
79 case '3': c = '\u00b3'; break;
80 case '4': c = '\u2074'; break;
81 case '5': c = '\u2075'; break;
82 case '6': c = '\u2076'; break;
83 case '7': c = '\u2077'; break;
84 case '8': c = '\u2078'; break;
85 case '9': c = '\u2079'; break;
// Guard against an unmapped character. Assumes c starts at 0 for each character — TODO confirm.
// The pattern only admits [0-9], so this should not trigger for matched text.
87 if (c == 0) throw new RuntimeException();
// Only the first match is replaced; the loop condition then re-scans the updated text.
90 in = matcher.replaceFirst(replace);
/**
 * Reads section records from a file and passes each one to {@code parseSection}.
 *
 * <p>The input is looked up next to {@code file} with a {@code .bz2}, {@code .gz} or
 * {@code .xz} suffix, in that order; if none of those exists, {@code file} itself is read
 * uncompressed. Each record is read as: title (readUTF), heading (readUTF), a byte count
 * (readInt), then that many bytes decoded as UTF-8.
 *
 * NOTE(review): the loop header, the declarations of {@code dis}, {@code pageCount} and
 * {@code title}, and any close of the stream are not visible in this view. If the stream
 * is never closed, it leaks — confirm.
 *
 * @param file base path of the input; compressed variants are tried first
 * @param entrySource stored in the {@code entrySource} field before reading
 * @param pageLimit when non-negative, compared with the page count to limit how much is read
 * @throws IOException on read failure; a {@code CompressorException} is rethrown wrapped in one
 */
96 public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
97 this.entrySource = entrySource;
// Probe for a compressed sibling: .bz2 first, then .gz, then .xz.
99 File input = new File(file.getPath() + ".bz2");
100 if (!input.exists()) input = new File(file.getPath() + ".gz");
101 if (!input.exists()) input = new File(file.getPath() + ".xz");
103 if (!input.exists()) {
104 // Fallback to uncompressed file
105 dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
107 InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
// No format name is passed, so the factory picks the decompressor from the stream contents.
109 InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
// The second argument is 20 * 1024 * 1024 (20 MiB); presumably the read-ahead buffer size — TODO confirm.
110 in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
111 dis = new DataInputStream(in);
112 } catch (CompressorException e) {
// Keep the method's checked-exception contract; the original exception is preserved as the cause.
113 throw new IOException(e);
// A negative pageLimit disables the limit check.
// The body of this branch is not visible here; presumably it stops reading — TODO confirm.
118 if (pageLimit >= 0 && pageCount >= pageLimit) {
123 title = dis.readUTF();
124 } catch (EOFException e) {
// EOF while reading a title is logged at INFO rather than as an error.
// The statement that follows is not visible here; presumably it ends the read loop — TODO confirm.
125 LOG.log(Level.INFO, "EOF reading split.");
129 final String heading = dis.readUTF();
// The section text is length-prefixed raw bytes rather than a readUTF string.
130 final int bytesLength = dis.readInt();
131 final byte[] bytes = new byte[bytesLength];
132 dis.readFully(bytes);
133 final String text = new String(bytes, StandardCharsets.UTF_8);
// <sup>digit</sup> elements are converted to Unicode superscripts before the subclass sees the text.
135 parseSection(heading, replaceSuperscript(text));
// Progress log every 1000 pages.
138 if (pageCount % 1000 == 0) {
139 LOG.info("pageCount=" + pageCount);
// Dump every named counter, in key order (counters is a sorted map).
144 LOG.info("***COUNTERS***");
145 for (final Map.Entry<String, AtomicInteger> entry : counters.entrySet()) {
146 LOG.info(entry.getKey() + ": " + entry.getValue());
// One or more consecutive whitespace characters.
151 static final Pattern whitespace = Pattern.compile("\\s+");
/**
 * Collapses every run of whitespace in {@code s} to a single space, then trims the ends.
 *
 * @param s text to normalize; must not be null
 * @return the normalized text
 */
152 static String trim(final String s) {
153 return whitespace.matcher(s).replaceAll(" ").trim();
/**
 * Adds one to the counter registered under the given name in {@code counters}.
 *
 * @param string name of the counter to bump
 */
156 public void incrementCount(final String string) {
157 AtomicInteger counter = counters.get(string);
// First use of this name: register a new counter (AtomicInteger starts at 0).
158 if (counter == null) {
159 counter = new AtomicInteger();
160 counters.put(string, counter);
162 counter.incrementAndGet();
/**
 * The visible body only fails an assertion. With assertions enabled, calling this raises an
 * AssertionError whose message is the token and the current title.
 *
 * NOTE(review): the rest of the body, if any, is not visible in this view, and the
 * {@code lang} and {@code entryTypeName} parameters are not used by the visible statement.
 */
165 public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) {
166 assert false : token + ", title=" + title;
170 // -------------------------------------------------------------------------
/**
 * WikiTokenizer callback that appends text to a StringBuilder and, when an index builder,
 * an entry type and an indexed entry are all set, also indexes the plain text it appends.
 */
172 static class AppendAndIndexWikiCallback<T extends AbstractWiktionaryParser> implements WikiTokenizer.Callback {
// Output buffer; replaced by reset().
175 StringBuilder builder;
// Entry that indexed plain text is attached to; replaced by reset().
176 IndexedEntry indexedEntry;
// Index currently receiving plain text; null means nothing is indexed.
177 IndexBuilder indexBuilder;
// Handlers looked up by function name in onFunction().
178 final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<>();
// When true, dispatch() does not combine the passed entry type with the current one.
180 boolean entryTypeNameSticks = false;
// Entry type used when indexing plain text; null means nothing is indexed.
181 EntryTypeName entryTypeName = null;
// Per-key counters; not read or written by the code visible here.
// Presumably keyed by language code, going by the name — TODO confirm.
183 final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<>();
// Default function handler, used by onFunction() when no registered callback handles the call.
185 final NameAndArgs<T> nameAndArgs = new NameAndArgs<>();
/**
 * Creates a callback bound to the given parser.
 *
 * @param parser stored in the {@code parser} field, which onFunction() passes to function callbacks
 */
187 public AppendAndIndexWikiCallback(final T parser) {
188 this.parser = parser;
/**
 * Points the callback at a new output buffer and entry, and clears the indexing state.
 *
 * @param builder buffer that subsequent callbacks append to
 * @param indexedEntry entry that subsequently indexed text is attached to
 */
191 public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) {
192 this.builder = builder;
193 this.indexedEntry = indexedEntry;
// Indexing stays off until a dispatch() call supplies both an index builder and an entry type.
194 this.indexBuilder = null;
195 entryTypeName = null;
196 entryTypeNameSticks = false;
/**
 * Tokenizes {@code wikiText} with this object as the callback, using the given index builder
 * and entry type for the duration of the call. The previous values are put back afterwards.
 *
 * NOTE(review): no try/finally is visible, so the previous values would not be restored if
 * the tokenizer throws — confirm whether that matters to callers.
 *
 * @param wikiText wiki markup to process
 * @param indexBuilder index to use while processing; null disables indexing
 * @param entryTypeName entry type to use while processing; null disables indexing
 */
199 public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) {
// Remember the current state so nested dispatches can be unwound.
200 final IndexBuilder oldIndexBuilder = this.indexBuilder;
201 final EntryTypeName oldEntryTypeName = this.entryTypeName;
202 this.indexBuilder = indexBuilder;
// Unless the current type is sticky, combine it with the requested one via EnumUtil.min.
// How EnumUtil.min orders values and treats null is not visible here.
203 if (!entryTypeNameSticks) {
204 this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName);
// An explicit null always wins, even when sticky: onPlainText() then skips indexing.
206 if (entryTypeName == null) this.entryTypeName = null;
207 WikiTokenizer.dispatch(wikiText, false, this);
208 this.indexBuilder = oldIndexBuilder;
209 this.entryTypeName = oldEntryTypeName;
/**
 * Processes {@code wikiText} with the current index builder and returns what was appended.
 *
 * @param wikiText wiki markup to process
 * @param entryTypeName entry type to use while processing; null disables indexing
 * @return the text appended to {@code builder} by this call (it also stays in the builder)
 */
212 public String dispatch(final String wikiText, final EntryTypeName entryTypeName) {
// Mark where this call's output begins.
213 final int start = builder.length();
214 dispatch(wikiText, this.indexBuilder, entryTypeName);
215 return builder.substring(start);
/**
 * Appends plain text to the output and indexes it when indexing is currently enabled.
 *
 * @param plainText text to append
 */
219 public void onPlainText(final String plainText) {
220 // The only non-recursive callback. Just appends to the builder, and indexes.
221 builder.append(plainText);
// Indexing needs all three: an index to write to, an entry type, and an entry to attach.
222 if (indexBuilder != null && entryTypeName != null && indexedEntry != null) {
223 indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName);
/**
 * Handles a wiki link by processing its display text with the current entry type.
 * The link destination is read but not used.
 */
228 public void onWikiLink(WikiTokenizer wikiTokenizer) {
229 final String text = wikiTokenizer.wikiLinkText();
230 @SuppressWarnings("unused")
231 final String link = wikiTokenizer.wikiLinkDest();
// The link text may itself contain markup, so it goes back through dispatch.
232 dispatch(text, entryTypeName);
/**
 * Handles a wiki function (template) call. A callback registered under the function name in
 * {@code functionCallbacks} gets the first chance. If none is registered, or it returns
 * false, the call is written back out in brace syntax by {@code nameAndArgs}.
 *
 * NOTE(review): some lines of this method (including the {@code name} parameter
 * declaration) are not visible in this view.
 *
 * @param wikiTokenizer tokenizer positioned at the function
 * @param args positional arguments
 * @param namedArgs named arguments; passed to {@code removeUselessArgs} in the default path
 */
236 public void onFunction(
237 final WikiTokenizer wikiTokenizer,
239 final List<String> args,
240 final Map<String, String> namedArgs) {
242 FunctionCallback<T> functionCallback = functionCallbacks.get(name);
243 if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) {
244 // Default function handling:
245 parser.removeUselessArgs(namedArgs);
// A call with no arguments left is written with single braces, otherwise with double braces.
246 final boolean single = args.isEmpty() && namedArgs.isEmpty();
247 builder.append(single ? "{" : "{{");
// indexBuilder is saved here and restored after the default handler runs. The line between
// is not visible here; presumably it changes indexBuilder — TODO confirm.
249 final IndexBuilder oldIndexBuilder = indexBuilder;
251 nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this);
252 indexBuilder = oldIndexBuilder;
254 builder.append(single ? "}" : "}}");
/**
 * Handles an HTML token. The visible code appends the raw token to the output without
 * indexing it.
 *
 * NOTE(review): the body of the {@code <ref>} branch is not visible in this view;
 * presumably such tokens are dropped — TODO confirm.
 */
259 public void onHtml(WikiTokenizer wikiTokenizer) {
// Only the exact prefix "<ref>" is tested, so a <ref ...> tag with attributes does not match.
260 if (wikiTokenizer.token().startsWith("<ref>")) {
264 // Unindexed for now.
265 builder.append(wikiTokenizer.token());
// The bodies of the five callbacks below are not visible in this view, so only their
// signatures are described.
// Receives the tokenizer for a markup token.
269 public void onMarkup(WikiTokenizer wikiTokenizer) {
// Receives the tokenizer for a comment token; declared final, so subclasses cannot override it.
274 public final void onComment(WikiTokenizer wikiTokenizer) {
// Receives the tokenizer for a newline token.
279 public void onNewline(WikiTokenizer wikiTokenizer) {
// Receives the tokenizer for a heading token.
284 public void onHeading(WikiTokenizer wikiTokenizer) {
// Receives the tokenizer for a list-item token.
289 public void onListItem(WikiTokenizer wikiTokenizer) {
295 // --------------------------------------------------------------------
/**
 * Function callback that writes a function call back out as its name followed by its
 * arguments, each preceded by {@code |}. The surrounding braces are added by the caller.
 */
297 static final class NameAndArgs<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
/**
 * Appends the function name, the non-empty positional arguments and the named arguments
 * to the callback's builder.
 *
 * NOTE(review): some lines (including the return statement) are not visible in this view.
 */
299 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
300 final Map<String, String> namedArgs, final T parser,
301 final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
// A null entry type means the name is appended but not indexed.
304 appendAndIndexWikiCallback.dispatch(name, null);
306 for (String arg : args) {
// Empty positional arguments are skipped entirely, separator included.
307 if (arg.length() > 0) {
308 appendAndIndexWikiCallback.builder.append("|");
// Null index builder and entry type: the argument is appended but not indexed.
309 appendAndIndexWikiCallback.dispatch(arg, null, null);
312 appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
// Shared NameAndArgs instance typed for the base parser class. The field is not final.
// It is not referenced by the code visible here.
316 static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<>();
/**
 * Appends each named argument to the callback's builder as {@code |key=value}, in the map's
 * iteration order. Keys and values go through dispatch, so markup inside them is processed.
 *
 * NOTE(review): the end of this method is not visible in this view; the description covers
 * the visible statements only.
 *
 * @param namedArgs named arguments to append
 * @param appendAndIndexWikiCallback callback whose builder receives the output
 */
318 static void appendNamedArgs(final Map<String, String> namedArgs,
319 final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {
320 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
321 appendAndIndexWikiCallback.builder.append("|");
// Null index builder and entry type: the key is appended but not indexed.
322 appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
323 appendAndIndexWikiCallback.builder.append("=");
// Both stay null in the active code, so the value is not indexed either. The
// commented-out lines below are a disabled attempt to index "tr" values.
324 EntryTypeName entryTypeName = null;
325 IndexBuilder indexBuilder = null;
326 // This doesn't work: we'd need to add to word-forms.
327 // System.out.println(entry.getKey());
328 // if (entry.getKey().equals("tr")) {
329 // entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION;
330 // indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder;
332 appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);