1 // Copyright 2012 Google Inc. All Rights Reserved.
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
15 package com.hughes.android.dictionary.parser.wiktionary;
17 import java.io.BufferedInputStream;
18 import java.io.DataInputStream;
19 import java.io.EOFException;
21 import java.io.FileInputStream;
22 import java.io.IOException;
23 import java.util.LinkedHashMap;
24 import java.util.LinkedHashSet;
25 import java.util.List;
28 import java.util.SortedMap;
29 import java.util.TreeMap;
30 import java.util.concurrent.atomic.AtomicInteger;
31 import java.util.logging.Level;
32 import java.util.logging.Logger;
33 import java.util.regex.Pattern;
35 import com.hughes.android.dictionary.engine.EntrySource;
36 import com.hughes.android.dictionary.engine.EntryTypeName;
37 import com.hughes.android.dictionary.engine.IndexBuilder;
38 import com.hughes.android.dictionary.engine.IndexedEntry;
39 import com.hughes.android.dictionary.parser.Parser;
40 import com.hughes.android.dictionary.parser.WikiTokenizer;
41 import com.hughes.util.EnumUtil;
/**
 * Base class for Wiktionary parsers: {@code parse} reads section records from a file and hands
 * each one to {@code parseSection}, which subclasses implement. Also hosts shared helpers
 * (named counters, whitespace normalization) and the wiki-token callback classes below.
 */
43 public abstract class AbstractWiktionaryParser implements Parser {
// Shared logger, registered under the name "WiktionaryParser".
45 static final Logger LOG = Logger.getLogger("WiktionaryParser");
// Named counters kept sorted by name; bumped by incrementCount and logged by parse.
47 final SortedMap<String, AtomicInteger> counters = new TreeMap<String, AtomicInteger>();
// Insertion-ordered set of strings; not read or written by the code visible in this view.
// NOTE(review): presumably used by subclasses to avoid adding the same pair twice - confirm.
48 final Set<String> pairsAdded = new LinkedHashSet<String>();
// Source of the entries being parsed; assigned at the start of parse.
50 public EntrySource entrySource;
/**
 * Handles one section read by {@code parse}.
 *
 * @param heading heading string read for the section
 * @param text section body, decoded from the section's bytes
 */
54 abstract void parseSection(final String heading, final String text);
/**
 * Hook invoked by the default function handling in
 * {@code AppendAndIndexWikiCallback.onFunction} before the function's arguments are rendered.
 * NOTE(review): presumably removes uninteresting entries from the map in place - confirm
 * against the implementations.
 *
 * @param namedArgs named arguments of the wiki function being rendered
 */
56 abstract void removeUselessArgs(final Map<String, String> namedArgs);
/**
 * Reads section records from {@code file} and passes each one to {@code parseSection}.
 *
 * <p>Per record, the visible statements read a title (readUTF), a heading (readUTF) and an int
 * length, then decode a byte array of that length as UTF-8 text. An EOFException while reading
 * the title is logged at INFO level. Progress is logged whenever pageCount is a multiple of
 * 1000, and every entry of {@code counters} is logged under a "***COUNTERS***" banner.
 *
 * <p>NOTE(review): several statements of this method are not visible in this view (the
 * declaration and update of pageCount, the loop construct, the declaration of title, the
 * filling of bytes and the closing of the stream) - confirm them against the complete source.
 *
 * @param file input file to read
 * @param entrySource stored in the {@code entrySource} field before reading starts
 * @param pageLimit when non-negative, compared against pageCount to limit how much is read
 * @throws IOException declared for the stream operations
 */
59 public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
60 this.entrySource = entrySource;
62 final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
// A negative pageLimit never satisfies this check.
65 if (pageLimit >= 0 && pageCount >= pageLimit) {
// The title is read to advance the stream; no visible statement uses its value.
70 title = dis.readUTF();
71 } catch (EOFException e) {
72 LOG.log(Level.INFO, "EOF reading split.");
76 final String heading = dis.readUTF();
// Length prefix of the section body, in bytes.
77 final int bytesLength = dis.readInt();
78 final byte[] bytes = new byte[bytesLength];
80 final String text = new String(bytes, "UTF8");
82 parseSection(heading, text);
// Progress report every 1000 pages.
85 if (pageCount % 1000 == 0) {
86 LOG.info("pageCount=" + pageCount);
// Dump all named counters, in key order (counters is a sorted map).
91 LOG.info("***COUNTERS***");
92 for (final Map.Entry<String, AtomicInteger> entry : counters.entrySet()) {
93 LOG.info(entry.getKey() + ": " + entry.getValue());
98 static final Pattern whitespace = Pattern.compile("\\s+");
99 static String trim(final String s) {
100 return whitespace.matcher(s).replaceAll(" ").trim();
103 public void incrementCount(final String string) {
104 AtomicInteger counter = counters.get(string);
105 if (counter == null) {
106 counter = new AtomicInteger();
107 counters.put(string, counter);
109 counter.incrementAndGet();
/**
 * Base implementation: fails an assertion whose detail message is {@code token} (only when
 * assertions are enabled).
 * NOTE(review): presumably parsers that support links override this - confirm.
 *
 * @param token text of the link
 * @param entryTypeName entry type for the link; not used by the visible statement
 */
112 public void addLinkToCurrentEntry(final String token, final EntryTypeName entryTypeName) {
113 assert false : token;
117 // -------------------------------------------------------------------------
/**
 * WikiTokenizer callback that appends the text it receives to a StringBuilder and, while an
 * index builder, an entry type and an indexed entry are all set, also indexes plain text.
 *
 * @param <T> parser type passed through to function callbacks
 */
119 static class AppendAndIndexWikiCallback<T extends AbstractWiktionaryParser> implements WikiTokenizer.Callback {
// Output buffer every callback appends to; assigned by reset.
122 StringBuilder builder;
// Entry that indexed text is attributed to; assigned by reset. Indexing is skipped while null.
123 IndexedEntry indexedEntry;
// Index receiving plain text, or null for none; swapped for the duration of each dispatch call.
124 IndexBuilder indexBuilder;
// Handlers keyed by function name, consulted by onFunction before the default rendering.
125 final Map<String,FunctionCallback<T>> functionCallbacks = new LinkedHashMap<String, FunctionCallback<T>>();
// While true, dispatch does not merge its entryTypeName argument into the current one
// (a null argument still clears it).
127 boolean entryTypeNameSticks = false;
// Entry type used when indexing plain text; indexing is skipped while null.
128 EntryTypeName entryTypeName = null;
// Not referenced by the code visible in this view.
// NOTE(review): presumably per-language-code counters - confirm.
130 final Map<String,AtomicInteger> langCodeToTCount = new LinkedHashMap<String, AtomicInteger>();
// Default renderer onFunction falls back to when no registered callback handles a function.
132 final NameAndArgs<T> nameAndArgs = new NameAndArgs<T>();
/**
 * Creates a callback bound to {@code parser}.
 *
 * @param parser parser stored on this callback; onFunction passes it to function callbacks
 */
134 public AppendAndIndexWikiCallback(final T parser) {
135 this.parser = parser;
138 public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) {
139 this.builder = builder;
140 this.indexedEntry = indexedEntry;
141 this.indexBuilder = null;
142 entryTypeName = null;
143 entryTypeNameSticks = false;
146 public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) {
147 final IndexBuilder oldIndexBuilder = this.indexBuilder;
148 final EntryTypeName oldEntryTypeName = this.entryTypeName;
149 this.indexBuilder = indexBuilder;
150 if (!entryTypeNameSticks) {
151 this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName);
153 if (entryTypeName == null) this.entryTypeName = null;
154 WikiTokenizer.dispatch(wikiText, false, this);
155 this.indexBuilder = oldIndexBuilder;
156 this.entryTypeName = oldEntryTypeName;
159 public String dispatch(final String wikiText, final EntryTypeName entryTypeName) {
160 final int start = builder.length();
161 dispatch(wikiText, this.indexBuilder, entryTypeName);
162 return builder.substring(start);
166 public void onPlainText(final String plainText) {
167 // The only non-recursive callback. Just appends to the builder, and indexes.
168 builder.append(plainText);
169 if (indexBuilder != null && entryTypeName != null && indexedEntry != null) {
170 indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName);
175 public void onWikiLink(WikiTokenizer wikiTokenizer) {
176 final String text = wikiTokenizer.wikiLinkText();
177 @SuppressWarnings("unused")
178 final String link = wikiTokenizer.wikiLinkDest();
179 dispatch(text, entryTypeName);
/**
 * Handles a wiki function token.
 *
 * <p>Looks up a callback registered under {@code name} in functionCallbacks. If none is
 * registered, or the callback returns false, the default rendering is used: the parser's
 * removeUselessArgs is applied to the named arguments, then the name and arguments are written
 * through nameAndArgs, wrapped in "{" and "}" when no positional or named arguments remain and
 * in "{{" and "}}" otherwise.
 *
 * <p>NOTE(review): the declaration of the {@code name} parameter is not visible in this view,
 * and statements between the visible ones may be missing as well - confirm against the
 * complete source.
 *
 * @param wikiTokenizer tokenizer positioned on the function
 * @param args positional arguments of the function
 * @param namedArgs named arguments of the function; passed to removeUselessArgs
 */
183 public void onFunction(
184 final WikiTokenizer wikiTokenizer,
186 final List<String> args,
187 final Map<String, String> namedArgs) {
189 FunctionCallback<T> functionCallback = functionCallbacks.get(name);
// Fall through to the default rendering when there is no callback or it declines (returns false).
190 if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) {
191 // Default function handling:
192 parser.removeUselessArgs(namedArgs);
// "single" is evaluated after removeUselessArgs has seen the named arguments.
193 final boolean single = args.isEmpty() && namedArgs.isEmpty();
194 builder.append(single ? "{" : "{{");
// The current index builder is saved here and put back after the default rendering.
196 final IndexBuilder oldIndexBuilder = indexBuilder;
198 nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this);
199 indexBuilder = oldIndexBuilder;
201 builder.append(single ? "}" : "}}");
206 public void onHtml(WikiTokenizer wikiTokenizer) {
207 // Unindexed for now.
208 builder.append(wikiTokenizer.token());
212 public void onMarkup(WikiTokenizer wikiTokenizer) {
217 public final void onComment(WikiTokenizer wikiTokenizer) {
222 public void onNewline(WikiTokenizer wikiTokenizer) {
227 public void onHeading(WikiTokenizer wikiTokenizer) {
232 public void onListItem(WikiTokenizer wikiTokenizer) {
238 // --------------------------------------------------------------------
/**
 * FunctionCallback that writes a function's name and arguments to the callback's builder.
 *
 * @param <T> parser type accepted by the callback
 */
240 static final class NameAndArgs<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
/**
 * Dispatches the function name, then every non-empty positional argument preceded by "|",
 * then the named arguments via appendNamedArgs.
 *
 * <p>NOTE(review): some statements of this method (including its return) are not visible in
 * this view - confirm against the complete source.
 */
242 public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
243 final Map<String, String> namedArgs, final T parser,
244 final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
// A null entry type clears the callback's entry type for this dispatch, so the name is not indexed.
247 appendAndIndexWikiCallback.dispatch(name, null);
249 for (int i = 0; i < args.size(); ++i) {
// Empty positional arguments are dropped entirely, separator included.
250 if (args.get(i).length() > 0) {
251 appendAndIndexWikiCallback.builder.append("|");
// Null index builder and null entry type: the argument is appended without indexing.
252 appendAndIndexWikiCallback.dispatch(args.get(i), null, null);
255 appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
// Shared NameAndArgs instance typed on the base parser class.
// NOTE(review): not declared final and not referenced by the code visible in this view.
259 static NameAndArgs<AbstractWiktionaryParser> NAME_AND_ARGS = new NameAndArgs<AbstractWiktionaryParser>();
/**
 * Writes each named argument to the callback's builder as "|key=value", dispatching the key
 * and the value through the callback with a null index builder and a null entry type.
 *
 * <p>NOTE(review): the end of this method lies beyond the visible part of the file.
 *
 * @param namedArgs named arguments to write, in the map's iteration order
 * @param appendAndIndexWikiCallback callback whose builder receives the output
 */
261 static void appendNamedArgs(final Map<String, String> namedArgs,
262 final AppendAndIndexWikiCallback<?> appendAndIndexWikiCallback) {
263 for (final Map.Entry<String, String> entry : namedArgs.entrySet()) {
264 appendAndIndexWikiCallback.builder.append("|");
265 appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
266 appendAndIndexWikiCallback.builder.append("=");
// Both stay null: the only code that would assign them is commented out below.
267 EntryTypeName entryTypeName = null;
268 IndexBuilder indexBuilder = null;
269 // This doesn't work: we'd need to add to word-forms.
270 // System.out.println(entry.getKey());
271 // if (entry.getKey().equals("tr")) {
272 // entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION;
273 // indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder;
275 appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);