// Copyright 2012 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.hughes.android.dictionary.parser.wiktionary;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.compressors.CompressorException;
import com.hughes.android.dictionary.engine.EntrySource;
import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.engine.ReadAheadBuffer;
import com.hughes.android.dictionary.parser.Parser;
import com.hughes.android.dictionary.parser.WikiTokenizer;
import com.hughes.util.EnumUtil;
public abstract class AbstractWiktionaryParser implements Parser {
/** Logger shared by the Wiktionary parsers. */
static final Logger LOG = Logger.getLogger("WiktionaryParser");

/**
 * Matches an HTML superscript element whose content consists only of ASCII digits,
 * e.g. {@code <sup>12</sup>}.
 * <p>
 * {@link #replaceSuperscript} skips the first 5 and the last 6 characters of every match, which are
 * exactly the lengths of {@code <sup>} and {@code </sup>}; the tags therefore have to be part of the
 * pattern. A pattern without them can match the empty string and makes that method loop forever.
 */
private static final Pattern SUPERSCRIPT = Pattern.compile("<sup>[0-9]*</sup>");

/** Named counters, sorted by name; bumped by {@link #incrementCount} and logged at the end of parse. */
final SortedMap<String, AtomicInteger> counters = new TreeMap<String, AtomicInteger>();

/**
 * Insertion-ordered set that is not used inside this class.
 * NOTE(review): the String element type is inferred from the field name -- confirm against subclasses.
 */
final Set<String> pairsAdded = new LinkedHashSet<String>();

/** Source passed to the most recent call of parse. */
public EntrySource entrySource;

/** Title read from the input for the record currently being parsed. */
public String title;

/**
 * Handles one record of the input; parse calls this once per record.
 *
 * @param heading the heading string stored with the record
 * @param text    the record text, after superscript replacement
 */
abstract void parseSection(final String heading, final String text);

/**
 * Called with the named arguments of a wiki function before the default rendering appends them.
 * Presumably implementations remove entries that should not be rendered -- confirm in subclasses.
 *
 * @param namedArgs mutable map from argument name to argument value
 */
abstract void removeUselessArgs(final Map<String, String> namedArgs);
/**
 * Rewrites every match of {@code SUPERSCRIPT} in the input as Unicode superscript digits.
 * <p>
 * The first 5 and the last 6 characters of each match are dropped (the lengths of {@code <sup>} and
 * {@code </sup>}); every character in between must be an ASCII digit and is mapped to its superscript
 * code point. The text is re-scanned from the start after each substitution.
 * NOTE(review): this only terminates if a substitution removes the match; a pattern that can match the
 * empty string would keep this loop running forever.
 *
 * @param in text to process
 * @return the text with all matches replaced
 * @throws RuntimeException if a character between the skipped prefix and suffix is not an ASCII digit
 */
private static String replaceSuperscript(String in) {
    final String asciiDigits = "0123456789";
    final String superDigits = "\u2070\u00b9\u00b2\u00b3\u2074\u2075\u2076\u2077\u2078\u2079";
    for (Matcher m = SUPERSCRIPT.matcher(in); m.find(); m = SUPERSCRIPT.matcher(in)) {
        final String match = m.group();
        final int end = match.length() - 6;
        final StringBuilder converted = new StringBuilder();
        for (int pos = 5; pos < end; pos++) {
            final int digit = asciiDigits.indexOf(match.charAt(pos));
            if (digit < 0) {
                throw new RuntimeException();
            }
            converted.append(superDigits.charAt(digit));
        }
        in = m.replaceFirst(converted.toString());
    }
    return in;
}
/**
 * Reads a pre-split dump and passes every record to {@link #parseSection}.
 * <p>
 * The input is looked up as {@code file + ".bz2"}, then {@code ".gz"}, then {@code ".xz"}; if none of
 * them exists the plain {@code file} is read uncompressed. Each record consists of a title and a
 * heading (both in {@code DataInputStream.readUTF} format), an int byte count and that many bytes of
 * UTF-8 text. Reading stops at end of input or once {@code pageLimit} records were handled. The
 * stream is always closed and the counters are logged afterwards.
 *
 * @param file        base path of the split file (without compression suffix)
 * @param entrySource stored in {@link #entrySource} for the duration of the parse
 * @param pageLimit   maximum number of records to process; a negative value means no limit
 * @throws IOException if the input cannot be opened, decompressed or read, or if a record declares a
 *                     negative length
 */
@Override
public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
    this.entrySource = entrySource;
    final DataInputStream dis = openSplit(file);
    try {
        int pageCount = 0;
        while (pageLimit < 0 || pageCount < pageLimit) {
            try {
                title = dis.readUTF();
            } catch (EOFException e) {
                // Running out of input at a record boundary is the regular way to finish.
                LOG.log(Level.INFO, "EOF reading split.");
                return;
            }
            final String heading = dis.readUTF();
            final int bytesLength = dis.readInt();
            if (bytesLength < 0) {
                // Corrupt input: report it as an I/O problem instead of a NegativeArraySizeException.
                throw new IOException("Negative section length " + bytesLength + ", title=" + title);
            }
            final byte[] bytes = new byte[bytesLength];
            dis.readFully(bytes);
            final String text = new String(bytes, "UTF8");
            parseSection(heading, replaceSuperscript(text));
            ++pageCount;
            if (pageCount % 1000 == 0) {
                LOG.info("pageCount=" + pageCount);
            }
        }
    } finally {
        // Single close point for every exit path (limit reached, EOF, exception).
        dis.close();
        LOG.info("***COUNTERS***");
        for (final Map.Entry<String, AtomicInteger> entry : counters.entrySet()) {
            LOG.info(entry.getKey() + ": " + entry.getValue());
        }
    }
}

/** Opens the split file, preferring a .bz2/.gz/.xz sibling over the uncompressed file. */
private static DataInputStream openSplit(final File file) throws IOException {
    File input = new File(file.getPath() + ".bz2");
    if (!input.exists()) input = new File(file.getPath() + ".gz");
    if (!input.exists()) input = new File(file.getPath() + ".xz");
    if (!input.exists()) {
        // Fallback to uncompressed file
        return new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
    }
    final InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
    boolean wrapped = false;
    try {
        InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
        in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
        final DataInputStream result = new DataInputStream(in);
        wrapped = true;
        return result;
    } catch (CompressorException e) {
        throw new IOException(e);
    } finally {
        if (!wrapped) {
            // Nobody else owns the file handle yet, so release it here.
            compressedIn.close();
        }
    }
}
/** Matches a run of one or more whitespace characters. */
static final Pattern whitespace = Pattern.compile("\\s+");

/**
 * Normalizes whitespace: every run of whitespace becomes a single space, and leading and trailing
 * whitespace is removed.
 *
 * @param s text to normalize
 * @return the normalized text
 */
static String trim(final String s) {
    final Matcher runs = whitespace.matcher(s);
    final String collapsed = runs.replaceAll(" ");
    return collapsed.trim();
}
/**
 * Adds one to the counter registered under the given name, creating the counter on first use.
 *
 * @param string name of the counter
 */
public void incrementCount(final String string) {
    final AtomicInteger existing = counters.get(string);
    if (existing != null) {
        existing.incrementAndGet();
        return;
    }
    // First occurrence of this name: register a fresh counter and count this call.
    final AtomicInteger created = new AtomicInteger();
    counters.put(string, created);
    created.incrementAndGet();
}
/**
 * Hook for attaching a link to the entry that is currently being parsed.
 * <p>
 * This base implementation only fails an assertion whose message contains the token and the current
 * title; with assertions disabled it does nothing. Presumably parsers that support links override
 * it -- confirm against the subclasses.
 *
 * @param token         link text; reported in the assertion message
 * @param lang          not used by this implementation
 * @param entryTypeName not used by this implementation
 */
public void addLinkToCurrentEntry(final String token, final String lang, final EntryTypeName entryTypeName) {
// Always trips when assertions are enabled (-ea); the message is only built in that case.
assert false : token + ", title=" + title;
}
// -------------------------------------------------------------------------
static class AppendAndIndexWikiCallback implements WikiTokenizer.Callback {
final T parser;
StringBuilder builder;
IndexedEntry indexedEntry;
IndexBuilder indexBuilder;
final Map> functionCallbacks = new LinkedHashMap>();
boolean entryTypeNameSticks = false;
EntryTypeName entryTypeName = null;
final Map langCodeToTCount = new LinkedHashMap();
final NameAndArgs nameAndArgs = new NameAndArgs();
public AppendAndIndexWikiCallback(final T parser) {
this.parser = parser;
}
public void reset(final StringBuilder builder, final IndexedEntry indexedEntry) {
this.builder = builder;
this.indexedEntry = indexedEntry;
this.indexBuilder = null;
entryTypeName = null;
entryTypeNameSticks = false;
}
public void dispatch(final String wikiText, final IndexBuilder indexBuilder, final EntryTypeName entryTypeName) {
final IndexBuilder oldIndexBuilder = this.indexBuilder;
final EntryTypeName oldEntryTypeName = this.entryTypeName;
this.indexBuilder = indexBuilder;
if (!entryTypeNameSticks) {
this.entryTypeName = EnumUtil.min(entryTypeName, this.entryTypeName);
}
if (entryTypeName == null) this.entryTypeName = null;
WikiTokenizer.dispatch(wikiText, false, this);
this.indexBuilder = oldIndexBuilder;
this.entryTypeName = oldEntryTypeName;
}
public String dispatch(final String wikiText, final EntryTypeName entryTypeName) {
final int start = builder.length();
dispatch(wikiText, this.indexBuilder, entryTypeName);
return builder.substring(start);
}
@Override
public void onPlainText(final String plainText) {
// The only non-recursive callback. Just appends to the builder, and indexes.
builder.append(plainText);
if (indexBuilder != null && entryTypeName != null && indexedEntry != null) {
indexBuilder.addEntryWithString(indexedEntry, plainText, entryTypeName);
}
}
@Override
public void onWikiLink(WikiTokenizer wikiTokenizer) {
final String text = wikiTokenizer.wikiLinkText();
@SuppressWarnings("unused")
final String link = wikiTokenizer.wikiLinkDest();
dispatch(text, entryTypeName);
}
@Override
public void onFunction(
final WikiTokenizer wikiTokenizer,
final String name,
final List args,
final Map namedArgs) {
FunctionCallback functionCallback = functionCallbacks.get(name);
if (functionCallback == null || !functionCallback.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this)) {
// Default function handling:
parser.removeUselessArgs(namedArgs);
final boolean single = args.isEmpty() && namedArgs.isEmpty();
builder.append(single ? "{" : "{{");
final IndexBuilder oldIndexBuilder = indexBuilder;
indexBuilder = null;
nameAndArgs.onWikiFunction(wikiTokenizer, name, args, namedArgs, parser, this);
indexBuilder = oldIndexBuilder;
builder.append(single ? "}" : "}}");
}
}
@Override
public void onHtml(WikiTokenizer wikiTokenizer) {
if (wikiTokenizer.token().startsWith("[")) {
// Do nothing.
return;
}
// Unindexed for now.
builder.append(wikiTokenizer.token());
}
@Override
public void onMarkup(WikiTokenizer wikiTokenizer) {
// Do nothing.
}
@Override
public final void onComment(WikiTokenizer wikiTokenizer) {
// Do nothing.
}
@Override
public void onNewline(WikiTokenizer wikiTokenizer) {
assert false;
}
@Override
public void onHeading(WikiTokenizer wikiTokenizer) {
assert false;
}
@Override
public void onListItem(WikiTokenizer wikiTokenizer) {
assert false;
}
}
// --------------------------------------------------------------------
/**
 * Function callback that renders a wiki function as plain text: the name, then every non-empty
 * positional argument prefixed by {@code |}, then the named arguments as {@code |key=value}.
 * Nothing it appends is indexed.
 *
 * @param <T> concrete parser type; not used by this callback itself
 */
static final class NameAndArgs<T extends AbstractWiktionaryParser> implements FunctionCallback<T> {
    /**
     * Appends the textual form of the function call to the callback's builder.
     *
     * @return always true
     */
    @Override
    public boolean onWikiFunction(final WikiTokenizer wikiTokenizer, final String name, final List<String> args,
            final Map<String, String> namedArgs, final T parser,
            final AppendAndIndexWikiCallback<T> appendAndIndexWikiCallback) {
        if (name != null) {
            appendAndIndexWikiCallback.dispatch(name, null);
        }
        for (final String arg : args) {
            // Empty positional arguments are dropped together with their separator.
            if (arg.length() > 0) {
                appendAndIndexWikiCallback.builder.append("|");
                appendAndIndexWikiCallback.dispatch(arg, null, null);
            }
        }
        appendNamedArgs(namedArgs, appendAndIndexWikiCallback);
        return true;
    }
}
static NameAndArgs NAME_AND_ARGS = new NameAndArgs();
static void appendNamedArgs(final Map namedArgs,
final AppendAndIndexWikiCallback> appendAndIndexWikiCallback) {
for (final Map.Entry entry : namedArgs.entrySet()) {
appendAndIndexWikiCallback.builder.append("|");
appendAndIndexWikiCallback.dispatch(entry.getKey(), null, null);
appendAndIndexWikiCallback.builder.append("=");
EntryTypeName entryTypeName = null;
IndexBuilder indexBuilder = null;
// This doesn't work: we'd need to add to word-forms.
// System.out.println(entry.getKey());
// if (entry.getKey().equals("tr")) {
// entryTypeName = EntryTypeName.WIKTIONARY_TRANSLITERATION;
// indexBuilder = appendAndIndexWikiCallback.parser.foreignIndexBuilder;
// }
appendAndIndexWikiCallback.dispatch(entry.getValue(), indexBuilder, entryTypeName);
}
}
}
]