import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
+import com.hughes.android.dictionary.engine.EntrySource;
import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexedEntry;
public class EnWiktionaryXmlParser {
- private static final String TRANSLITERATION_FORMAT = " (tr. %s)";
-
static final Logger LOG = Logger.getLogger(EnWiktionaryXmlParser.class.getName());
// TODO: process {{ttbc}} lines
"Particle|Interjection|Pronominal adverb" +
"Han character|Hanzi|Hanja|Kanji|Katakana character|Syllable");
+ EntrySource entrySource;
final IndexBuilder enIndexBuilder;
final IndexBuilder foreignIndexBuilder;
final Pattern langPattern;
}
- public void parse(final File file, final int pageLimit) throws IOException {
+ public void parse(final File file, final EntrySource entrySource, final int pageLimit) throws IOException {
// Reads page titles from the given file with DataInputStream.readUTF until the input is
// exhausted or the page limit is reached; a negative pageLimit disables the limit.
// The entrySource argument is stored on the instance; doTranslationLine and
// doForeignListSection pass it to every PairEntry they create.
+ this.entrySource = entrySource;
int pageCount = 0;
final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(file)));
+ try {
while (true) {
if (pageLimit >= 0 && pageCount >= pageLimit) {
// NOTE(review): dis.close() is only visible on the EOF path below, and the finally block
// shown here does not close the stream either; confirm it is closed on this return.
return;
try {
title = dis.readUTF();
} catch (EOFException e) {
// Running out of input ends parsing; it is logged at INFO level.
- LOG.warning("Error reading split!");
+ LOG.log(Level.INFO, "EOF reading split.");
dis.close();
return;
}
LOG.info("pageCount=" + pageCount);
}
}
+ } finally {
// Runs on every exit from the try block, including both returns above.
// Prints the per-language-code counts, removes every code that is a key of
// EnWiktionaryLangs.isoCodeToWikiName, then prints what is left.
// NOTE(review): assuming langCodeToTCount is a java.util.Map, keySet() is a live view, so
// removeAll deletes those entries from the map itself; confirm nothing reads the full
// counts after parse() returns.
+ System.out.println("lang Counts: " + appendAndIndexWikiCallback.langCodeToTCount);
+ appendAndIndexWikiCallback.langCodeToTCount.keySet().removeAll(EnWiktionaryLangs.isoCodeToWikiName.keySet());
+ System.out.println("unused Counts: " + appendAndIndexWikiCallback.langCodeToTCount);
+ }
}
private void parseSection(String heading, final String text) {
}
} else if (wikiTokenizer.isFunction()) {
final String name = wikiTokenizer.functionName();
- if (name.equals("head")) {
+ if (name.equals("head") && pos == null) {
LOG.warning("{{head}} without POS: " + title);
}
}
private void doTranslationLine(final String line, final String lang, final String pos, final String sense, final String rest) {
state = State.TRANSLATION_LINE;
// Good chance we'll actually file this one...
- final PairEntry pairEntry = new PairEntry();
+ final PairEntry pairEntry = new PairEntry(entrySource);
final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
final StringBuilder foreignText = new StringBuilder();
if (!pairsAdded.add(pair.toString())) {
LOG.warning("Duplicate pair: " + pair.toString());
}
- if (pair.toString().equals("libero {m} :: free (adjective)")) {
- System.out.println();
- }
-
}
final Collection<String> wordForms = new ArrayList<String>();
boolean titleAppended = false;
-
private void doForeignPartOfSpeech(final String lang, String posHeading, final int posDepth, WikiTokenizer wikiTokenizer) {
if (++foreignCount % 1000 == 0) {
LOG.info("***" + lang + ", " + title + ", pos=" + posHeading + ", foreignCount=" + foreignCount);
foreign = String.format("(%s) %s", lang, foreign);
}
for (final ListSection listSection : listSections) {
- doForeignListItem(foreign, title, wordForms, listSection);
+ doForeignListSection(foreign, title, wordForms, listSection);
}
}
}
"sc",
"sort",
"cat",
- "xs"));
+ "xs",
+ "nodot"));
+
// Per-list-section flag: doForeignListSection resets it to false before dispatching the
// section's first line, then reads it to choose between
// EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE and EntryTypeName.WIKTIONARY_TITLE_MULTI
// when indexing the entry under its title.
// NOTE(review): no assignment to true is visible in this view; presumably it is set while
// appendAndIndexWikiCallback.dispatch runs. Confirm against the callback code.
+ public boolean entryIsFormOfSomething = false;
- private void doForeignListItem(final String foreignText, String title, final Collection<String> forms, final ListSection listSection) {
+ private void doForeignListSection(final String foreignText, String title, final Collection<String> forms, final ListSection listSection) {
state = State.ENGLISH_DEF_OF_FOREIGN;
final String prefix = listSection.firstPrefix;
if (prefix.length() > 1) {
return;
}
- final PairEntry pairEntry = new PairEntry();
+ final PairEntry pairEntry = new PairEntry(entrySource);
final IndexedEntry indexedEntry = new IndexedEntry(pairEntry);
-
+
+ entryIsFormOfSomething = false;
final StringBuilder englishBuilder = new StringBuilder();
final String mainLine = listSection.firstLine;
-
appendAndIndexWikiCallback.reset(englishBuilder, indexedEntry);
appendAndIndexWikiCallback.dispatch(mainLine, enIndexBuilder, EntryTypeName.WIKTIONARY_ENGLISH_DEF);
if (english.length() > 0) {
final Pair pair = new Pair(english, trim(foreignText), this.swap);
pairEntry.pairs.add(pair);
- foreignIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI);
+ foreignIndexBuilder.addEntryWithString(indexedEntry, title, entryIsFormOfSomething ? EntryTypeName.WIKTIONARY_IS_FORM_OF_SOMETHING_ELSE : EntryTypeName.WIKTIONARY_TITLE_MULTI);
for (final String form : forms) {
foreignIndexBuilder.addEntryWithString(indexedEntry, form, EntryTypeName.WIKTIONARY_INFLECTED_FORM_MULTI);
}