X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=0f7ae2d1cef8d94876a6b97ad0627a7633ee4562;hb=1b515f031d39e758e8e6339c03e124f1548579cc;hp=ceeb4c2395532f5e6630968976eec01fc088683f;hpb=22f584bdc1bd3cf68d3c375888a13676aa3ced2f;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index ceeb4c2..0f7ae2d 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -1,11 +1,14 @@ package com.hughes.android.dictionary.parser.wiktionary; +import com.hughes.android.dictionary.HtmlDisplayActivity; +import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.util.StringUtil; import org.apache.commons.lang3.StringEscapeUtils; @@ -16,20 +19,83 @@ import java.util.Map; import java.util.regex.Pattern; public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { - + + public static final String NAME = "WholeSectionToHtmlParser"; + interface LangConfig { boolean skipSection(final String name); + EntryTypeName sectionNameToEntryType(String sectionName); boolean skipWikiLink(final WikiTokenizer wikiTokenizer); + String adjustWikiLink(String wikiLinkDest, final String wikiLinkText); + void addFunctionCallbacks( + Map> functionCallbacks); } static final Map isoToLangConfig = new LinkedHashMap(); static { - final Pattern enSkipSections = Pattern.compile(".*Translations.*"); + final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*"); isoToLangConfig.put("EN", new LangConfig() { @Override public boolean skipSection(String headingText) { return enSkipSections.matcher(headingText).matches(); } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonyms")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonyms")) { + return EntryTypeName.ANTONYM_MULTI; + } + if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) { + // We need to put it in the other index, too. + return null; + } + if (sectionName.equalsIgnoreCase("Derived Terms")) { + return null; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Category:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final LangConfig basicLangConfig = new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return false; + } + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + return EntryTypeName.WIKTIONARY_MENTIONED; + } @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); @@ -37,27 +103,47 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { return true; } return false; - }}); - } + } + @Override + public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) { + return wikiLinkDest; + } - public static final String NAME = "WholeSectionToHtmlParser"; + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + } + }; + isoToLangConfig.put("FR", basicLangConfig); + isoToLangConfig.put("DE", basicLangConfig); + isoToLangConfig.put("IT", basicLangConfig); + } final IndexBuilder titleIndexBuilder; + final IndexBuilder defIndexBuilder; + final String skipLangIso; final LangConfig langConfig; + - public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso) { + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso) { this.titleIndexBuilder = titleIndexBuilder; + this.defIndexBuilder = defIndexBuilder; assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; this.langConfig = isoToLangConfig.get(wiktionaryIso); + this.skipLangIso = skipLangIso; } + + IndexedEntry indexedEntry = null; @Override - void parseSection(String heading, String text) { - HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title)); - IndexedEntry indexedEntry = new IndexedEntry(htmlEntry); + public void parseSection(String heading, String text) { + assert entrySource != null; + final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title); + indexedEntry = new IndexedEntry(htmlEntry); final AppendAndIndexWikiCallback callback = new AppendCallback( this); + langConfig.addFunctionCallbacks(callback.functionCallbacks); callback.builder = new StringBuilder(); callback.indexedEntry = indexedEntry; @@ -72,11 +158,20 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { tokenData.htmlEntries.add(htmlEntry); // titleIndexBuilder.addEntryWithString(indexedEntry, title, // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); + + indexedEntry = null; } @Override void removeUselessArgs(Map namedArgs) { } + + @Override + public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) { + titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); + } + + class AppendCallback extends AppendAndIndexWikiCallback { public AppendCallback(WholeSectionToHtmlParser parser) { @@ -85,7 +180,12 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onPlainText(String plainText) { - super.onPlainText(StringEscapeUtils.escapeHtml3(plainText)); + final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); + if (StringUtil.isAscii(htmlEscaped)) { + super.onPlainText(htmlEscaped); + } else { + super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText)); + } } @Override @@ -97,12 +197,31 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { if (langConfig.skipWikiLink(wikiTokenizer)) { return; } - super.onWikiLink(wikiTokenizer); + String linkDest; + if (wikiTokenizer.wikiLinkDest() != null) { + linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText()); + } else { + linkDest = wikiTokenizer.wikiLinkText(); + } + if (sectionEntryTypeName != null) { + // TODO: inside a definition, this could be the wrong language. + titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName); + } + if (linkDest != null) { + builder.append(String.format("", HtmlEntry.formatQuickdicUrl("", linkDest))); + super.onWikiLink(wikiTokenizer); + builder.append(String.format("")); + } else { + super.onWikiLink(wikiTokenizer); + } } @Override public void onFunction(WikiTokenizer wikiTokenizer, String name, List args, Map namedArgs) { + if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { + namedArgs.remove("lang"); + } super.onFunction(wikiTokenizer, name, args, namedArgs); } @@ -114,10 +233,14 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onNewline(WikiTokenizer wikiTokenizer) { } + + EntryTypeName sectionEntryTypeName; + IndexBuilder currentIndexBuilder; @Override public void onHeading(WikiTokenizer wikiTokenizer) { final String headingText = wikiTokenizer.headingWikiText(); + sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText); final int depth = wikiTokenizer.headingDepth(); if (langConfig.skipSection(headingText)) { while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {