X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=57f4d587010c6d66d722f26acd8afb640daf293c;hb=750d808c256f16703f8b348df2d260c5ca0bd56d;hp=57f04e138aa2e5cb42d0d34406a32c0dfff7ad02;hpb=498f764042c3d1309930373af01d11060ea7daed;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index 57f04e1..57f4d58 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -6,10 +6,13 @@ import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.util.StringUtil; import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.commons.lang3.StringUtils; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; @@ -17,22 +20,65 @@ import java.util.regex.Pattern; public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public static final String NAME = "WholeSectionToHtmlParser"; - public static final Pattern skipSections = Pattern.compile(".*Translations.*"); + + interface LangConfig { + boolean skipSection(final String name); + boolean skipWikiLink(final WikiTokenizer wikiTokenizer); + String adjustWikiLink(String wikiLinkDest); + void addFunctionCallbacks( + Map> functionCallbacks); + } + static final Map isoToLangConfig = new LinkedHashMap(); + static { + final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*"); + isoToLangConfig.put("EN", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return enSkipSections.matcher(headingText).matches(); + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Category:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + }}); + } final IndexBuilder titleIndexBuilder; + final String skipLangIso; + final LangConfig langConfig; - public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder) { + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) { this.titleIndexBuilder = titleIndexBuilder; - + assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; + this.langConfig = isoToLangConfig.get(wiktionaryIso); + this.skipLangIso = skipLangIso; } @Override - void parseSection(String heading, String text) { + public void parseSection(String heading, String text) { HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title)); IndexedEntry indexedEntry = new IndexedEntry(htmlEntry); final AppendAndIndexWikiCallback callback = new AppendCallback( this); + langConfig.addFunctionCallbacks(callback.functionCallbacks); callback.builder = new StringBuilder(); callback.indexedEntry = indexedEntry; @@ -52,6 +98,8 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override void removeUselessArgs(Map namedArgs) { } + + static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*"); class AppendCallback extends AppendAndIndexWikiCallback { public AppendCallback(WholeSectionToHtmlParser parser) { @@ -60,17 +108,44 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onPlainText(String plainText) { - super.onPlainText(StringEscapeUtils.escapeHtml3(plainText)); + final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); + if (ALL_ASCII.matcher(htmlEscaped).matches()) { + super.onPlainText(htmlEscaped); + } else { + super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText)); + } } @Override public void onWikiLink(WikiTokenizer wikiTokenizer) { - super.onWikiLink(wikiTokenizer); + if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) { + // Skips wikilinks like: [[en::dick]] + return; + } + if (langConfig.skipWikiLink(wikiTokenizer)) { + return; + } + String linkDest; + if (wikiTokenizer.wikiLinkDest() != null) { + linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest()); + } else { + linkDest = wikiTokenizer.wikiLinkText(); + } + if (linkDest != null) { + builder.append(String.format("", linkDest)); + super.onWikiLink(wikiTokenizer); + builder.append(String.format("")); + } else { + super.onWikiLink(wikiTokenizer); + } } @Override public void onFunction(WikiTokenizer wikiTokenizer, String name, List args, Map namedArgs) { + if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { + namedArgs.remove("lang"); + } super.onFunction(wikiTokenizer, name, args, namedArgs); } @@ -87,7 +162,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public void onHeading(WikiTokenizer wikiTokenizer) { final String headingText = wikiTokenizer.headingWikiText(); final int depth = wikiTokenizer.headingDepth(); - if (skipSections.matcher(headingText).matches()) { + if (langConfig.skipSection(headingText)) { while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { wikiTokenizer.returnToLineStart();