X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=b3249d12f395a81fb0393f4319b5d0a26104426f;hb=26ab537cbfd3e303f636d793fd55ea950dc8f5b2;hp=0f7ae2d1cef8d94876a6b97ad0627a7633ee4562;hpb=1b515f031d39e758e8e6339c03e124f1548579cc;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index 0f7ae2d..b3249d1 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -1,7 +1,6 @@ package com.hughes.android.dictionary.parser.wiktionary; -import com.hughes.android.dictionary.HtmlDisplayActivity; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; @@ -9,6 +8,7 @@ import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; import com.hughes.util.StringUtil; +import com.sun.xml.internal.rngom.util.Uri; import org.apache.commons.lang3.StringEscapeUtils; @@ -32,7 +32,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } static final Map isoToLangConfig = new LinkedHashMap(); static { - final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*"); + final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*"); isoToLangConfig.put("EN", new LangConfig() { @Override public boolean skipSection(String headingText) { @@ -48,7 +48,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { return EntryTypeName.ANTONYM_MULTI; } if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) { - // We need to put it in the other index, too. + // We need to put it in the other index, too (probably) return null; } if (sectionName.equalsIgnoreCase("Derived Terms")) { @@ -87,6 +87,104 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } }); + final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*"); + isoToLangConfig.put("DE", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return deSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonyme")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Gegenwörter")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("???Category:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + DeFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*"); + isoToLangConfig.put("IT", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return itSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Sinonimi")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("???Category:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + ItFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + + final LangConfig basicLangConfig = new LangConfig() { @Override public boolean skipSection(String headingText) { @@ -115,22 +213,23 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } }; isoToLangConfig.put("FR", basicLangConfig); - isoToLangConfig.put("DE", basicLangConfig); - isoToLangConfig.put("IT", basicLangConfig); } final IndexBuilder titleIndexBuilder; final IndexBuilder defIndexBuilder; final String skipLangIso; final LangConfig langConfig; + final String webUrlTemplate; - public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso) { + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso, + final String webUrlTemplate) { this.titleIndexBuilder = titleIndexBuilder; this.defIndexBuilder = defIndexBuilder; assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; this.langConfig = isoToLangConfig.get(wiktionaryIso); this.skipLangIso = skipLangIso; + this.webUrlTemplate = webUrlTemplate; } IndexedEntry indexedEntry = null; @@ -149,10 +248,15 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { callback.indexedEntry = indexedEntry; callback.dispatch(text, null); + if (webUrlTemplate != null) { + final String webUrl = String.format(webUrlTemplate, title); + callback.builder.append(String.format("

%s", Uri.escapeDisallowedChars(webUrl), escapeHtmlLiteral(webUrl))); + } htmlEntry.html = callback.builder.toString(); indexedEntry.isValid = true; final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title); + tokenData.hasMainEntry = true; htmlEntry.addToDictionary(titleIndexBuilder.index.dict); tokenData.htmlEntries.add(htmlEntry); @@ -167,8 +271,20 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } @Override - public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) { - titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); + public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) { + if (lang == null || lang.equals(skipLangIso)) { + titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); + } + } + + public static String escapeHtmlLiteral(final String plainText) { + final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); + if (StringUtil.isAscii(htmlEscaped)) { + return htmlEscaped; + } else { + return StringUtil.escapeToPureHtmlUnicode(plainText); + } + } @@ -180,12 +296,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onPlainText(String plainText) { - final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); - if (StringUtil.isAscii(htmlEscaped)) { - super.onPlainText(htmlEscaped); - } else { - super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText)); - } + super.onPlainText(escapeHtmlLiteral(plainText)); } @Override @@ -207,7 +318,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { // TODO: inside a definition, this could be the wrong language. titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName); } - if (linkDest != null) { + if (!StringUtil.isNullOrEmpty(linkDest)) { builder.append(String.format("", HtmlEntry.formatQuickdicUrl("", linkDest))); super.onWikiLink(wikiTokenizer); builder.append(String.format(""));