X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=2b719db747b3b5f64edf4c8f938d333c9da04073;hb=cd527412992b58b660a003862da6009f6122c162;hp=c5dca8098fb782519905fc8a8af79ae13e9a3f13;hpb=9feb475642b11d6d6289868c17c7649f28b06368;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index c5dca80..2b719db 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -1,6 +1,15 @@ package com.hughes.android.dictionary.parser.wiktionary; +import java.net.URI; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.commons.text.StringEscapeUtils; + import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; @@ -9,15 +18,6 @@ import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; import com.hughes.util.StringUtil; -import org.apache.commons.lang3.StringEscapeUtils; - -import java.net.URI; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; - public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public static final String NAME = "WholeSectionToHtmlParser"; @@ -28,9 +28,9 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { boolean skipWikiLink(final WikiTokenizer wikiTokenizer); String adjustWikiLink(String wikiLinkDest, final String wikiLinkText); void addFunctionCallbacks( - Map> functionCallbacks); + Map> functionCallbacks); } - static final Map isoToLangConfig = new LinkedHashMap(); + static final Map isoToLangConfig = new LinkedHashMap<>(); static { final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*"); isoToLangConfig.put("EN", new LangConfig() { @@ -38,7 +38,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public boolean skipSection(String headingText) { return enSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Synonyms")) { @@ -47,23 +47,108 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { if (sectionName.equalsIgnoreCase("Antonyms")) { return EntryTypeName.ANTONYM_MULTI; } - if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) { - // We need to put it in the other index, too (probably) + // We need to put it in the other index, too (probably) ? + // EnParser.partOfSpeechHeader.matcher(sectionName).matches() + + // Needs special handling? + // sectionName.equalsIgnoreCase("Derived Terms") + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Category:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { return null; } - if (sectionName.equalsIgnoreCase("Derived Terms")) { - return null; + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*"); + isoToLangConfig.put("ES", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return esSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) { + return EntryTypeName.ANTONYM_MULTI; } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); - if (wikiText.startsWith("Category:")) { - return true; + return wikiText.startsWith("Categoría:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + // TODO: need Spanish variant + } + }); + + final Pattern ptSkipSections = Pattern.compile(".*Tradução.*"); + isoToLangConfig.put("PT", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return esSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) { + return EntryTypeName.ANTONYM_MULTI; } - return false; + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Categoria:"); } @Override public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { @@ -82,18 +167,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { - EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + Map> functionCallbacks) { + // TODO: need Portuguese variant } }); - + final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*"); isoToLangConfig.put("DE", new LangConfig() { @Override public boolean skipSection(String headingText) { return deSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Synonyme")) { @@ -104,14 +189,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); - if (wikiText.startsWith("???Category:")) { - return true; - } - return false; + return wikiText.startsWith("Kategorie:"); } @Override public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { @@ -130,18 +212,18 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { DeFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); - + final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*"); isoToLangConfig.put("IT", new LangConfig() { @Override public boolean skipSection(String headingText) { return itSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Sinonimi")) { @@ -152,14 +234,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { final String wikiText = wikiTokenizer.wikiLinkText(); - if (wikiText.startsWith("???Category:")) { - return true; - } - return false; + return wikiText.startsWith("Categoria:"); } @Override public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { @@ -178,30 +257,34 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { ItFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); - final Pattern frSkipSections = Pattern.compile(".*(Traductions).*"); + final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*"); isoToLangConfig.put("FR", new LangConfig() { @Override public boolean skipSection(String headingText) { return frSkipSections.matcher(headingText).matches(); } - + @Override public EntryTypeName sectionNameToEntryType(String sectionName) { if (sectionName.equalsIgnoreCase("Synonymes")) { return EntryTypeName.SYNONYM_MULTI; } + if (sectionName.equalsIgnoreCase("Antonymes")) { + return EntryTypeName.ANTONYM_MULTI; + } return null; } - + @Override public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { - return false; + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Catégorie:"); } @Override public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { @@ -220,7 +303,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void addFunctionCallbacks( - Map> functionCallbacks) { + Map> functionCallbacks) { FrFunctionCallbacks.addGenericCallbacks(functionCallbacks); } }); @@ -231,10 +314,10 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { final String skipLangIso; final LangConfig langConfig; final String webUrlTemplate; - + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso, - final String webUrlTemplate) { + final String webUrlTemplate) { this.titleIndexBuilder = titleIndexBuilder; this.defIndexBuilder = defIndexBuilder; assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; @@ -242,7 +325,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { this.skipLangIso = skipLangIso; this.webUrlTemplate = webUrlTemplate; } - + IndexedEntry indexedEntry = null; @Override @@ -252,7 +335,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { indexedEntry = new IndexedEntry(htmlEntry); final AppendAndIndexWikiCallback callback = new AppendCallback( - this); + this); langConfig.addFunctionCallbacks(callback.functionCallbacks); callback.builder = new StringBuilder(); @@ -261,7 +344,19 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { if (webUrlTemplate != null) { final String webUrl = String.format(webUrlTemplate, title); - callback.builder.append(String.format("

%s", URI.create(webUrl).toString(), escapeHtmlLiteral(webUrl))); + String asciiWebUrl = null; + // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases. + try { + asciiWebUrl = URI.create(webUrl).toASCIIString(); + } catch (Exception e) { + } + if (asciiWebUrl != null) { + callback.builder.append("

"); + callback.builder.append(escapeHtmlLiteral(webUrl)); + callback.builder.append(""); + } } htmlEntry.html = callback.builder.toString(); indexedEntry.isValid = true; @@ -273,26 +368,26 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { tokenData.htmlEntries.add(htmlEntry); // titleIndexBuilder.addEntryWithString(indexedEntry, title, // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); - + indexedEntry = null; } @Override void removeUselessArgs(Map namedArgs) { } - + @Override public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) { if (lang == null || lang.equals(skipLangIso)) { titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); } } - + public static String escapeHtmlLiteral(final String plainText) { final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); if (StringUtil.isAscii(htmlEscaped)) { return htmlEscaped; - } else { + } else { return StringUtil.escapeUnicodeToPureHtml(plainText); } @@ -330,9 +425,11 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName); } if (!StringUtil.isNullOrEmpty(linkDest)) { - builder.append(String.format("", HtmlEntry.formatQuickdicUrl("", linkDest))); + builder.append(""); super.onWikiLink(wikiTokenizer); - builder.append(String.format("")); + builder.append(""); } else { super.onWikiLink(wikiTokenizer); } @@ -340,7 +437,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onFunction(WikiTokenizer wikiTokenizer, String name, - List args, Map namedArgs) { + List args, Map namedArgs) { if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { namedArgs.remove("lang"); } @@ -355,7 +452,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onNewline(WikiTokenizer wikiTokenizer) { } - + EntryTypeName sectionEntryTypeName; IndexBuilder currentIndexBuilder; @@ -377,12 +474,16 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } return; } - builder.append(String.format("\n", depth)); + builder.append("\n'); dispatch(headingText, null); - builder.append(String.format("\n", depth)); + builder.append("\n"); } - final List listPrefixStack = new ArrayList(); + final List listPrefixStack = new ArrayList<>(); @Override public void onListItem(WikiTokenizer wikiTokenizer) { @@ -391,8 +492,9 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } final String prefix = wikiTokenizer.listItemPrefix(); while (listPrefixStack.size() < prefix.length()) { - builder.append(String.format("<%s>", - WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); + builder.append('<'); + builder.append(WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))); + builder.append('>'); listPrefixStack.add(prefix.charAt(listPrefixStack.size())); } builder.append("

  • "); @@ -416,7 +518,9 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { } while (listPrefixStack.size() > nextListHeader.length()) { final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1); - builder.append(String.format("\n", WikiTokenizer.getListTag(prefixChar))); + builder.append("\n"); } }