X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=63e507ee9a97d9433efbc863f094480c5175dc96;hb=2182783b7ac6a22c23b37db4ba458ff12a6978dc;hp=53104fc166e862c39cb89b9ec4a814d0fdc50253;hpb=4df160f54810f0c7d279552f8b0531fc99a83a79;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index 53104fc..63e507e 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -1,15 +1,19 @@ package com.hughes.android.dictionary.parser.wiktionary; +import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.util.StringUtil; -import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.commons.text.StringEscapeUtils; +import java.net.URI; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; @@ -17,42 +21,392 @@ import java.util.regex.Pattern; public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public static final String NAME = "WholeSectionToHtmlParser"; - public static final Pattern skipSections = Pattern.compile(".*Translations.*"); + + interface LangConfig { + boolean skipSection(final String name); + EntryTypeName sectionNameToEntryType(String sectionName); + boolean skipWikiLink(final WikiTokenizer wikiTokenizer); + String adjustWikiLink(String wikiLinkDest, final String wikiLinkText); + 
void addFunctionCallbacks( + Map> functionCallbacks); + } + static final Map isoToLangConfig = new LinkedHashMap(); + static { + final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*"); + isoToLangConfig.put("EN", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return enSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonyms")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonyms")) { + return EntryTypeName.ANTONYM_MULTI; + } + if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) { + // We need to put it in the other index, too (probably) + return null; + } + if (sectionName.equalsIgnoreCase("Derived Terms")) { + return null; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Category:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*"); + isoToLangConfig.put("ES", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return esSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + 
if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null;
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                if (wikiText.startsWith("Categoría:")) {
+                    return true;
+                }
+                return false;
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
+                    return null;
+                }
+                final int hashPos = wikiLinkDest.indexOf("#");
+                if (hashPos != -1) {
+                    wikiLinkDest = wikiLinkDest.substring(0, hashPos);
+                    if (wikiLinkDest.isEmpty()) {
+                        wikiLinkDest = wikiLinkText;
+                    }
+                }
+                return wikiLinkDest;
+            }
+
+            @Override
+            public void addFunctionCallbacks(
+                Map> functionCallbacks) {
+                // TODO: need Spanish variant
+            }
+        });
+
+        // Portuguese configuration: headings containing "Tradução" are skipped entirely.
+        final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
+        isoToLangConfig.put("PT", new LangConfig() {
+            @Override
+            public boolean skipSection(String headingText) {
+                // Must match against the Portuguese pattern: the Spanish one
+                // (Traducciones|Locuciones) never matches "Tradução" headings.
+                return ptSkipSections.matcher(headingText).matches();
+            }
+
+            @Override
+            public EntryTypeName sectionNameToEntryType(String sectionName) {
+                if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
+                    return EntryTypeName.SYNONYM_MULTI;
+                }
+                if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
+                    return EntryTypeName.ANTONYM_MULTI;
+                }
+                return null;
+            }
+
+            @Override
+            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
+                final String wikiText = wikiTokenizer.wikiLinkText();
+                if (wikiText.startsWith("Categoria:")) {
+                    return true;
+                }
+                return false;
+            }
+            @Override
+            public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
+                if (wikiLinkDest.startsWith("w:") ||
wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + // TODO: need Portuguese variant + } + }); + + final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*"); + isoToLangConfig.put("DE", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return deSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonyme")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Gegenwörter")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Kategorie:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + DeFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*"); + isoToLangConfig.put("IT", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return itSkipSections.matcher(headingText).matches(); + } + + @Override + 
public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Sinonimi")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Categoria:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + ItFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + + final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*"); + isoToLangConfig.put("FR", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return frSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonymes")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonymes")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Catégorie:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final 
int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + FrFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + } final IndexBuilder titleIndexBuilder; + final IndexBuilder defIndexBuilder; + final String skipLangIso; + final LangConfig langConfig; + final String webUrlTemplate; - public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder) { - this.titleIndexBuilder = titleIndexBuilder; + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso, + final String webUrlTemplate) { + this.titleIndexBuilder = titleIndexBuilder; + this.defIndexBuilder = defIndexBuilder; + assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; + this.langConfig = isoToLangConfig.get(wiktionaryIso); + this.skipLangIso = skipLangIso; + this.webUrlTemplate = webUrlTemplate; } + IndexedEntry indexedEntry = null; + @Override - void parseSection(String heading, String text) { - HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title)); - IndexedEntry indexedEntry = new IndexedEntry(htmlEntry); + public void parseSection(String heading, String text) { + assert entrySource != null; + final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title); + indexedEntry = new IndexedEntry(htmlEntry); final AppendAndIndexWikiCallback callback = new AppendCallback( - this); + this); + langConfig.addFunctionCallbacks(callback.functionCallbacks); callback.builder = new StringBuilder(); callback.indexedEntry = indexedEntry; callback.dispatch(text, null); + if (webUrlTemplate != null) { + final String webUrl = String.format(webUrlTemplate, title); + // URI.create can raise an exception e.g. 
if webUrl contains %, just ignore those cases. + try { + callback.builder.append(String.format("

%s", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl))); + } catch (Exception e) { + } + } htmlEntry.html = callback.builder.toString(); indexedEntry.isValid = true; final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title); + tokenData.hasMainEntry = true; htmlEntry.addToDictionary(titleIndexBuilder.index.dict); tokenData.htmlEntries.add(htmlEntry); // titleIndexBuilder.addEntryWithString(indexedEntry, title, // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); + + indexedEntry = null; } @Override void removeUselessArgs(Map namedArgs) { } + @Override + public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) { + if (lang == null || lang.equals(skipLangIso)) { + titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); + } + } + + public static String escapeHtmlLiteral(final String plainText) { + final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); + if (StringUtil.isAscii(htmlEscaped)) { + return htmlEscaped; + } else { + return StringUtil.escapeUnicodeToPureHtml(plainText); + } + + } + + + class AppendCallback extends AppendAndIndexWikiCallback { public AppendCallback(WholeSectionToHtmlParser parser) { super(parser); @@ -60,7 +414,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { @Override public void onPlainText(String plainText) { - super.onPlainText(StringEscapeUtils.escapeHtml3(plainText)); + super.onPlainText(escapeHtmlLiteral(plainText)); } @Override @@ -69,12 +423,34 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { // Skips wikilinks like: [[en::dick]] return; } - super.onWikiLink(wikiTokenizer); + if (langConfig.skipWikiLink(wikiTokenizer)) { + return; + } + String linkDest; + if (wikiTokenizer.wikiLinkDest() != null) { + linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText()); + } else { + linkDest = wikiTokenizer.wikiLinkText(); + } + if (sectionEntryTypeName != 
null) { + // TODO: inside a definition, this could be the wrong language. + titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName); + } + if (!StringUtil.isNullOrEmpty(linkDest)) { + builder.append(String.format("", HtmlEntry.formatQuickdicUrl("", linkDest))); + super.onWikiLink(wikiTokenizer); + builder.append(String.format("")); + } else { + super.onWikiLink(wikiTokenizer); + } } @Override public void onFunction(WikiTokenizer wikiTokenizer, String name, - List args, Map namedArgs) { + List args, Map namedArgs) { + if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { + namedArgs.remove("lang"); + } super.onFunction(wikiTokenizer, name, args, namedArgs); } @@ -87,15 +463,23 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { public void onNewline(WikiTokenizer wikiTokenizer) { } + EntryTypeName sectionEntryTypeName; + IndexBuilder currentIndexBuilder; + @Override public void onHeading(WikiTokenizer wikiTokenizer) { final String headingText = wikiTokenizer.headingWikiText(); + sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText); final int depth = wikiTokenizer.headingDepth(); - if (skipSections.matcher(headingText).matches()) { + if (langConfig.skipSection(headingText)) { + //System.out.println("Skipping section:" + headingText); while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { + // System.out.println("Resume on: " + wikiTokenizer.token()); wikiTokenizer.returnToLineStart(); return; + } else { + // System.out.println("Skipped: " + wikiTokenizer.token()); } } return; @@ -115,7 +499,7 @@ public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { final String prefix = wikiTokenizer.listItemPrefix(); while (listPrefixStack.size() < prefix.length()) { builder.append(String.format("<%s>", - WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); + 
WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); listPrefixStack.add(prefix.charAt(listPrefixStack.size())); } builder.append("

  • ");