X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=2b719db747b3b5f64edf4c8f938d333c9da04073;hb=cd527412992b58b660a003862da6009f6122c162;hp=f38b5503086a5c27b988dcdaa93cab06525ba6c3;hpb=b371830273946c376e3e1ef4650a4b7215471a89;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index f38b550..2b719db 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -1,155 +1,553 @@ + package com.hughes.android.dictionary.parser.wiktionary; +import java.net.URI; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; -import org.apache.commons.lang3.StringEscapeUtils; +import org.apache.commons.text.StringEscapeUtils; import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; +import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.util.StringUtil; public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { - - public static final String NAME = "WholeSectionToHtmlParser"; - public static final Pattern skipSections = Pattern.compile(".*Translations.*"); - - final IndexBuilder titleIndexBuilder; - - public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder) { - this.titleIndexBuilder = titleIndexBuilder; - } - - @Override - void parseSection(String heading, String text) { - HtmlEntry htmlEntry = new HtmlEntry(entrySource, title); - IndexedEntry 
indexedEntry = new IndexedEntry(htmlEntry); - - final AppendAndIndexWikiCallback callback = new AppendCallback(this); - callback.builder = new StringBuilder(); - callback.indexedEntry = indexedEntry; - callback.dispatch(text, null); - - htmlEntry.html = callback.builder.toString(); - indexedEntry.isValid = true; - titleIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); - } - - @Override - void removeUselessArgs(Map namedArgs) { - } - - class AppendCallback extends AppendAndIndexWikiCallback { - public AppendCallback(WholeSectionToHtmlParser parser) { - super(parser); - } - @Override - public void onPlainText(String plainText) { - super.onPlainText(StringEscapeUtils.escapeHtml3(plainText)); + public static final String NAME = "WholeSectionToHtmlParser"; + + interface LangConfig { + boolean skipSection(final String name); + EntryTypeName sectionNameToEntryType(String sectionName); + boolean skipWikiLink(final WikiTokenizer wikiTokenizer); + String adjustWikiLink(String wikiLinkDest, final String wikiLinkText); + void addFunctionCallbacks( + Map> functionCallbacks); } + static final Map isoToLangConfig = new LinkedHashMap<>(); + static { + final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*"); + isoToLangConfig.put("EN", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return enSkipSections.matcher(headingText).matches(); + } - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - super.onWikiLink(wikiTokenizer); + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonyms")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonyms")) { + return EntryTypeName.ANTONYM_MULTI; + } + // We need to put it in the other index, too (probably) ? + // EnParser.partOfSpeechHeader.matcher(sectionName).matches() + + // Needs special handling? 
+ // sectionName.equalsIgnoreCase("Derived Terms") + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Category:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*"); + isoToLangConfig.put("ES", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return esSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Categoría:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return 
wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + // TODO: need Spanish variant + } + }); + + final Pattern ptSkipSections = Pattern.compile(".*Tradução.*"); + isoToLangConfig.put("PT", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + // Skip headings that match the Portuguese pattern declared above. + return ptSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Categoria:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + // TODO: need Portuguese variant + } + }); + + final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*"); + isoToLangConfig.put("DE", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return deSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonyme")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Gegenwörter")) { + return
null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Kategorie:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + DeFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*"); + isoToLangConfig.put("IT", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return itSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Sinonimi")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Categoria:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + 
ItFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); + + + final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*"); + isoToLangConfig.put("FR", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return frSkipSections.matcher(headingText).matches(); + } + + @Override + public EntryTypeName sectionNameToEntryType(String sectionName) { + if (sectionName.equalsIgnoreCase("Synonymes")) { + return EntryTypeName.SYNONYM_MULTI; + } + if (sectionName.equalsIgnoreCase("Antonymes")) { + return EntryTypeName.ANTONYM_MULTI; + } + return null; + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + return wikiText.startsWith("Catégorie:"); + } + @Override + public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + final int hashPos = wikiLinkDest.indexOf("#"); + if (hashPos != -1) { + wikiLinkDest = wikiLinkDest.substring(0, hashPos); + if (wikiLinkDest.isEmpty()) { + wikiLinkDest = wikiLinkText; + } + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + FrFunctionCallbacks.addGenericCallbacks(functionCallbacks); + } + }); } - @Override - public void onFunction(WikiTokenizer wikiTokenizer, String name, - List args, Map namedArgs) { - super.onFunction(wikiTokenizer, name, args, namedArgs); + final IndexBuilder titleIndexBuilder; + final IndexBuilder defIndexBuilder; + final String skipLangIso; + final LangConfig langConfig; + final String webUrlTemplate; + + + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso, + final String webUrlTemplate) { + this.titleIndexBuilder = titleIndexBuilder; + this.defIndexBuilder = defIndexBuilder; + assert 
isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; + this.langConfig = isoToLangConfig.get(wiktionaryIso); + this.skipLangIso = skipLangIso; + this.webUrlTemplate = webUrlTemplate; } + IndexedEntry indexedEntry = null; + @Override - public void onHtml(WikiTokenizer wikiTokenizer) { - super.onHtml(wikiTokenizer); + public void parseSection(String heading, String text) { + assert entrySource != null; + final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title); + indexedEntry = new IndexedEntry(htmlEntry); + + final AppendAndIndexWikiCallback callback = new AppendCallback( + this); + langConfig.addFunctionCallbacks(callback.functionCallbacks); + + callback.builder = new StringBuilder(); + callback.indexedEntry = indexedEntry; + callback.dispatch(text, null); + + if (webUrlTemplate != null) { + final String webUrl = String.format(webUrlTemplate, title); + String asciiWebUrl = null; + // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases. + try { + asciiWebUrl = URI.create(webUrl).toASCIIString(); + } catch (Exception e) { + } + if (asciiWebUrl != null) { + callback.builder.append("

"); + callback.builder.append(escapeHtmlLiteral(webUrl)); + callback.builder.append(""); + } + } + htmlEntry.html = callback.builder.toString(); + indexedEntry.isValid = true; + + final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title); + tokenData.hasMainEntry = true; + + htmlEntry.addToDictionary(titleIndexBuilder.index.dict); + tokenData.htmlEntries.add(htmlEntry); + // titleIndexBuilder.addEntryWithString(indexedEntry, title, + // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); + + indexedEntry = null; } - + @Override - public void onNewline(WikiTokenizer wikiTokenizer) { + void removeUselessArgs(Map namedArgs) { } @Override - public void onHeading(WikiTokenizer wikiTokenizer) { - final String headingText = wikiTokenizer.headingWikiText(); - final int depth = wikiTokenizer.headingDepth(); - if (skipSections.matcher(headingText).matches()) { - while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { - if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { - wikiTokenizer.returnToLineStart(); - return; - } + public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) { + if (lang == null || lang.equals(skipLangIso)) { + titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); } - return; - } - builder.append(String.format("\n", depth)); - dispatch(headingText, null); - builder.append(String.format("\n", depth)); } - final List listPrefixStack = new ArrayList(); - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') { - builder.append("\n"); - } - final String prefix = wikiTokenizer.listItemPrefix(); - while (listPrefixStack.size() < prefix.length()) { - builder.append(String.format("<%s>", WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); - listPrefixStack.add(prefix.charAt(listPrefixStack.size())); - } - builder.append("

  • "); - dispatch(wikiTokenizer.listItemWikiText(), null); - builder.append("
  • \n"); - - WikiTokenizer nextToken = wikiTokenizer.nextToken(); - boolean returnToLineStart = false; - if (nextToken != null && nextToken.isNewline()) { - nextToken = nextToken.nextToken(); - returnToLineStart = true; - } - final String nextListHeader; - if (nextToken == null || !nextToken.isListItem()) { - nextListHeader = ""; - } else { - nextListHeader = nextToken.listItemPrefix(); - } - if (returnToLineStart) { - wikiTokenizer.returnToLineStart(); - } - while (listPrefixStack.size() > nextListHeader.length()) { - final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1); - builder.append(String.format("\n", WikiTokenizer.getListTag(prefixChar))); - } + public static String escapeHtmlLiteral(final String plainText) { + final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); + if (StringUtil.isAscii(htmlEscaped)) { + return htmlEscaped; + } else { + return StringUtil.escapeUnicodeToPureHtml(plainText); + } + } - boolean boldOn = false; - boolean italicOn = false; - @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - if ("'''".equals(wikiTokenizer.token())) { - if (!boldOn) { - builder.append(""); - } else { - builder.append(""); + + + class AppendCallback extends AppendAndIndexWikiCallback { + public AppendCallback(WholeSectionToHtmlParser parser) { + super(parser); } - boldOn = !boldOn; - } else if ("''".equals(wikiTokenizer.token())) { - if (!italicOn) { - builder.append(""); - } else { - builder.append(""); + + @Override + public void onPlainText(String plainText) { + super.onPlainText(escapeHtmlLiteral(plainText)); + } + + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) { + // Skips wikilinks like: [[en::dick]] + return; + } + if (langConfig.skipWikiLink(wikiTokenizer)) { + return; + } + String linkDest; + if (wikiTokenizer.wikiLinkDest() != null) { + linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), 
wikiTokenizer.wikiLinkText()); + } else { + linkDest = wikiTokenizer.wikiLinkText(); + } + if (sectionEntryTypeName != null) { + // TODO: inside a definition, this could be the wrong language. + titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName); + } + if (!StringUtil.isNullOrEmpty(linkDest)) { + builder.append(""); + super.onWikiLink(wikiTokenizer); + builder.append(""); + } else { + super.onWikiLink(wikiTokenizer); + } } - italicOn = !italicOn; - } else { - assert false; - } + + @Override + public void onFunction(WikiTokenizer wikiTokenizer, String name, + List args, Map namedArgs) { + if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { + namedArgs.remove("lang"); + } + super.onFunction(wikiTokenizer, name, args, namedArgs); + } + + @Override + public void onHtml(WikiTokenizer wikiTokenizer) { + super.onHtml(wikiTokenizer); + } + + @Override + public void onNewline(WikiTokenizer wikiTokenizer) { + } + + EntryTypeName sectionEntryTypeName; + IndexBuilder currentIndexBuilder; + + @Override + public void onHeading(WikiTokenizer wikiTokenizer) { + final String headingText = wikiTokenizer.headingWikiText(); + sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText); + final int depth = wikiTokenizer.headingDepth(); + if (langConfig.skipSection(headingText)) { + //System.out.println("Skipping section:" + headingText); + while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { + if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { + // System.out.println("Resume on: " + wikiTokenizer.token()); + wikiTokenizer.returnToLineStart(); + return; + } else { + // System.out.println("Skipped: " + wikiTokenizer.token()); + } + } + return; + } + builder.append("\n'); + dispatch(headingText, null); + builder.append("\n"); + } + + final List listPrefixStack = new ArrayList<>(); + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + if (builder.length() != 0 && 
builder.charAt(builder.length() - 1) != '\n') { + builder.append("\n"); + } + final String prefix = wikiTokenizer.listItemPrefix(); + while (listPrefixStack.size() < prefix.length()) { + builder.append('<'); + builder.append(WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))); + builder.append('>'); + listPrefixStack.add(prefix.charAt(listPrefixStack.size())); + } + builder.append("
  • "); + dispatch(wikiTokenizer.listItemWikiText(), null); + builder.append("
  • \n"); + + WikiTokenizer nextToken = wikiTokenizer.nextToken(); + boolean returnToLineStart = false; + if (nextToken != null && nextToken.isNewline()) { + nextToken = nextToken.nextToken(); + returnToLineStart = true; + } + final String nextListHeader; + if (nextToken == null || !nextToken.isListItem()) { + nextListHeader = ""; + } else { + nextListHeader = nextToken.listItemPrefix(); + } + if (returnToLineStart) { + wikiTokenizer.returnToLineStart(); + } + while (listPrefixStack.size() > nextListHeader.length()) { + final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1); + builder.append("\n"); + } + } + + boolean boldOn = false; + boolean italicOn = false; + + @Override + public void onMarkup(WikiTokenizer wikiTokenizer) { + if ("'''".equals(wikiTokenizer.token())) { + if (!boldOn) { + builder.append(""); + } else { + builder.append(""); + } + boldOn = !boldOn; + } else if ("''".equals(wikiTokenizer.token())) { + if (!italicOn) { + builder.append(""); + } else { + builder.append(""); + } + italicOn = !italicOn; + } else { + assert false; + } + } + } - - } }