/*
 * src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
 *
 * NOTE(review): generic type arguments and the HTML tag literals in this file
 * were not legible in the copy this text was restored from. They have been
 * filled in from how the values are used; each such spot is marked with a
 * NOTE(review) and should be confirmed against the repository.
 */
package com.hughes.android.dictionary.parser.wiktionary;

import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.HtmlEntry;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.parser.WikiTokenizer;
import com.hughes.util.StringUtil;

import org.apache.commons.lang3.StringEscapeUtils;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Renders the wiki text of one section to an HTML string, stores it in an
 * {@link HtmlEntry} titled with the current page title, and attaches that
 * entry to the title's token data in the title index.
 */
public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {

    public static final String NAME = "WholeSectionToHtmlParser";

    /** Per-wiktionary-language hooks consulted while rendering a section. */
    interface LangConfig {
        /** @return true if the section under the given heading text should be dropped. */
        boolean skipSection(final String name);

        /** @return true if the wiki link at the tokenizer's position should emit nothing. */
        boolean skipWikiLink(final WikiTokenizer wikiTokenizer);

        /** @return the link destination to use, or null to render the link text without an anchor. */
        String adjustWikiLink(String wikiLinkDest);

        /**
         * Registers language-specific template callbacks.
         * NOTE(review): the value type FunctionCallback<WholeSectionToHtmlParser>
         * is inferred from the callback instance built in parseSection -- confirm.
         */
        void addFunctionCallbacks(
                Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
    }

    /** Wiktionary ISO code (upper case keys "EN", "FR", "DE", "IT") to its configuration. */
    static final Map<String, LangConfig> isoToLangConfig = new LinkedHashMap<String, LangConfig>();

    static {
        // Alternation binds loosest: this matches ".*Translations", exactly
        // "Anagrams", or "References.*".
        final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
        isoToLangConfig.put("EN", new LangConfig() {
            @Override
            public boolean skipSection(String headingText) {
                return enSkipSections.matcher(headingText).matches();
            }

            @Override
            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                return isCategoryLink(wikiTokenizer);
            }

            @Override
            public String adjustWikiLink(String wikiLinkDest) {
                if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
                    return null;
                }
                return wikiLinkDest;
            }

            @Override
            public void addFunctionCallbacks(
                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
            }
        });

        // Shared by every language that has no special handling yet.
        final LangConfig basicLangConfig = new LangConfig() {
            @Override
            public boolean skipSection(String headingText) {
                return false;
            }

            @Override
            public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
                return isCategoryLink(wikiTokenizer);
            }

            @Override
            public String adjustWikiLink(String wikiLinkDest) {
                return wikiLinkDest;
            }

            @Override
            public void addFunctionCallbacks(
                    Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
                // No language-specific callbacks.
            }
        };
        isoToLangConfig.put("FR", basicLangConfig);
        isoToLangConfig.put("DE", basicLangConfig);
        isoToLangConfig.put("IT", basicLangConfig);
    }

    /** @return true if the current wiki link's text starts with "Category:". */
    private static boolean isCategoryLink(final WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Category:");
    }

    final IndexBuilder titleIndexBuilder;
    final String skipLangIso;
    final LangConfig langConfig;

    /**
     * @param titleIndexBuilder index that receives the generated HTML entries and link tokens
     * @param wiktionaryIso key into {@link #isoToLangConfig}
     * @param skipLangIso a "lang" template argument equal to this (ignoring case) is removed
     *        before the template is handled
     * @throws IllegalArgumentException if no configuration is registered for wiktionaryIso
     */
    public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
        assert isoToLangConfig.containsKey(wiktionaryIso) : wiktionaryIso;
        final LangConfig config = isoToLangConfig.get(wiktionaryIso);
        if (config == null) {
            // Without this check a run with assertions disabled would only fail
            // later, with a NullPointerException inside parseSection.
            throw new IllegalArgumentException("No LangConfig for wiktionary ISO: " + wiktionaryIso);
        }
        this.titleIndexBuilder = titleIndexBuilder;
        this.langConfig = config;
        this.skipLangIso = skipLangIso;
    }

    /** Entry being built by the parseSection call in progress; null between calls. */
    IndexedEntry indexedEntry = null;

    /**
     * Renders {@code text} to HTML and adds the resulting entry to the title index.
     * The {@code heading} argument is not used by this implementation.
     */
    @Override
    public void parseSection(String heading, String text) {
        assert entrySource != null;
        final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
        indexedEntry = new IndexedEntry(htmlEntry);
        try {
            final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback =
                    new AppendCallback(this);
            langConfig.addFunctionCallbacks(callback.functionCallbacks);

            callback.builder = new StringBuilder();
            callback.indexedEntry = indexedEntry;
            callback.dispatch(text, null);

            htmlEntry.html = callback.builder.toString();
            indexedEntry.isValid = true;

            final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
            htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
            tokenData.htmlEntries.add(htmlEntry);
        } finally {
            // Reset even when rendering throws, so a later call never sees a stale entry.
            indexedEntry = null;
        }
    }

    /** Intentionally a no-op: this parser keeps all named template arguments. */
    @Override
    void removeUselessArgs(Map<String, String> namedArgs) {
    }

    /** Indexes {@code token} as pointing at the entry currently being built. */
    @Override
    public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
        titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
    }

    static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");

    /**
     * Token callback that appends HTML for each wiki token to {@code builder}.
     * NOTE(review): the superclass type argument is inferred from the
     * constructor taking a WholeSectionToHtmlParser -- confirm.
     */
    class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
        public AppendCallback(WholeSectionToHtmlParser parser) {
            super(parser);
        }

        /** Emits plain text HTML-escaped; text that is not pure ASCII after escaping goes through StringUtil instead. */
        @Override
        public void onPlainText(String plainText) {
            final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
            if (ALL_ASCII.matcher(htmlEscaped).matches()) {
                super.onPlainText(htmlEscaped);
            } else {
                super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
            }
        }

        /** Wraps the link text in an anchor unless the link is skipped or has no usable destination. */
        @Override
        public void onWikiLink(WikiTokenizer wikiTokenizer) {
            if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
                // Skips wikilinks like: [[en::dick]]
                return;
            }
            if (langConfig.skipWikiLink(wikiTokenizer)) {
                return;
            }
            final String linkDest;
            if (wikiTokenizer.wikiLinkDest() != null) {
                linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
            } else {
                linkDest = wikiTokenizer.wikiLinkText();
            }
            if (linkDest != null) {
                // NOTE(review): anchor markup restored; the destination is
                // inserted unescaped -- confirm that is what consumers expect.
                builder.append(String.format("<a href=\"%s\">", linkDest));
                super.onWikiLink(wikiTokenizer);
                builder.append("</a>");
            } else {
                super.onWikiLink(wikiTokenizer);
            }
        }

        /** Drops a "lang" argument naming the skipped language, then delegates. */
        @Override
        public void onFunction(WikiTokenizer wikiTokenizer, String name,
                List<String> args, Map<String, String> namedArgs) {
            if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
                namedArgs.remove("lang");
            }
            super.onFunction(wikiTokenizer, name, args, namedArgs);
        }

        @Override
        public void onHtml(WikiTokenizer wikiTokenizer) {
            super.onHtml(wikiTokenizer);
        }

        /** Newline tokens produce no output. */
        @Override
        public void onNewline(WikiTokenizer wikiTokenizer) {
        }

        /**
         * Emits a heading element, or, for a skipped section, consumes tokens up
         * to the next heading of the same or a shallower depth.
         */
        @Override
        public void onHeading(WikiTokenizer wikiTokenizer) {
            final String headingText = wikiTokenizer.headingWikiText();
            final int depth = wikiTokenizer.headingDepth();
            if (langConfig.skipSection(headingText)) {
                while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
                    if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
                        // Leave that heading to be handled by the caller's loop.
                        wikiTokenizer.returnToLineStart();
                        return;
                    }
                }
                return;
            }
            // append(int) is locale-independent, unlike String.format("%d").
            // NOTE(review): tag text and newline placement restored -- confirm.
            builder.append("\n<h").append(depth).append(">");
            dispatch(headingText, null);
            builder.append("</h").append(depth).append(">\n");
        }

        /** Prefix characters of the list levels that are currently open, outermost first. */
        final List<Character> listPrefixStack = new ArrayList<Character>();

        /**
         * Emits one list item, opening list elements as the prefix deepens and
         * closing them when the following token is a shallower item or not a list item.
         */
        @Override
        public void onListItem(WikiTokenizer wikiTokenizer) {
            if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
                builder.append("\n");
            }
            final String prefix = wikiTokenizer.listItemPrefix();
            while (listPrefixStack.size() < prefix.length()) {
                final char levelChar = prefix.charAt(listPrefixStack.size());
                builder.append("<").append(WikiTokenizer.getListTag(levelChar)).append(">");
                listPrefixStack.add(levelChar);
            }
            // NOTE(review): <li> / </li> restored -- confirm.
            builder.append("<li>");
            dispatch(wikiTokenizer.listItemWikiText(), null);
            builder.append("</li>\n");

            // Look ahead (past one newline, if present) to see how deep the next item is.
            WikiTokenizer nextToken = wikiTokenizer.nextToken();
            boolean returnToLineStart = false;
            if (nextToken != null && nextToken.isNewline()) {
                nextToken = nextToken.nextToken();
                returnToLineStart = true;
            }
            final String nextListHeader;
            if (nextToken == null || !nextToken.isListItem()) {
                nextListHeader = "";
            } else {
                nextListHeader = nextToken.listItemPrefix();
            }
            if (returnToLineStart) {
                wikiTokenizer.returnToLineStart();
            }
            while (listPrefixStack.size() > nextListHeader.length()) {
                final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
                builder.append("</").append(WikiTokenizer.getListTag(prefixChar)).append(">\n");
            }
        }

        boolean boldOn = false;
        boolean italicOn = false;

        /** Toggles bold on ''' and italic on ''; any other markup token trips an assertion. */
        @Override
        public void onMarkup(WikiTokenizer wikiTokenizer) {
            // NOTE(review): <b> and <em> tag names restored -- confirm.
            if ("'''".equals(wikiTokenizer.token())) {
                builder.append(boldOn ? "</b>" : "<b>");
                boldOn = !boldOn;
            } else if ("''".equals(wikiTokenizer.token())) {
                builder.append(italicOn ? "</em>" : "<em>");
                italicOn = !italicOn;
            } else {
                assert false;
            }
        }

    }

}