X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fparser%2Fwiktionary%2FWholeSectionToHtmlParser.java;h=0a702d0bcb30125cd9872a244118cdfb0b3f0abc;hb=db5b09de08e526988f90f02d64a0c7e9af3b477d;hp=66ead9cb972c69996e65777e8a459edf386c4889;hpb=1e70d9b640da7def7ec0e70fad2325065bf17fa1;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java index 66ead9c..0a702d0 100644 --- a/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java +++ b/src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java @@ -1,162 +1,255 @@ + package com.hughes.android.dictionary.parser.wiktionary; +import com.hughes.android.dictionary.engine.EntryTypeName; import com.hughes.android.dictionary.engine.HtmlEntry; import com.hughes.android.dictionary.engine.IndexBuilder; import com.hughes.android.dictionary.engine.IndexBuilder.TokenData; import com.hughes.android.dictionary.engine.IndexedEntry; import com.hughes.android.dictionary.parser.WikiTokenizer; +import com.hughes.util.StringUtil; import org.apache.commons.lang3.StringEscapeUtils; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; public class WholeSectionToHtmlParser extends AbstractWiktionaryParser { - - public static final String NAME = "WholeSectionToHtmlParser"; - public static final Pattern skipSections = Pattern.compile(".*Translations.*"); - - final IndexBuilder titleIndexBuilder; - - public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder) { - this.titleIndexBuilder = titleIndexBuilder; - - } - @Override - void parseSection(String heading, String text) { - HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title)); - IndexedEntry indexedEntry = new IndexedEntry(htmlEntry); + public static final String NAME = "WholeSectionToHtmlParser"; - final AppendAndIndexWikiCallback callback = new AppendCallback(this); - - callback.builder = new StringBuilder(); - callback.indexedEntry = indexedEntry; - callback.dispatch(text, null); - - htmlEntry.html = callback.builder.toString(); - indexedEntry.isValid = true; - - final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title); - - htmlEntry.addToDictionary(titleIndexBuilder.index.dict); - tokenData.htmlEntries.add(htmlEntry); - //titleIndexBuilder.addEntryWithString(indexedEntry, title, EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); - } - - @Override - void removeUselessArgs(Map namedArgs) { - } - - class AppendCallback extends AppendAndIndexWikiCallback { - public AppendCallback(WholeSectionToHtmlParser parser) { - super(parser); + interface LangConfig { + boolean skipSection(final String name); + boolean skipWikiLink(final WikiTokenizer wikiTokenizer); + String adjustWikiLink(String wikiLinkDest); + void addFunctionCallbacks( + Map> functionCallbacks); } - - @Override - public void onPlainText(String plainText) { - super.onPlainText(StringEscapeUtils.escapeHtml3(plainText)); + static final Map isoToLangConfig = new LinkedHashMap(); + static { + final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*"); + isoToLangConfig.put("EN", new LangConfig() { + @Override + public boolean skipSection(String headingText) { + return enSkipSections.matcher(headingText).matches(); + } + + @Override + public boolean skipWikiLink(WikiTokenizer wikiTokenizer) { + final String wikiText = wikiTokenizer.wikiLinkText(); + if (wikiText.startsWith("Category:")) { + return true; + } + return false; + } + @Override + public String adjustWikiLink(String wikiLinkDest) { + if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) { + return null; + } + return wikiLinkDest; + } + + @Override + public void addFunctionCallbacks( + Map> functionCallbacks) { + EnFunctionCallbacks.addGenericCallbacks(functionCallbacks); + }}); } - @Override - public void onWikiLink(WikiTokenizer wikiTokenizer) { - super.onWikiLink(wikiTokenizer); + final IndexBuilder titleIndexBuilder; + final String skipLangIso; + final LangConfig langConfig; + + public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) { + this.titleIndexBuilder = titleIndexBuilder; + assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso; + this.langConfig = isoToLangConfig.get(wiktionaryIso); + this.skipLangIso = skipLangIso; } + + IndexedEntry indexedEntry = null; @Override - public void onFunction(WikiTokenizer wikiTokenizer, String name, - List args, Map namedArgs) { - super.onFunction(wikiTokenizer, name, args, namedArgs); + public void parseSection(String heading, String text) { + assert entrySource != null; + final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title)); + indexedEntry = new IndexedEntry(htmlEntry); + + final AppendAndIndexWikiCallback callback = new AppendCallback( + this); + langConfig.addFunctionCallbacks(callback.functionCallbacks); + + callback.builder = new StringBuilder(); + callback.indexedEntry = indexedEntry; + callback.dispatch(text, null); + + htmlEntry.html = callback.builder.toString(); + indexedEntry.isValid = true; + + final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title); + + htmlEntry.addToDictionary(titleIndexBuilder.index.dict); + tokenData.htmlEntries.add(htmlEntry); + // titleIndexBuilder.addEntryWithString(indexedEntry, title, + // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL); + + indexedEntry = null; } @Override - public void onHtml(WikiTokenizer wikiTokenizer) { - super.onHtml(wikiTokenizer); + void removeUselessArgs(Map namedArgs) { } @Override - public void onNewline(WikiTokenizer wikiTokenizer) { + public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) { + titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName); } - @Override - public void onHeading(WikiTokenizer wikiTokenizer) { - final String headingText = wikiTokenizer.headingWikiText(); - final int depth = wikiTokenizer.headingDepth(); - if (skipSections.matcher(headingText).matches()) { - while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { - if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { - wikiTokenizer.returnToLineStart(); - return; - } + + + static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*"); + + class AppendCallback extends AppendAndIndexWikiCallback { + public AppendCallback(WholeSectionToHtmlParser parser) { + super(parser); } - return; - } - builder.append(String.format("\n", depth)); - dispatch(headingText, null); - builder.append(String.format("\n", depth)); - } - final List listPrefixStack = new ArrayList(); - @Override - public void onListItem(WikiTokenizer wikiTokenizer) { - if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') { - builder.append("\n"); - } - final String prefix = wikiTokenizer.listItemPrefix(); - while (listPrefixStack.size() < prefix.length()) { - builder.append(String.format("<%s>", WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); - listPrefixStack.add(prefix.charAt(listPrefixStack.size())); - } - builder.append("
  • "); - dispatch(wikiTokenizer.listItemWikiText(), null); - builder.append("
  • \n"); - - WikiTokenizer nextToken = wikiTokenizer.nextToken(); - boolean returnToLineStart = false; - if (nextToken != null && nextToken.isNewline()) { - nextToken = nextToken.nextToken(); - returnToLineStart = true; - } - final String nextListHeader; - if (nextToken == null || !nextToken.isListItem()) { - nextListHeader = ""; - } else { - nextListHeader = nextToken.listItemPrefix(); - } - if (returnToLineStart) { - wikiTokenizer.returnToLineStart(); - } - while (listPrefixStack.size() > nextListHeader.length()) { - final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1); - builder.append(String.format("\n", WikiTokenizer.getListTag(prefixChar))); - } - } + @Override + public void onPlainText(String plainText) { + final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText); + if (ALL_ASCII.matcher(htmlEscaped).matches()) { + super.onPlainText(htmlEscaped); + } else { + super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText)); + } + } - boolean boldOn = false; - boolean italicOn = false; - @Override - public void onMarkup(WikiTokenizer wikiTokenizer) { - if ("'''".equals(wikiTokenizer.token())) { - if (!boldOn) { - builder.append(""); - } else { - builder.append(""); + @Override + public void onWikiLink(WikiTokenizer wikiTokenizer) { + if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) { + // Skips wikilinks like: [[en::dick]] + return; + } + if (langConfig.skipWikiLink(wikiTokenizer)) { + return; + } + String linkDest; + if (wikiTokenizer.wikiLinkDest() != null) { + linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest()); + } else { + linkDest = wikiTokenizer.wikiLinkText(); + } + if (linkDest != null) { + builder.append(String.format("", linkDest)); + super.onWikiLink(wikiTokenizer); + builder.append(String.format("")); + } else { + super.onWikiLink(wikiTokenizer); + } + } + + @Override + public void onFunction(WikiTokenizer wikiTokenizer, String name, + List args, Map namedArgs) { + if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) { + namedArgs.remove("lang"); + } + super.onFunction(wikiTokenizer, name, args, namedArgs); + } + + @Override + public void onHtml(WikiTokenizer wikiTokenizer) { + super.onHtml(wikiTokenizer); + } + + @Override + public void onNewline(WikiTokenizer wikiTokenizer) { + } + + @Override + public void onHeading(WikiTokenizer wikiTokenizer) { + final String headingText = wikiTokenizer.headingWikiText(); + final int depth = wikiTokenizer.headingDepth(); + if (langConfig.skipSection(headingText)) { + while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) { + if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) { + wikiTokenizer.returnToLineStart(); + return; + } + } + return; + } + builder.append(String.format("\n", depth)); + dispatch(headingText, null); + builder.append(String.format("\n", depth)); + } + + final List listPrefixStack = new ArrayList(); + + @Override + public void onListItem(WikiTokenizer wikiTokenizer) { + if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') { + builder.append("\n"); + } + final String prefix = wikiTokenizer.listItemPrefix(); + while (listPrefixStack.size() < prefix.length()) { + builder.append(String.format("<%s>", + WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())))); + listPrefixStack.add(prefix.charAt(listPrefixStack.size())); + } + builder.append("
  • "); + dispatch(wikiTokenizer.listItemWikiText(), null); + builder.append("
  • \n"); + + WikiTokenizer nextToken = wikiTokenizer.nextToken(); + boolean returnToLineStart = false; + if (nextToken != null && nextToken.isNewline()) { + nextToken = nextToken.nextToken(); + returnToLineStart = true; + } + final String nextListHeader; + if (nextToken == null || !nextToken.isListItem()) { + nextListHeader = ""; + } else { + nextListHeader = nextToken.listItemPrefix(); + } + if (returnToLineStart) { + wikiTokenizer.returnToLineStart(); + } + while (listPrefixStack.size() > nextListHeader.length()) { + final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1); + builder.append(String.format("\n", WikiTokenizer.getListTag(prefixChar))); + } } - boldOn = !boldOn; - } else if ("''".equals(wikiTokenizer.token())) { - if (!italicOn) { - builder.append(""); - } else { - builder.append(""); + + boolean boldOn = false; + boolean italicOn = false; + + @Override + public void onMarkup(WikiTokenizer wikiTokenizer) { + if ("'''".equals(wikiTokenizer.token())) { + if (!boldOn) { + builder.append(""); + } else { + builder.append(""); + } + boldOn = !boldOn; + } else if ("''".equals(wikiTokenizer.token())) { + if (!italicOn) { + builder.append(""); + } else { + builder.append(""); + } + italicOn = !italicOn; + } else { + assert false; + } } - italicOn = !italicOn; - } else { - assert false; - } + } - - } }