2 package com.hughes.android.dictionary.parser.wiktionary;
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import org.apache.commons.lang3.StringEscapeUtils;
12 import java.util.ArrayList;
13 import java.util.LinkedHashMap;
14 import java.util.List;
16 import java.util.regex.Pattern;
/**
 * Wiktionary parser that renders the wiki text of a section to HTML and
 * attaches the resulting HtmlEntry to the page title's token in a title index
 * (see parseSection).
 */
18 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
// Name constant for this parser. NOTE(review): presumably used to select the parser by name — confirm against callers.
20 public static final String NAME = "WholeSectionToHtmlParser";
/**
 * Per-Wiktionary-language hooks consulted while a section is rendered to HTML.
 * Implementations are registered in isoToLangConfig and chosen by the
 * constructor from the wiktionaryIso argument.
 */
22 interface LangConfig {
// Called by onHeading with the heading's wiki text. A true result sends onHeading into its skip loop,
// which advances the tokenizer to the next heading of equal or shallower depth.
23 boolean skipSection(final String name);
// Called by onWikiLink before any link markup is emitted.
// NOTE(review): presumably a true result suppresses the link entirely — confirm.
24 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
// Maps an explicit wiki-link destination to the value used for the href attribute.
// onWikiLink emits no <a> wrapper when the result is null.
25 String adjustWikiLink(String wikiLinkDest);
// Adds this language's function callbacks to the given map; parseSection passes the
// functionCallbacks map of the callback it is about to run.
26 void addFunctionCallbacks(
27 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
// Registry of LangConfig implementations keyed by Wiktionary ISO code ("EN" is registered below).
// The constructor looks its LangConfig up here.
29 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
// Heading filter for the English config. It is applied with matches(), so the WHOLE heading text must
// match one of the three alternatives:
//   ".*Translations"  - heading ends with "Translations"
//   "Anagrams"        - heading is exactly "Anagrams"
//   "References.*"    - heading starts with "References"
// NOTE(review): the alternation is not grouped, so the leading and trailing ".*" each bind to a single
// alternative. Confirm this asymmetry is intended rather than ".*(Translations|Anagrams|References).*".
31 final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
32 isoToLangConfig.put("EN", new LangConfig() {
// A section is skipped when its heading text fully matches enSkipSections.
34 public boolean skipSection(String headingText) {
35 return enSkipSections.matcher(headingText).matches();
// Tests the link's display text for a "Category:" prefix.
// NOTE(review): presumably category links are the ones skipped — confirm.
39 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
40 final String wikiText = wikiTokenizer.wikiLinkText();
41 if (wikiText.startsWith("Category:")) {
// Tests the destination for a "w:" (NOTE(review): presumably Wikipedia interwiki — confirm) or "Image:" prefix.
// NOTE(review): the result for such destinations is presumably null (no anchor emitted) — confirm.
47 public String adjustWikiLink(String wikiLinkDest) {
48 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
// The English config registers the generic callback set from EnFunctionCallbacks.
55 public void addFunctionCallbacks(
56 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
57 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
// Index that receives one HtmlEntry per parsed section, filed under the page title.
61 final IndexBuilder titleIndexBuilder;
// Language code compared (case-insensitively) with a function call's "lang" named argument in
// AppendCallback.onFunction; a matching "lang" argument is removed before the call is dispatched.
62 final String skipLangIso;
// Language-specific hooks selected from isoToLangConfig by the constructor.
63 final LangConfig langConfig;
/**
 * Creates a parser bound to a title index and a Wiktionary language.
 *
 * @param titleIndexBuilder index that parsed sections are added to
 * @param wiktionaryIso key into isoToLangConfig selecting the language hooks
 * @param skipLangIso language code whose "lang" template argument is dropped; it is the receiver of
 *        equalsIgnoreCase in onFunction, so it must not be null
 */
65 public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
66 this.titleIndexBuilder = titleIndexBuilder;
// NOTE(review): this check only runs with assertions enabled (-ea). Without it, an unregistered ISO
// code leaves langConfig null, because Map.get returns null for a missing key.
67 assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
68 this.langConfig = isoToLangConfig.get(wiktionaryIso);
69 this.skipLangIso = skipLangIso;
/**
 * Renders one section's wiki text to HTML and attaches the result to the title index.
 *
 * @param heading section heading. NOTE(review): appears unused in this method — confirm.
 * @param text wiki text of the section, dispatched through an AppendCallback
 */
73 void parseSection(String heading, String text) {
// The entry is labelled with the HTML-escaped page title.
// NOTE(review): entrySource and title are not fields declared in this class; presumably inherited
// from AbstractWiktionaryParser — confirm.
74 HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
75 IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
77 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
79 langConfig.addFunctionCallbacks(callback.functionCallbacks);
// Give the callback a fresh output buffer and the entry being built, then walk the section text.
81 callback.builder = new StringBuilder();
82 callback.indexedEntry = indexedEntry;
83 callback.dispatch(text, null);
// Whatever the callbacks appended to the builder becomes the entry's HTML.
85 htmlEntry.html = callback.builder.toString();
86 indexedEntry.isValid = true;
// File the entry under the (unescaped) page title in the title index.
88 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
90 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
91 tokenData.htmlEntries.add(htmlEntry);
92 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
93 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
// Receives a map of named arguments (name to value).
// NOTE(review): judging by the name this prunes entries from the map in place — confirm which keys.
97 void removeUselessArgs(Map<String, String> namedArgs) {
/**
 * Wiki-token callback that writes HTML into the inherited builder: plain text is HTML-escaped, wiki
 * links become anchors, headings become h-tags and list items become nested list markup.
 * It is a non-static inner class because it reads the parser's title, langConfig and skipLangIso.
 */
100 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
// NOTE(review): presumably forwards the parser to the superclass constructor — confirm.
101 public AppendCallback(WholeSectionToHtmlParser parser) {
// HTML-escapes plain wiki text (escapeHtml3) before handing it to the superclass handler.
106 public void onPlainText(String plainText) {
107 super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
/**
 * Handles a wiki link. Links whose text ends with ":" + title, and links rejected by
 * langConfig.skipWikiLink, are checked first; otherwise the link is delegated to the superclass,
 * wrapped in an anchor when a destination is available.
 */
111 public void onWikiLink(WikiTokenizer wikiTokenizer) {
112 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
113 // Skips wikilinks like: [[en::dick]]
116 if (langConfig.skipWikiLink(wikiTokenizer)) {
// An explicit destination goes through the language config (which may return null);
// a link without one uses its own text as the destination.
120 if (wikiTokenizer.wikiLinkDest() != null) {
121 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
123 linkDest = wikiTokenizer.wikiLinkText();
// With a destination the superclass output is wrapped in <a href=...>; without one it is emitted bare.
// NOTE(review): linkDest is interpolated into the href attribute without escaping, so a destination
// containing a double quote or '&' would produce malformed HTML — consider escaping it.
125 if (linkDest != null) {
126 builder.append(String.format("<a href=\"%s\">", linkDest));
127 super.onWikiLink(wikiTokenizer);
128 builder.append(String.format("</a>"));
130 super.onWikiLink(wikiTokenizer);
/**
 * Handles a template/function call. The "lang" named argument is removed when it equals skipLangIso
 * (ignoring case), then the call is delegated to the superclass. A missing "lang" argument is harmless:
 * equalsIgnoreCase(null) is false.
 */
135 public void onFunction(WikiTokenizer wikiTokenizer, String name,
136 List<String> args, Map<String, String> namedArgs) {
137 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
138 namedArgs.remove("lang");
140 super.onFunction(wikiTokenizer, name, args, namedArgs);
// HTML tokens in the wiki text are passed straight to the superclass handler.
144 public void onHtml(WikiTokenizer wikiTokenizer) {
145 super.onHtml(wikiTokenizer);
// Handler for newline tokens. NOTE(review): confirm whether newlines are meant to produce any output.
149 public void onNewline(WikiTokenizer wikiTokenizer) {
/**
 * Handles a heading token. If the language config says the section is to be skipped, the tokenizer is
 * advanced to the next heading of equal or shallower depth; otherwise the heading is emitted as an
 * h-tag whose level is the heading depth.
 */
153 public void onHeading(WikiTokenizer wikiTokenizer) {
154 final String headingText = wikiTokenizer.headingWikiText();
155 final int depth = wikiTokenizer.headingDepth();
156 if (langConfig.skipSection(headingText)) {
// Consume tokens (reassigning the parameter) until a heading at this depth or shallower shows up,
// then rewind to the start of that heading's line.
// NOTE(review): the rewind is presumably so the caller processes that heading next — confirm.
157 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
158 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
159 wikiTokenizer.returnToLineStart();
// Heading text is itself wiki text, so it is dispatched through the callbacks between the tags.
165 builder.append(String.format("\n<h%d>", depth));
166 dispatch(headingText, null);
167 builder.append(String.format("</h%d>\n", depth));
// Prefix characters of the list tags currently open, outermost first; its size is the current
// nesting depth. Maintained by onListItem.
170 final List<Character> listPrefixStack = new ArrayList<Character>();
/**
 * Handles one list item: opens list tags until the nesting depth matches the item's prefix length,
 * emits the item, then looks ahead at the next token to decide how many list tags to close.
 */
173 public void onListItem(WikiTokenizer wikiTokenizer) {
// Start the item on its own line unless the buffer is empty or already ends with a newline.
174 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
175 builder.append("\n");
177 final String prefix = wikiTokenizer.listItemPrefix();
// Open one list tag per prefix character that is deeper than the current nesting depth.
178 while (listPrefixStack.size() < prefix.length()) {
179 builder.append(String.format("<%s>",
180 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
181 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
// The item body is wiki text and is dispatched through the callbacks.
183 builder.append("<li>");
184 dispatch(wikiTokenizer.listItemWikiText(), null);
185 builder.append("</li>\n");
// Look ahead: step over one newline token, if present, to reach whatever follows this item.
187 WikiTokenizer nextToken = wikiTokenizer.nextToken();
188 boolean returnToLineStart = false;
189 if (nextToken != null && nextToken.isNewline()) {
190 nextToken = nextToken.nextToken();
191 returnToLineStart = true;
// nextListHeader is the prefix of the following list item.
// NOTE(review): when the next token is missing or not a list item it is presumably set to an empty
// string so that every open list is closed below — confirm.
193 final String nextListHeader;
194 if (nextToken == null || !nextToken.isListItem()) {
197 nextListHeader = nextToken.listItemPrefix();
// Rewind after the look-ahead stepped over a newline.
199 if (returnToLineStart) {
200 wikiTokenizer.returnToLineStart();
// Close the innermost list tags until the depth equals the next item's prefix length.
202 while (listPrefixStack.size() > nextListHeader.length()) {
203 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
204 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
208 boolean boldOn = false;
209 boolean italicOn = false;
212 public void onMarkup(WikiTokenizer wikiTokenizer) {
213 if ("'''".equals(wikiTokenizer.token())) {
215 builder.append("<b>");
217 builder.append("</b>");
220 } else if ("''".equals(wikiTokenizer.token())) {
222 builder.append("<em>");
224 builder.append("</em>");
226 italicOn = !italicOn;