2 package com.hughes.android.dictionary.parser.wiktionary;
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
12 import org.apache.commons.lang3.StringEscapeUtils;
14 import java.util.ArrayList;
15 import java.util.LinkedHashMap;
16 import java.util.List;
18 import java.util.regex.Pattern;
20 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22 public static final String NAME = "WholeSectionToHtmlParser";
24 interface LangConfig {
25 boolean skipSection(final String name);
26 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
27 String adjustWikiLink(String wikiLinkDest);
28 void addFunctionCallbacks(
29 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
31 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
33 final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
34 isoToLangConfig.put("EN", new LangConfig() {
36 public boolean skipSection(String headingText) {
37 return enSkipSections.matcher(headingText).matches();
41 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
42 final String wikiText = wikiTokenizer.wikiLinkText();
43 if (wikiText.startsWith("Category:")) {
49 public String adjustWikiLink(String wikiLinkDest) {
50 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
57 public void addFunctionCallbacks(
58 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
59 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
62 final LangConfig basicLangConfig = new LangConfig() {
64 public boolean skipSection(String headingText) {
69 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
70 final String wikiText = wikiTokenizer.wikiLinkText();
71 if (wikiText.startsWith("Category:")) {
77 public String adjustWikiLink(String wikiLinkDest) {
82 public void addFunctionCallbacks(
83 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86 isoToLangConfig.put("FR", basicLangConfig);
87 isoToLangConfig.put("DE", basicLangConfig);
88 isoToLangConfig.put("IT", basicLangConfig);
91 final IndexBuilder titleIndexBuilder;
92 final String skipLangIso;
93 final LangConfig langConfig;
95 public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
96 this.titleIndexBuilder = titleIndexBuilder;
97 assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
98 this.langConfig = isoToLangConfig.get(wiktionaryIso);
99 this.skipLangIso = skipLangIso;
102 IndexedEntry indexedEntry = null;
105 public void parseSection(String heading, String text) {
106 assert entrySource != null;
107 final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
108 indexedEntry = new IndexedEntry(htmlEntry);
110 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
112 langConfig.addFunctionCallbacks(callback.functionCallbacks);
114 callback.builder = new StringBuilder();
115 callback.indexedEntry = indexedEntry;
116 callback.dispatch(text, null);
118 htmlEntry.html = callback.builder.toString();
119 indexedEntry.isValid = true;
121 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
123 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
124 tokenData.htmlEntries.add(htmlEntry);
125 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
126 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
132 void removeUselessArgs(Map<String, String> namedArgs) {
136 public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
137 titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
142 static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
144 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
145 public AppendCallback(WholeSectionToHtmlParser parser) {
150 public void onPlainText(String plainText) {
151 final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
152 if (ALL_ASCII.matcher(htmlEscaped).matches()) {
153 super.onPlainText(htmlEscaped);
155 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
160 public void onWikiLink(WikiTokenizer wikiTokenizer) {
161 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
162 // Skips wikilinks like: [[en::dick]]
165 if (langConfig.skipWikiLink(wikiTokenizer)) {
169 if (wikiTokenizer.wikiLinkDest() != null) {
170 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
172 linkDest = wikiTokenizer.wikiLinkText();
174 if (linkDest != null) {
175 builder.append(String.format("<a href=\"%s\">", linkDest));
176 super.onWikiLink(wikiTokenizer);
177 builder.append(String.format("</a>"));
179 super.onWikiLink(wikiTokenizer);
184 public void onFunction(WikiTokenizer wikiTokenizer, String name,
185 List<String> args, Map<String, String> namedArgs) {
186 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
187 namedArgs.remove("lang");
189 super.onFunction(wikiTokenizer, name, args, namedArgs);
193 public void onHtml(WikiTokenizer wikiTokenizer) {
194 super.onHtml(wikiTokenizer);
198 public void onNewline(WikiTokenizer wikiTokenizer) {
202 public void onHeading(WikiTokenizer wikiTokenizer) {
203 final String headingText = wikiTokenizer.headingWikiText();
204 final int depth = wikiTokenizer.headingDepth();
205 if (langConfig.skipSection(headingText)) {
206 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
207 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
208 wikiTokenizer.returnToLineStart();
214 builder.append(String.format("\n<h%d>", depth));
215 dispatch(headingText, null);
216 builder.append(String.format("</h%d>\n", depth));
219 final List<Character> listPrefixStack = new ArrayList<Character>();
222 public void onListItem(WikiTokenizer wikiTokenizer) {
223 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
224 builder.append("\n");
226 final String prefix = wikiTokenizer.listItemPrefix();
227 while (listPrefixStack.size() < prefix.length()) {
228 builder.append(String.format("<%s>",
229 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
230 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
232 builder.append("<li>");
233 dispatch(wikiTokenizer.listItemWikiText(), null);
234 builder.append("</li>\n");
236 WikiTokenizer nextToken = wikiTokenizer.nextToken();
237 boolean returnToLineStart = false;
238 if (nextToken != null && nextToken.isNewline()) {
239 nextToken = nextToken.nextToken();
240 returnToLineStart = true;
242 final String nextListHeader;
243 if (nextToken == null || !nextToken.isListItem()) {
246 nextListHeader = nextToken.listItemPrefix();
248 if (returnToLineStart) {
249 wikiTokenizer.returnToLineStart();
251 while (listPrefixStack.size() > nextListHeader.length()) {
252 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
253 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
257 boolean boldOn = false;
258 boolean italicOn = false;
261 public void onMarkup(WikiTokenizer wikiTokenizer) {
262 if ("'''".equals(wikiTokenizer.token())) {
264 builder.append("<b>");
266 builder.append("</b>");
269 } else if ("''".equals(wikiTokenizer.token())) {
271 builder.append("<em>");
273 builder.append("</em>");
275 italicOn = !italicOn;