2 package com.hughes.android.dictionary.parser.wiktionary;
import java.net.URI;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.text.StringEscapeUtils;

import com.hughes.android.dictionary.engine.EntryTypeName;
import com.hughes.android.dictionary.engine.HtmlEntry;
import com.hughes.android.dictionary.engine.IndexBuilder;
import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
import com.hughes.android.dictionary.engine.IndexedEntry;
import com.hughes.android.dictionary.parser.WikiTokenizer;
import com.hughes.util.StringUtil;
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
23 public static final String NAME = "WholeSectionToHtmlParser";
25 interface LangConfig {
26 boolean skipSection(final String name);
27 EntryTypeName sectionNameToEntryType(String sectionName);
28 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29 String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30 void addFunctionCallbacks(
31 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
33 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<>();
// English Wiktionary: any heading containing one of these words is skipped entirely.
final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
isoToLangConfig.put("EN", new LangConfig() {
    @Override
    public boolean skipSection(String headingText) {
        return enSkipSections.matcher(headingText).matches();
    }

    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Synonyms")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antonyms")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        // We need to put it in the other index, too (probably) ?
        // EnParser.partOfSpeechHeader.matcher(sectionName).matches()

        // Needs special handling?
        // sectionName.equalsIgnoreCase("Derived Terms")

        // Every other heading has no dedicated entry type.
        return null;
    }

    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Category:");
    }

    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            // These destinations are not linked.
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        // Only strip a fragment when one exists; substring(0, -1) would throw.
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                // A pure "#fragment" destination: fall back to the link text.
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// Spanish Wiktionary: any heading containing one of these words is skipped entirely.
final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
isoToLangConfig.put("ES", new LangConfig() {
    @Override
    public boolean skipSection(String headingText) {
        return esSkipSections.matcher(headingText).matches();
    }

    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        // Every other heading has no dedicated entry type.
        return null;
    }

    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Categoría:");
    }

    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            // These destinations are not linked.
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        // Only strip a fragment when one exists; substring(0, -1) would throw.
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                // A pure "#fragment" destination: fall back to the link text.
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        // TODO: need Spanish variant
    }
});
// Portuguese Wiktionary: any heading containing this word is skipped entirely.
final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
isoToLangConfig.put("PT", new LangConfig() {
    @Override
    public boolean skipSection(String headingText) {
        // Must use the Portuguese pattern; matching against the Spanish one left
        // ptSkipSections unused and never skipped "Tradução" sections.
        return ptSkipSections.matcher(headingText).matches();
    }

    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        // Every other heading has no dedicated entry type.
        return null;
    }

    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Categoria:");
    }

    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            // These destinations are not linked.
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        // Only strip a fragment when one exists; substring(0, -1) would throw.
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                // A pure "#fragment" destination: fall back to the link text.
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        // TODO: need Portuguese variant
    }
});
// German Wiktionary: any heading containing one of these words is skipped entirely.
final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
isoToLangConfig.put("DE", new LangConfig() {
    @Override
    public boolean skipSection(String headingText) {
        return deSkipSections.matcher(headingText).matches();
    }

    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Synonyme")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Gegenwörter")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        // Every other heading has no dedicated entry type.
        return null;
    }

    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Kategorie:");
    }

    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            // These destinations are not linked.
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        // Only strip a fragment when one exists; substring(0, -1) would throw.
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                // A pure "#fragment" destination: fall back to the link text.
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// Italian Wiktionary: any heading containing one of these phrases is skipped entirely.
final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
isoToLangConfig.put("IT", new LangConfig() {
    @Override
    public boolean skipSection(String headingText) {
        return itSkipSections.matcher(headingText).matches();
    }

    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Sinonimi")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        // Every other heading has no dedicated entry type.
        return null;
    }

    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Categoria:");
    }

    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            // These destinations are not linked.
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        // Only strip a fragment when one exists; substring(0, -1) would throw.
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                // A pure "#fragment" destination: fall back to the link text.
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// French Wiktionary: any heading containing one of these words is skipped entirely.
final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
isoToLangConfig.put("FR", new LangConfig() {
    @Override
    public boolean skipSection(String headingText) {
        return frSkipSections.matcher(headingText).matches();
    }

    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Synonymes")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antonymes")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        // Every other heading has no dedicated entry type.
        return null;
    }

    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Catégorie:");
    }

    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            // These destinations are not linked.
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        // Only strip a fragment when one exists; substring(0, -1) would throw.
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                // A pure "#fragment" destination: fall back to the link text.
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
312 final IndexBuilder titleIndexBuilder;
313 final IndexBuilder defIndexBuilder;
314 final String skipLangIso;
315 final LangConfig langConfig;
316 final String webUrlTemplate;
319 public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
320 final String webUrlTemplate) {
321 this.titleIndexBuilder = titleIndexBuilder;
322 this.defIndexBuilder = defIndexBuilder;
323 assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
324 this.langConfig = isoToLangConfig.get(wiktionaryIso);
325 this.skipLangIso = skipLangIso;
326 this.webUrlTemplate = webUrlTemplate;
329 IndexedEntry indexedEntry = null;
332 public void parseSection(String heading, String text) {
333 assert entrySource != null;
334 final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
335 indexedEntry = new IndexedEntry(htmlEntry);
337 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
339 langConfig.addFunctionCallbacks(callback.functionCallbacks);
341 callback.builder = new StringBuilder();
342 callback.indexedEntry = indexedEntry;
343 callback.dispatch(text, null);
345 if (webUrlTemplate != null) {
346 final String webUrl = String.format(webUrlTemplate, title);
347 boolean success = true;
348 // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
350 String asciiWebUrl = URI.create(webUrl).toASCIIString();
351 } catch (Exception e) {
355 callback.builder.append("<p> <a href=\"");
356 callback.builder.append(asciiWebUrl);
357 callback.builder.append("\">");
358 callback.builder.append(escapeHtmlLiteral(webUrl));
359 callback.builder.append("</a>");
362 htmlEntry.html = callback.builder.toString();
363 indexedEntry.isValid = true;
365 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
366 tokenData.hasMainEntry = true;
368 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
369 tokenData.htmlEntries.add(htmlEntry);
370 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
371 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
377 void removeUselessArgs(Map<String, String> namedArgs) {
381 public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
382 if (lang == null || lang.equals(skipLangIso)) {
383 titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
387 public static String escapeHtmlLiteral(final String plainText) {
388 final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
389 if (StringUtil.isAscii(htmlEscaped)) {
392 return StringUtil.escapeUnicodeToPureHtml(plainText);
399 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
/** Creates a callback bound to the given parser. */
public AppendCallback(WholeSectionToHtmlParser parser) {
    super(parser);
}
405 public void onPlainText(String plainText) {
406 super.onPlainText(escapeHtmlLiteral(plainText));
410 public void onWikiLink(WikiTokenizer wikiTokenizer) {
411 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
412 // Skips wikilinks like: [[en::dick]]
415 if (langConfig.skipWikiLink(wikiTokenizer)) {
419 if (wikiTokenizer.wikiLinkDest() != null) {
420 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
422 linkDest = wikiTokenizer.wikiLinkText();
424 if (sectionEntryTypeName != null) {
425 // TODO: inside a definition, this could be the wrong language.
426 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
428 if (!StringUtil.isNullOrEmpty(linkDest)) {
429 builder.append("<a href=\"");
430 builder.append(HtmlEntry.formatQuickdicUrl("", linkDest));
431 builder.append("\">");
432 super.onWikiLink(wikiTokenizer);
433 builder.append("</a>");
435 super.onWikiLink(wikiTokenizer);
440 public void onFunction(WikiTokenizer wikiTokenizer, String name,
441 List<String> args, Map<String, String> namedArgs) {
442 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
443 namedArgs.remove("lang");
445 super.onFunction(wikiTokenizer, name, args, namedArgs);
449 public void onHtml(WikiTokenizer wikiTokenizer) {
450 super.onHtml(wikiTokenizer);
454 public void onNewline(WikiTokenizer wikiTokenizer) {
457 EntryTypeName sectionEntryTypeName;
458 IndexBuilder currentIndexBuilder;
461 public void onHeading(WikiTokenizer wikiTokenizer) {
462 final String headingText = wikiTokenizer.headingWikiText();
463 sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
464 final int depth = wikiTokenizer.headingDepth();
465 if (langConfig.skipSection(headingText)) {
466 //System.out.println("Skipping section:" + headingText);
467 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
468 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
469 // System.out.println("Resume on: " + wikiTokenizer.token());
470 wikiTokenizer.returnToLineStart();
473 // System.out.println("Skipped: " + wikiTokenizer.token());
478 builder.append("\n<h");
479 builder.append(depth);
481 dispatch(headingText, null);
482 builder.append("</h");
483 builder.append(depth);
484 builder.append(">\n");
487 final List<Character> listPrefixStack = new ArrayList<>();
490 public void onListItem(WikiTokenizer wikiTokenizer) {
491 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
492 builder.append("\n");
494 final String prefix = wikiTokenizer.listItemPrefix();
495 while (listPrefixStack.size() < prefix.length()) {
497 builder.append(WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())));
499 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
501 builder.append("<li>");
502 dispatch(wikiTokenizer.listItemWikiText(), null);
503 builder.append("</li>\n");
505 WikiTokenizer nextToken = wikiTokenizer.nextToken();
506 boolean returnToLineStart = false;
507 if (nextToken != null && nextToken.isNewline()) {
508 nextToken = nextToken.nextToken();
509 returnToLineStart = true;
511 final String nextListHeader;
512 if (nextToken == null || !nextToken.isListItem()) {
515 nextListHeader = nextToken.listItemPrefix();
517 if (returnToLineStart) {
518 wikiTokenizer.returnToLineStart();
520 while (listPrefixStack.size() > nextListHeader.length()) {
521 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
522 builder.append("</");
523 builder.append(WikiTokenizer.getListTag(prefixChar));
524 builder.append(">\n");
528 boolean boldOn = false;
529 boolean italicOn = false;
532 public void onMarkup(WikiTokenizer wikiTokenizer) {
533 if ("'''".equals(wikiTokenizer.token())) {
535 builder.append("<b>");
537 builder.append("</b>");
540 } else if ("''".equals(wikiTokenizer.token())) {
542 builder.append("<em>");
544 builder.append("</em>");
546 italicOn = !italicOn;