2 package com.hughes.android.dictionary.parser.wiktionary;
5 import java.util.ArrayList;
6 import java.util.LinkedHashMap;
9 import java.util.regex.Pattern;
11 import org.apache.commons.text.StringEscapeUtils;
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.HtmlEntry;
15 import com.hughes.android.dictionary.engine.IndexBuilder;
16 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
17 import com.hughes.android.dictionary.engine.IndexedEntry;
18 import com.hughes.android.dictionary.parser.WikiTokenizer;
19 import com.hughes.util.StringUtil;
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
23 public static final String NAME = "WholeSectionToHtmlParser";
25 interface LangConfig {
26 boolean skipSection(final String name);
27 EntryTypeName sectionNameToEntryType(String sectionName);
28 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29 String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30 void addFunctionCallbacks(
31 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
33 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<>();
// English Wiktionary: a heading is skipped when its whole text matches this pattern.
final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
isoToLangConfig.put("EN", new LangConfig() {
    /** Returns true when {@code headingText} matches {@code enSkipSections} in full. */
    @Override
    public boolean skipSection(String headingText) {
        return enSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings (case-insensitive) to entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        EntryTypeName entryType = null;
        if (sectionName.equalsIgnoreCase("Synonyms")) {
            entryType = EntryTypeName.SYNONYM_MULTI;
        } else if (sectionName.equalsIgnoreCase("Antonyms")) {
            entryType = EntryTypeName.ANTONYM_MULTI;
        }
        // We need to put it in the other index, too (probably) ?
        // EnParser.partOfSpeechHeader.matcher(sectionName).matches()

        // Needs special handling?
        // sectionName.equalsIgnoreCase("Derived Terms")
        return entryType;
    }

    /** Omits links whose text starts with the category namespace prefix. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Category:");
    }

    /**
     * Returns null for destinations starting with "w:" or "Image:"; otherwise strips
     * everything from the first '#' on, falling back to the link text when nothing
     * remains in front of the '#'.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        final boolean unlinkable =
            wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:");
        if (unlinkable) {
            return null;
        }
        final int anchorStart = wikiLinkDest.indexOf('#');
        if (anchorStart < 0) {
            return wikiLinkDest;
        }
        final String withoutAnchor = wikiLinkDest.substring(0, anchorStart);
        return withoutAnchor.isEmpty() ? wikiLinkText : withoutAnchor;
    }

    /** Registers the English generic template callbacks. */
    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// Spanish Wiktionary: a heading is skipped when its whole text matches this pattern.
final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
isoToLangConfig.put("ES", new LangConfig() {
    /** Returns true when {@code headingText} matches {@code esSkipSections} in full. */
    @Override
    public boolean skipSection(String headingText) {
        return esSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings (case-insensitive) to entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        EntryTypeName entryType = null;
        if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
            entryType = EntryTypeName.SYNONYM_MULTI;
        } else if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
            entryType = EntryTypeName.ANTONYM_MULTI;
        }
        return entryType;
    }

    /** Omits links whose text starts with the category namespace prefix. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Categoría:");
    }

    /**
     * Returns null for destinations starting with "w:" or "Image:"; otherwise strips
     * everything from the first '#' on, falling back to the link text when nothing
     * remains in front of the '#'.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        final boolean unlinkable =
            wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:");
        if (unlinkable) {
            return null;
        }
        final int anchorStart = wikiLinkDest.indexOf('#');
        if (anchorStart < 0) {
            return wikiLinkDest;
        }
        final String withoutAnchor = wikiLinkDest.substring(0, anchorStart);
        return withoutAnchor.isEmpty() ? wikiLinkText : withoutAnchor;
    }

    /** Registers nothing yet. */
    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        // TODO: need Spanish variant
    }
});
// Portuguese Wiktionary: a heading is skipped when its whole text matches this pattern.
final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
isoToLangConfig.put("PT", new LangConfig() {
    /**
     * Returns true when {@code headingText} matches {@code ptSkipSections} in full.
     * The Portuguese pattern must be used here: matching against the Spanish one would
     * leave Portuguese translation sections in the output.
     */
    @Override
    public boolean skipSection(String headingText) {
        return ptSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings (case-insensitive) to entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        return null;
    }

    /** Omits links whose text starts with the category namespace prefix. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        final String wikiText = wikiTokenizer.wikiLinkText();
        return wikiText.startsWith("Categoria:");
    }

    /**
     * Returns null for destinations starting with "w:" or "Image:"; otherwise strips
     * everything from the first '#' on, falling back to the link text when nothing
     * remains in front of the '#'.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    /** Registers nothing yet. */
    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        // TODO: need Portuguese variant
    }
});
// German Wiktionary: a heading is skipped when its whole text matches this pattern.
final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
isoToLangConfig.put("DE", new LangConfig() {
    /** Returns true when {@code headingText} matches {@code deSkipSections} in full. */
    @Override
    public boolean skipSection(String headingText) {
        return deSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings (case-insensitive) to entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        EntryTypeName entryType = null;
        if (sectionName.equalsIgnoreCase("Synonyme")) {
            entryType = EntryTypeName.SYNONYM_MULTI;
        } else if (sectionName.equalsIgnoreCase("Gegenwörter")) {
            entryType = EntryTypeName.ANTONYM_MULTI;
        }
        return entryType;
    }

    /** Omits links whose text starts with the category namespace prefix. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Kategorie:");
    }

    /**
     * Returns null for destinations starting with "w:" or "Image:"; otherwise strips
     * everything from the first '#' on, falling back to the link text when nothing
     * remains in front of the '#'.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        final boolean unlinkable =
            wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:");
        if (unlinkable) {
            return null;
        }
        final int anchorStart = wikiLinkDest.indexOf('#');
        if (anchorStart < 0) {
            return wikiLinkDest;
        }
        final String withoutAnchor = wikiLinkDest.substring(0, anchorStart);
        return withoutAnchor.isEmpty() ? wikiLinkText : withoutAnchor;
    }

    /** Registers the German generic template callbacks. */
    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// Italian Wiktionary: a heading is skipped when its whole text matches this pattern.
final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
isoToLangConfig.put("IT", new LangConfig() {
    /** Returns true when {@code headingText} matches {@code itSkipSections} in full. */
    @Override
    public boolean skipSection(String headingText) {
        return itSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings (case-insensitive) to entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        EntryTypeName entryType = null;
        if (sectionName.equalsIgnoreCase("Sinonimi")) {
            entryType = EntryTypeName.SYNONYM_MULTI;
        } else if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
            entryType = EntryTypeName.ANTONYM_MULTI;
        }
        return entryType;
    }

    /** Omits links whose text starts with the category namespace prefix. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Categoria:");
    }

    /**
     * Returns null for destinations starting with "w:" or "Image:"; otherwise strips
     * everything from the first '#' on, falling back to the link text when nothing
     * remains in front of the '#'.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        final boolean unlinkable =
            wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:");
        if (unlinkable) {
            return null;
        }
        final int anchorStart = wikiLinkDest.indexOf('#');
        if (anchorStart < 0) {
            return wikiLinkDest;
        }
        final String withoutAnchor = wikiLinkDest.substring(0, anchorStart);
        return withoutAnchor.isEmpty() ? wikiLinkText : withoutAnchor;
    }

    /** Registers the Italian generic template callbacks. */
    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// French Wiktionary: a heading is skipped when its whole text matches this pattern.
final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
isoToLangConfig.put("FR", new LangConfig() {
    /** Returns true when {@code headingText} matches {@code frSkipSections} in full. */
    @Override
    public boolean skipSection(String headingText) {
        return frSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings (case-insensitive) to entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        EntryTypeName entryType = null;
        if (sectionName.equalsIgnoreCase("Synonymes")) {
            entryType = EntryTypeName.SYNONYM_MULTI;
        } else if (sectionName.equalsIgnoreCase("Antonymes")) {
            entryType = EntryTypeName.ANTONYM_MULTI;
        }
        return entryType;
    }

    /** Omits links whose text starts with the category namespace prefix. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Catégorie:");
    }

    /**
     * Returns null for destinations starting with "w:" or "Image:"; otherwise strips
     * everything from the first '#' on, falling back to the link text when nothing
     * remains in front of the '#'.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        final boolean unlinkable =
            wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:");
        if (unlinkable) {
            return null;
        }
        final int anchorStart = wikiLinkDest.indexOf('#');
        if (anchorStart < 0) {
            return wikiLinkDest;
        }
        final String withoutAnchor = wikiLinkDest.substring(0, anchorStart);
        return withoutAnchor.isEmpty() ? wikiLinkText : withoutAnchor;
    }

    /** Registers the French generic template callbacks. */
    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
/** Index that receives the HTML entry, its title token and the indexed links. */
final IndexBuilder titleIndexBuilder;
/** Second index builder; only stored by the constructor in the code visible here. */
final IndexBuilder defIndexBuilder;
/** Language code compared against link/template "lang" values. */
final String skipLangIso;
/** Language hooks looked up from {@code isoToLangConfig} for the Wiktionary being parsed. */
final LangConfig langConfig;
/** Format template applied to the page title to build a web link; null disables the link. */
final String webUrlTemplate;

/**
 * Creates a parser for one Wiktionary language.
 *
 * @param titleIndexBuilder index that receives the generated entries
 * @param defIndexBuilder second index builder, stored as-is
 * @param wiktionaryIso key into {@code isoToLangConfig}; asserted to be registered
 * @param skipLangIso language code stored in {@link #skipLangIso}
 * @param webUrlTemplate format template for the page URL, or null for no link
 */
public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
                                final String webUrlTemplate) {
    assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
    this.langConfig = isoToLangConfig.get(wiktionaryIso);
    this.titleIndexBuilder = titleIndexBuilder;
    this.defIndexBuilder = defIndexBuilder;
    this.skipLangIso = skipLangIso;
    this.webUrlTemplate = webUrlTemplate;
}

/** Entry for the section currently being parsed; replaced on every {@code parseSection} call. */
IndexedEntry indexedEntry;
332 public void parseSection(String heading, String text) {
333 assert entrySource != null;
334 final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
335 indexedEntry = new IndexedEntry(htmlEntry);
337 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
339 langConfig.addFunctionCallbacks(callback.functionCallbacks);
341 callback.builder = new StringBuilder();
342 callback.indexedEntry = indexedEntry;
343 callback.dispatch(text, null);
345 if (webUrlTemplate != null) {
346 final String webUrl = String.format(webUrlTemplate, title);
347 // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
349 callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
350 } catch (Exception e) {
353 htmlEntry.html = callback.builder.toString();
354 indexedEntry.isValid = true;
356 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
357 tokenData.hasMainEntry = true;
359 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
360 tokenData.htmlEntries.add(htmlEntry);
361 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
362 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
368 void removeUselessArgs(Map<String, String> namedArgs) {
372 public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
373 if (lang == null || lang.equals(skipLangIso)) {
374 titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
378 public static String escapeHtmlLiteral(final String plainText) {
379 final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
380 if (StringUtil.isAscii(htmlEscaped)) {
383 return StringUtil.escapeUnicodeToPureHtml(plainText);
390 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
391 public AppendCallback(WholeSectionToHtmlParser parser) {
396 public void onPlainText(String plainText) {
397 super.onPlainText(escapeHtmlLiteral(plainText));
401 public void onWikiLink(WikiTokenizer wikiTokenizer) {
402 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
403 // Skips wikilinks like: [[en::dick]]
406 if (langConfig.skipWikiLink(wikiTokenizer)) {
410 if (wikiTokenizer.wikiLinkDest() != null) {
411 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
413 linkDest = wikiTokenizer.wikiLinkText();
415 if (sectionEntryTypeName != null) {
416 // TODO: inside a definition, this could be the wrong language.
417 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
419 if (!StringUtil.isNullOrEmpty(linkDest)) {
420 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
421 super.onWikiLink(wikiTokenizer);
422 builder.append("</a>");
424 super.onWikiLink(wikiTokenizer);
429 public void onFunction(WikiTokenizer wikiTokenizer, String name,
430 List<String> args, Map<String, String> namedArgs) {
431 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
432 namedArgs.remove("lang");
434 super.onFunction(wikiTokenizer, name, args, namedArgs);
438 public void onHtml(WikiTokenizer wikiTokenizer) {
439 super.onHtml(wikiTokenizer);
443 public void onNewline(WikiTokenizer wikiTokenizer) {
446 EntryTypeName sectionEntryTypeName;
447 IndexBuilder currentIndexBuilder;
450 public void onHeading(WikiTokenizer wikiTokenizer) {
451 final String headingText = wikiTokenizer.headingWikiText();
452 sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
453 final int depth = wikiTokenizer.headingDepth();
454 if (langConfig.skipSection(headingText)) {
455 //System.out.println("Skipping section:" + headingText);
456 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
457 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
458 // System.out.println("Resume on: " + wikiTokenizer.token());
459 wikiTokenizer.returnToLineStart();
462 // System.out.println("Skipped: " + wikiTokenizer.token());
467 builder.append(String.format("\n<h%d>", depth));
468 dispatch(headingText, null);
469 builder.append(String.format("</h%d>\n", depth));
472 final List<Character> listPrefixStack = new ArrayList<>();
475 public void onListItem(WikiTokenizer wikiTokenizer) {
476 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
477 builder.append("\n");
479 final String prefix = wikiTokenizer.listItemPrefix();
480 while (listPrefixStack.size() < prefix.length()) {
481 builder.append(String.format("<%s>",
482 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
483 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
485 builder.append("<li>");
486 dispatch(wikiTokenizer.listItemWikiText(), null);
487 builder.append("</li>\n");
489 WikiTokenizer nextToken = wikiTokenizer.nextToken();
490 boolean returnToLineStart = false;
491 if (nextToken != null && nextToken.isNewline()) {
492 nextToken = nextToken.nextToken();
493 returnToLineStart = true;
495 final String nextListHeader;
496 if (nextToken == null || !nextToken.isListItem()) {
499 nextListHeader = nextToken.listItemPrefix();
501 if (returnToLineStart) {
502 wikiTokenizer.returnToLineStart();
504 while (listPrefixStack.size() > nextListHeader.length()) {
505 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
506 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
510 boolean boldOn = false;
511 boolean italicOn = false;
514 public void onMarkup(WikiTokenizer wikiTokenizer) {
515 if ("'''".equals(wikiTokenizer.token())) {
517 builder.append("<b>");
519 builder.append("</b>");
522 } else if ("''".equals(wikiTokenizer.token())) {
524 builder.append("<em>");
526 builder.append("</em>");
528 italicOn = !italicOn;