2 package com.hughes.android.dictionary.parser.wiktionary;
5 import java.util.ArrayList;
6 import java.util.LinkedHashMap;
9 import java.util.regex.Pattern;
11 import org.apache.commons.text.StringEscapeUtils;
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.HtmlEntry;
15 import com.hughes.android.dictionary.engine.IndexBuilder;
16 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
17 import com.hughes.android.dictionary.engine.IndexedEntry;
18 import com.hughes.android.dictionary.parser.WikiTokenizer;
19 import com.hughes.util.StringUtil;
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
23 public static final String NAME = "WholeSectionToHtmlParser";
25 interface LangConfig {
26 boolean skipSection(final String name);
27 EntryTypeName sectionNameToEntryType(String sectionName);
28 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29 String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30 void addFunctionCallbacks(
31 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
33 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
// English Wiktionary ("EN").
final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
isoToLangConfig.put("EN", new LangConfig() {
    /** Skips sections whose heading contains one of the names in enSkipSections. */
    @Override
    public boolean skipSection(String headingText) {
        return enSkipSections.matcher(headingText).matches();
    }

    /** Maps synonym/antonym headings to their entry types; every other heading yields null. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Synonyms")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antonyms")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()
                || sectionName.equalsIgnoreCase("Derived Terms")) {
            // Recognised headings without an entry type of their own.
            // Part-of-speech sections probably need to go into the other index, too.
            return null;
        }
        return null;
    }

    /** Category links are not rendered. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Category:");
    }

    /**
     * Drops Wikipedia ("w:") and image links, strips any "#anchor" suffix, and
     * falls back to the link text when only an anchor was given.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int anchorAt = wikiLinkDest.indexOf("#");
        if (anchorAt == -1) {
            return wikiLinkDest;
        }
        final String page = wikiLinkDest.substring(0, anchorAt);
        return page.isEmpty() ? wikiLinkText : page;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// Spanish Wiktionary ("ES").
final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
isoToLangConfig.put("ES", new LangConfig() {
    /** Skips sections whose heading contains one of the names in esSkipSections. */
    @Override
    public boolean skipSection(String headingText) {
        return esSkipSections.matcher(headingText).matches();
    }

    /** Maps synonym/antonym headings (singular or plural) to their entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        final boolean isSynonyms =
            sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos");
        if (isSynonyms) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        final boolean isAntonyms =
            sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos");
        return isAntonyms ? EntryTypeName.ANTONYM_MULTI : null;
    }

    /** Category links are not rendered. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Categoría:");
    }

    /**
     * Drops Wikipedia ("w:") and image links, strips any "#anchor" suffix, and
     * falls back to the link text when only an anchor was given.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int anchorAt = wikiLinkDest.indexOf("#");
        if (anchorAt == -1) {
            return wikiLinkDest;
        }
        final String page = wikiLinkDest.substring(0, anchorAt);
        return page.isEmpty() ? wikiLinkText : page;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        // TODO: need Spanish variant
    }
});
// Portuguese Wiktionary ("PT").
final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
isoToLangConfig.put("PT", new LangConfig() {
    /**
     * Skips sections whose heading contains "Tradução". This must match
     * against the Portuguese pattern; the Spanish section names never occur
     * in Portuguese headings.
     */
    @Override
    public boolean skipSection(String headingText) {
        return ptSkipSections.matcher(headingText).matches();
    }

    /** Maps synonym/antonym headings (singular or plural) to their entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
            return EntryTypeName.ANTONYM_MULTI;
        }
        return null;
    }

    /** Category links are not rendered. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Categoria:");
    }

    /**
     * Drops Wikipedia ("w:") and image links, strips any "#anchor" suffix, and
     * falls back to the link text when only an anchor was given.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int hashPos = wikiLinkDest.indexOf("#");
        if (hashPos != -1) {
            wikiLinkDest = wikiLinkDest.substring(0, hashPos);
            if (wikiLinkDest.isEmpty()) {
                wikiLinkDest = wikiLinkText;
            }
        }
        return wikiLinkDest;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        // TODO: need Portuguese variant
    }
});
// German Wiktionary ("DE").
final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
isoToLangConfig.put("DE", new LangConfig() {
    /** Skips sections whose heading contains one of the names in deSkipSections. */
    @Override
    public boolean skipSection(String headingText) {
        return deSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings to their entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Synonyme")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        return sectionName.equalsIgnoreCase("Gegenwörter") ? EntryTypeName.ANTONYM_MULTI : null;
    }

    /** Category links are not rendered. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Kategorie:");
    }

    /**
     * Drops Wikipedia ("w:") and image links, strips any "#anchor" suffix, and
     * falls back to the link text when only an anchor was given.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int anchorAt = wikiLinkDest.indexOf("#");
        if (anchorAt == -1) {
            return wikiLinkDest;
        }
        final String page = wikiLinkDest.substring(0, anchorAt);
        return page.isEmpty() ? wikiLinkText : page;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// Italian Wiktionary ("IT").
final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
isoToLangConfig.put("IT", new LangConfig() {
    /** Skips sections whose heading contains one of the names in itSkipSections. */
    @Override
    public boolean skipSection(String headingText) {
        return itSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings to their entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Sinonimi")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        return sectionName.equalsIgnoreCase("Antonimi/Contrari") ? EntryTypeName.ANTONYM_MULTI : null;
    }

    /** Category links are not rendered. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Categoria:");
    }

    /**
     * Drops Wikipedia ("w:") and image links, strips any "#anchor" suffix, and
     * falls back to the link text when only an anchor was given.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int anchorAt = wikiLinkDest.indexOf("#");
        if (anchorAt == -1) {
            return wikiLinkDest;
        }
        final String page = wikiLinkDest.substring(0, anchorAt);
        return page.isEmpty() ? wikiLinkText : page;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
// French Wiktionary ("FR").
final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
isoToLangConfig.put("FR", new LangConfig() {
    /** Skips sections whose heading contains one of the names in frSkipSections. */
    @Override
    public boolean skipSection(String headingText) {
        return frSkipSections.matcher(headingText).matches();
    }

    /** Maps the synonym/antonym headings to their entry types; null otherwise. */
    @Override
    public EntryTypeName sectionNameToEntryType(String sectionName) {
        if (sectionName.equalsIgnoreCase("Synonymes")) {
            return EntryTypeName.SYNONYM_MULTI;
        }
        return sectionName.equalsIgnoreCase("Antonymes") ? EntryTypeName.ANTONYM_MULTI : null;
    }

    /** Category links are not rendered. */
    @Override
    public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
        return wikiTokenizer.wikiLinkText().startsWith("Catégorie:");
    }

    /**
     * Drops Wikipedia ("w:") and image links, strips any "#anchor" suffix, and
     * falls back to the link text when only an anchor was given.
     */
    @Override
    public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
        if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
            return null;
        }
        final int anchorAt = wikiLinkDest.indexOf("#");
        if (anchorAt == -1) {
            return wikiLinkDest;
        }
        final String page = wikiLinkDest.substring(0, anchorAt);
        return page.isEmpty() ? wikiLinkText : page;
    }

    @Override
    public void addFunctionCallbacks(
        Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
        FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
    }
});
332 final IndexBuilder titleIndexBuilder;
333 final IndexBuilder defIndexBuilder;
334 final String skipLangIso;
335 final LangConfig langConfig;
336 final String webUrlTemplate;
/**
 * Creates a parser bound to one Wiktionary edition.
 *
 * @param titleIndexBuilder index that receives the page title and indexed links
 * @param defIndexBuilder   second index builder; only stored here
 * @param wiktionaryIso     key into {@code isoToLangConfig} (e.g. "EN") selecting the edition's rules
 * @param skipLangIso       language code whose {@code lang} template argument is dropped and
 *                          whose links are indexed
 * @param webUrlTemplate    format string taking the page title, or null to omit the web link
 */
public WholeSectionToHtmlParser(
        final IndexBuilder titleIndexBuilder,
        final IndexBuilder defIndexBuilder,
        final String wiktionaryIso,
        final String skipLangIso,
        final String webUrlTemplate) {
    // Unknown editions are a programming error; only checked when assertions are enabled.
    assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
    this.langConfig = isoToLangConfig.get(wiktionaryIso);

    this.titleIndexBuilder = titleIndexBuilder;
    this.defIndexBuilder = defIndexBuilder;
    this.skipLangIso = skipLangIso;
    this.webUrlTemplate = webUrlTemplate;
}
349 IndexedEntry indexedEntry = null;
352 public void parseSection(String heading, String text) {
353 assert entrySource != null;
354 final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
355 indexedEntry = new IndexedEntry(htmlEntry);
357 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
359 langConfig.addFunctionCallbacks(callback.functionCallbacks);
361 callback.builder = new StringBuilder();
362 callback.indexedEntry = indexedEntry;
363 callback.dispatch(text, null);
365 if (webUrlTemplate != null) {
366 final String webUrl = String.format(webUrlTemplate, title);
367 // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
369 callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
370 } catch (Exception e) {
373 htmlEntry.html = callback.builder.toString();
374 indexedEntry.isValid = true;
376 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
377 tokenData.hasMainEntry = true;
379 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
380 tokenData.htmlEntries.add(htmlEntry);
381 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
382 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
388 void removeUselessArgs(Map<String, String> namedArgs) {
392 public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
393 if (lang == null || lang.equals(skipLangIso)) {
394 titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
398 public static String escapeHtmlLiteral(final String plainText) {
399 final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
400 if (StringUtil.isAscii(htmlEscaped)) {
403 return StringUtil.escapeUnicodeToPureHtml(plainText);
410 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
411 public AppendCallback(WholeSectionToHtmlParser parser) {
416 public void onPlainText(String plainText) {
417 super.onPlainText(escapeHtmlLiteral(plainText));
421 public void onWikiLink(WikiTokenizer wikiTokenizer) {
422 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
423 // Skips wikilinks like: [[en::dick]]
426 if (langConfig.skipWikiLink(wikiTokenizer)) {
430 if (wikiTokenizer.wikiLinkDest() != null) {
431 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
433 linkDest = wikiTokenizer.wikiLinkText();
435 if (sectionEntryTypeName != null) {
436 // TODO: inside a definition, this could be the wrong language.
437 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
439 if (!StringUtil.isNullOrEmpty(linkDest)) {
440 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
441 super.onWikiLink(wikiTokenizer);
442 builder.append(String.format("</a>"));
444 super.onWikiLink(wikiTokenizer);
449 public void onFunction(WikiTokenizer wikiTokenizer, String name,
450 List<String> args, Map<String, String> namedArgs) {
451 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
452 namedArgs.remove("lang");
454 super.onFunction(wikiTokenizer, name, args, namedArgs);
458 public void onHtml(WikiTokenizer wikiTokenizer) {
459 super.onHtml(wikiTokenizer);
463 public void onNewline(WikiTokenizer wikiTokenizer) {
466 EntryTypeName sectionEntryTypeName;
467 IndexBuilder currentIndexBuilder;
470 public void onHeading(WikiTokenizer wikiTokenizer) {
471 final String headingText = wikiTokenizer.headingWikiText();
472 sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
473 final int depth = wikiTokenizer.headingDepth();
474 if (langConfig.skipSection(headingText)) {
475 //System.out.println("Skipping section:" + headingText);
476 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
477 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
478 // System.out.println("Resume on: " + wikiTokenizer.token());
479 wikiTokenizer.returnToLineStart();
482 // System.out.println("Skipped: " + wikiTokenizer.token());
487 builder.append(String.format("\n<h%d>", depth));
488 dispatch(headingText, null);
489 builder.append(String.format("</h%d>\n", depth));
492 final List<Character> listPrefixStack = new ArrayList<Character>();
495 public void onListItem(WikiTokenizer wikiTokenizer) {
496 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
497 builder.append("\n");
499 final String prefix = wikiTokenizer.listItemPrefix();
500 while (listPrefixStack.size() < prefix.length()) {
501 builder.append(String.format("<%s>",
502 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
503 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
505 builder.append("<li>");
506 dispatch(wikiTokenizer.listItemWikiText(), null);
507 builder.append("</li>\n");
509 WikiTokenizer nextToken = wikiTokenizer.nextToken();
510 boolean returnToLineStart = false;
511 if (nextToken != null && nextToken.isNewline()) {
512 nextToken = nextToken.nextToken();
513 returnToLineStart = true;
515 final String nextListHeader;
516 if (nextToken == null || !nextToken.isListItem()) {
519 nextListHeader = nextToken.listItemPrefix();
521 if (returnToLineStart) {
522 wikiTokenizer.returnToLineStart();
524 while (listPrefixStack.size() > nextListHeader.length()) {
525 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
526 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
530 boolean boldOn = false;
531 boolean italicOn = false;
534 public void onMarkup(WikiTokenizer wikiTokenizer) {
535 if ("'''".equals(wikiTokenizer.token())) {
537 builder.append("<b>");
539 builder.append("</b>");
542 } else if ("''".equals(wikiTokenizer.token())) {
544 builder.append("<em>");
546 builder.append("</em>");
548 italicOn = !italicOn;