]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Minor automated code simplifications.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import java.net.URI;
5 import java.util.ArrayList;
6 import java.util.LinkedHashMap;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.regex.Pattern;
10
11 import org.apache.commons.text.StringEscapeUtils;
12
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.HtmlEntry;
15 import com.hughes.android.dictionary.engine.IndexBuilder;
16 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
17 import com.hughes.android.dictionary.engine.IndexedEntry;
18 import com.hughes.android.dictionary.parser.WikiTokenizer;
19 import com.hughes.util.StringUtil;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31             Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 // We need to put it in the other index, too (probably) ?
51                 // EnParser.partOfSpeechHeader.matcher(sectionName).matches()
52
53                 // Needs special handling?
54                 // sectionName.equalsIgnoreCase("Derived Terms")
55                 return null;
56             }
57
58             @Override
59             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
60                 final String wikiText = wikiTokenizer.wikiLinkText();
61                 return wikiText.startsWith("Category:");
62             }
63             @Override
64             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
65                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
66                     return null;
67                 }
68                 final int hashPos = wikiLinkDest.indexOf("#");
69                 if (hashPos != -1) {
70                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
71                     if (wikiLinkDest.isEmpty()) {
72                         wikiLinkDest = wikiLinkText;
73                     }
74                 }
75                 return wikiLinkDest;
76             }
77
78             @Override
79             public void addFunctionCallbacks(
80                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
81                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
82             }
83         });
84
85         final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
86         isoToLangConfig.put("ES", new LangConfig() {
87             @Override
88             public boolean skipSection(String headingText) {
89                 return esSkipSections.matcher(headingText).matches();
90             }
91
92             @Override
93             public EntryTypeName sectionNameToEntryType(String sectionName) {
94                 if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
95                     return EntryTypeName.SYNONYM_MULTI;
96                 }
97                 if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
98                     return EntryTypeName.ANTONYM_MULTI;
99                 }
100                 return null;
101             }
102
103             @Override
104             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
105                 final String wikiText = wikiTokenizer.wikiLinkText();
106                 return wikiText.startsWith("Categoría:");
107             }
108             @Override
109             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
110                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
111                     return null;
112                 }
113                 final int hashPos = wikiLinkDest.indexOf("#");
114                 if (hashPos != -1) {
115                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
116                     if (wikiLinkDest.isEmpty()) {
117                         wikiLinkDest = wikiLinkText;
118                     }
119                 }
120                 return wikiLinkDest;
121             }
122
123             @Override
124             public void addFunctionCallbacks(
125                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
126                 // TODO: need Spanish variant
127             }
128         });
129
130         final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
131         isoToLangConfig.put("PT", new LangConfig() {
132             @Override
133             public boolean skipSection(String headingText) {
134                 return esSkipSections.matcher(headingText).matches();
135             }
136
137             @Override
138             public EntryTypeName sectionNameToEntryType(String sectionName) {
139                 if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
140                     return EntryTypeName.SYNONYM_MULTI;
141                 }
142                 if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
143                     return EntryTypeName.ANTONYM_MULTI;
144                 }
145                 return null;
146             }
147
148             @Override
149             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
150                 final String wikiText = wikiTokenizer.wikiLinkText();
151                 return wikiText.startsWith("Categoria:");
152             }
153             @Override
154             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
155                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
156                     return null;
157                 }
158                 final int hashPos = wikiLinkDest.indexOf("#");
159                 if (hashPos != -1) {
160                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
161                     if (wikiLinkDest.isEmpty()) {
162                         wikiLinkDest = wikiLinkText;
163                     }
164                 }
165                 return wikiLinkDest;
166             }
167
168             @Override
169             public void addFunctionCallbacks(
170                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
171                 // TODO: need Portuguese variant
172             }
173         });
174
175         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
176         isoToLangConfig.put("DE", new LangConfig() {
177             @Override
178             public boolean skipSection(String headingText) {
179                 return deSkipSections.matcher(headingText).matches();
180             }
181
182             @Override
183             public EntryTypeName sectionNameToEntryType(String sectionName) {
184                 if (sectionName.equalsIgnoreCase("Synonyme")) {
185                     return EntryTypeName.SYNONYM_MULTI;
186                 }
187                 if (sectionName.equalsIgnoreCase("Gegenwörter")) {
188                     return EntryTypeName.ANTONYM_MULTI;
189                 }
190                 return null;
191             }
192
193             @Override
194             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
195                 final String wikiText = wikiTokenizer.wikiLinkText();
196                 return wikiText.startsWith("Kategorie:");
197             }
198             @Override
199             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
200                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
201                     return null;
202                 }
203                 final int hashPos = wikiLinkDest.indexOf("#");
204                 if (hashPos != -1) {
205                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
206                     if (wikiLinkDest.isEmpty()) {
207                         wikiLinkDest = wikiLinkText;
208                     }
209                 }
210                 return wikiLinkDest;
211             }
212
213             @Override
214             public void addFunctionCallbacks(
215                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
216                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
217             }
218         });
219
220         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
221         isoToLangConfig.put("IT", new LangConfig() {
222             @Override
223             public boolean skipSection(String headingText) {
224                 return itSkipSections.matcher(headingText).matches();
225             }
226
227             @Override
228             public EntryTypeName sectionNameToEntryType(String sectionName) {
229                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
230                     return EntryTypeName.SYNONYM_MULTI;
231                 }
232                 if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
233                     return EntryTypeName.ANTONYM_MULTI;
234                 }
235                 return null;
236             }
237
238             @Override
239             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
240                 final String wikiText = wikiTokenizer.wikiLinkText();
241                 return wikiText.startsWith("Categoria:");
242             }
243             @Override
244             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
245                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
246                     return null;
247                 }
248                 final int hashPos = wikiLinkDest.indexOf("#");
249                 if (hashPos != -1) {
250                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
251                     if (wikiLinkDest.isEmpty()) {
252                         wikiLinkDest = wikiLinkText;
253                     }
254                 }
255                 return wikiLinkDest;
256             }
257
258             @Override
259             public void addFunctionCallbacks(
260                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
261                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
262             }
263         });
264
265
266         final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
267         isoToLangConfig.put("FR", new LangConfig() {
268             @Override
269             public boolean skipSection(String headingText) {
270                 return frSkipSections.matcher(headingText).matches();
271             }
272
273             @Override
274             public EntryTypeName sectionNameToEntryType(String sectionName) {
275                 if (sectionName.equalsIgnoreCase("Synonymes")) {
276                     return EntryTypeName.SYNONYM_MULTI;
277                 }
278                 if (sectionName.equalsIgnoreCase("Antonymes")) {
279                     return EntryTypeName.ANTONYM_MULTI;
280                 }
281                 return null;
282             }
283
284             @Override
285             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
286                 final String wikiText = wikiTokenizer.wikiLinkText();
287                 return wikiText.startsWith("Catégorie:");
288             }
289             @Override
290             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
291                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
292                     return null;
293                 }
294                 final int hashPos = wikiLinkDest.indexOf("#");
295                 if (hashPos != -1) {
296                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
297                     if (wikiLinkDest.isEmpty()) {
298                         wikiLinkDest = wikiLinkText;
299                     }
300                 }
301                 return wikiLinkDest;
302             }
303
304             @Override
305             public void addFunctionCallbacks(
306                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
307                 FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
308             }
309         });
310     }
311
312     final IndexBuilder titleIndexBuilder;
313     final IndexBuilder defIndexBuilder;
314     final String skipLangIso;
315     final LangConfig langConfig;
316     final String webUrlTemplate;
317
318
319     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
320                                     final String webUrlTemplate) {
321         this.titleIndexBuilder = titleIndexBuilder;
322         this.defIndexBuilder = defIndexBuilder;
323         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
324         this.langConfig = isoToLangConfig.get(wiktionaryIso);
325         this.skipLangIso = skipLangIso;
326         this.webUrlTemplate = webUrlTemplate;
327     }
328
329     IndexedEntry indexedEntry = null;
330
331     @Override
332     public void parseSection(String heading, String text) {
333         assert entrySource != null;
334         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
335         indexedEntry = new IndexedEntry(htmlEntry);
336
337         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
338             this);
339         langConfig.addFunctionCallbacks(callback.functionCallbacks);
340
341         callback.builder = new StringBuilder();
342         callback.indexedEntry = indexedEntry;
343         callback.dispatch(text, null);
344
345         if (webUrlTemplate != null) {
346             final String webUrl = String.format(webUrlTemplate, title);
347             // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
348             try {
349                 callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
350             } catch (Exception e) {
351             }
352         }
353         htmlEntry.html = callback.builder.toString();
354         indexedEntry.isValid = true;
355
356         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
357         tokenData.hasMainEntry = true;
358
359         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
360         tokenData.htmlEntries.add(htmlEntry);
361         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
362         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
363
364         indexedEntry = null;
365     }
366
367     @Override
368     void removeUselessArgs(Map<String, String> namedArgs) {
369     }
370
371     @Override
372     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
373         if (lang == null || lang.equals(skipLangIso)) {
374             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
375         }
376     }
377
378     public static String escapeHtmlLiteral(final String plainText) {
379         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
380         if (StringUtil.isAscii(htmlEscaped)) {
381             return htmlEscaped;
382         } else {
383             return StringUtil.escapeUnicodeToPureHtml(plainText);
384         }
385
386     }
387
388
389
390     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
391         public AppendCallback(WholeSectionToHtmlParser parser) {
392             super(parser);
393         }
394
395         @Override
396         public void onPlainText(String plainText) {
397             super.onPlainText(escapeHtmlLiteral(plainText));
398         }
399
400         @Override
401         public void onWikiLink(WikiTokenizer wikiTokenizer) {
402             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
403                 // Skips wikilinks like: [[en::dick]]
404                 return;
405             }
406             if (langConfig.skipWikiLink(wikiTokenizer)) {
407                 return;
408             }
409             String linkDest;
410             if (wikiTokenizer.wikiLinkDest() != null) {
411                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
412             } else {
413                 linkDest = wikiTokenizer.wikiLinkText();
414             }
415             if (sectionEntryTypeName != null) {
416                 // TODO: inside a definition, this could be the wrong language.
417                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
418             }
419             if (!StringUtil.isNullOrEmpty(linkDest)) {
420                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
421                 super.onWikiLink(wikiTokenizer);
422                 builder.append("</a>");
423             } else {
424                 super.onWikiLink(wikiTokenizer);
425             }
426         }
427
428         @Override
429         public void onFunction(WikiTokenizer wikiTokenizer, String name,
430                                List<String> args, Map<String, String> namedArgs) {
431             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
432                 namedArgs.remove("lang");
433             }
434             super.onFunction(wikiTokenizer, name, args, namedArgs);
435         }
436
437         @Override
438         public void onHtml(WikiTokenizer wikiTokenizer) {
439             super.onHtml(wikiTokenizer);
440         }
441
442         @Override
443         public void onNewline(WikiTokenizer wikiTokenizer) {
444         }
445
446         EntryTypeName sectionEntryTypeName;
447         IndexBuilder currentIndexBuilder;
448
449         @Override
450         public void onHeading(WikiTokenizer wikiTokenizer) {
451             final String headingText = wikiTokenizer.headingWikiText();
452             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
453             final int depth = wikiTokenizer.headingDepth();
454             if (langConfig.skipSection(headingText)) {
455                 //System.out.println("Skipping section:" + headingText);
456                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
457                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
458                         // System.out.println("Resume on: " + wikiTokenizer.token());
459                         wikiTokenizer.returnToLineStart();
460                         return;
461                     } else {
462                         // System.out.println("Skipped: " + wikiTokenizer.token());
463                     }
464                 }
465                 return;
466             }
467             builder.append(String.format("\n<h%d>", depth));
468             dispatch(headingText, null);
469             builder.append(String.format("</h%d>\n", depth));
470         }
471
472         final List<Character> listPrefixStack = new ArrayList<>();
473
474         @Override
475         public void onListItem(WikiTokenizer wikiTokenizer) {
476             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
477                 builder.append("\n");
478             }
479             final String prefix = wikiTokenizer.listItemPrefix();
480             while (listPrefixStack.size() < prefix.length()) {
481                 builder.append(String.format("<%s>",
482                                              WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
483                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
484             }
485             builder.append("<li>");
486             dispatch(wikiTokenizer.listItemWikiText(), null);
487             builder.append("</li>\n");
488
489             WikiTokenizer nextToken = wikiTokenizer.nextToken();
490             boolean returnToLineStart = false;
491             if (nextToken != null && nextToken.isNewline()) {
492                 nextToken = nextToken.nextToken();
493                 returnToLineStart = true;
494             }
495             final String nextListHeader;
496             if (nextToken == null || !nextToken.isListItem()) {
497                 nextListHeader = "";
498             } else {
499                 nextListHeader = nextToken.listItemPrefix();
500             }
501             if (returnToLineStart) {
502                 wikiTokenizer.returnToLineStart();
503             }
504             while (listPrefixStack.size() > nextListHeader.length()) {
505                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
506                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
507             }
508         }
509
510         boolean boldOn = false;
511         boolean italicOn = false;
512
513         @Override
514         public void onMarkup(WikiTokenizer wikiTokenizer) {
515             if ("'''".equals(wikiTokenizer.token())) {
516                 if (!boldOn) {
517                     builder.append("<b>");
518                 } else {
519                     builder.append("</b>");
520                 }
521                 boldOn = !boldOn;
522             } else if ("''".equals(wikiTokenizer.token())) {
523                 if (!italicOn) {
524                     builder.append("<em>");
525                 } else {
526                     builder.append("</em>");
527                 }
528                 italicOn = !italicOn;
529             } else {
530                 assert false;
531             }
532         }
533
534     }
535
536 }