]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Fix compilation.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import java.net.URI;
5 import java.util.ArrayList;
6 import java.util.LinkedHashMap;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.regex.Pattern;
10
11 import org.apache.commons.text.StringEscapeUtils;
12
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.HtmlEntry;
15 import com.hughes.android.dictionary.engine.IndexBuilder;
16 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
17 import com.hughes.android.dictionary.engine.IndexedEntry;
18 import com.hughes.android.dictionary.parser.WikiTokenizer;
19 import com.hughes.util.StringUtil;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31             Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 // We need to put it in the other index, too (probably) ?
51                 // EnParser.partOfSpeechHeader.matcher(sectionName).matches()
52
53                 // Needs special handling?
54                 // sectionName.equalsIgnoreCase("Derived Terms")
55                 return null;
56             }
57
58             @Override
59             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
60                 final String wikiText = wikiTokenizer.wikiLinkText();
61                 return wikiText.startsWith("Category:");
62             }
63             @Override
64             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
65                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
66                     return null;
67                 }
68                 final int hashPos = wikiLinkDest.indexOf("#");
69                 if (hashPos != -1) {
70                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
71                     if (wikiLinkDest.isEmpty()) {
72                         wikiLinkDest = wikiLinkText;
73                     }
74                 }
75                 return wikiLinkDest;
76             }
77
78             @Override
79             public void addFunctionCallbacks(
80                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
81                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
82             }
83         });
84
85         final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
86         isoToLangConfig.put("ES", new LangConfig() {
87             @Override
88             public boolean skipSection(String headingText) {
89                 return esSkipSections.matcher(headingText).matches();
90             }
91
92             @Override
93             public EntryTypeName sectionNameToEntryType(String sectionName) {
94                 if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
95                     return EntryTypeName.SYNONYM_MULTI;
96                 }
97                 if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
98                     return EntryTypeName.ANTONYM_MULTI;
99                 }
100                 return null;
101             }
102
103             @Override
104             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
105                 final String wikiText = wikiTokenizer.wikiLinkText();
106                 return wikiText.startsWith("Categoría:");
107             }
108             @Override
109             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
110                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
111                     return null;
112                 }
113                 final int hashPos = wikiLinkDest.indexOf("#");
114                 if (hashPos != -1) {
115                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
116                     if (wikiLinkDest.isEmpty()) {
117                         wikiLinkDest = wikiLinkText;
118                     }
119                 }
120                 return wikiLinkDest;
121             }
122
123             @Override
124             public void addFunctionCallbacks(
125                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
126                 // TODO: need Spanish variant
127             }
128         });
129
130         final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
131         isoToLangConfig.put("PT", new LangConfig() {
132             @Override
133             public boolean skipSection(String headingText) {
134                 return esSkipSections.matcher(headingText).matches();
135             }
136
137             @Override
138             public EntryTypeName sectionNameToEntryType(String sectionName) {
139                 if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
140                     return EntryTypeName.SYNONYM_MULTI;
141                 }
142                 if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
143                     return EntryTypeName.ANTONYM_MULTI;
144                 }
145                 return null;
146             }
147
148             @Override
149             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
150                 final String wikiText = wikiTokenizer.wikiLinkText();
151                 return wikiText.startsWith("Categoria:");
152             }
153             @Override
154             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
155                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
156                     return null;
157                 }
158                 final int hashPos = wikiLinkDest.indexOf("#");
159                 if (hashPos != -1) {
160                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
161                     if (wikiLinkDest.isEmpty()) {
162                         wikiLinkDest = wikiLinkText;
163                     }
164                 }
165                 return wikiLinkDest;
166             }
167
168             @Override
169             public void addFunctionCallbacks(
170                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
171                 // TODO: need Portuguese variant
172             }
173         });
174
175         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
176         isoToLangConfig.put("DE", new LangConfig() {
177             @Override
178             public boolean skipSection(String headingText) {
179                 return deSkipSections.matcher(headingText).matches();
180             }
181
182             @Override
183             public EntryTypeName sectionNameToEntryType(String sectionName) {
184                 if (sectionName.equalsIgnoreCase("Synonyme")) {
185                     return EntryTypeName.SYNONYM_MULTI;
186                 }
187                 if (sectionName.equalsIgnoreCase("Gegenwörter")) {
188                     return EntryTypeName.ANTONYM_MULTI;
189                 }
190                 return null;
191             }
192
193             @Override
194             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
195                 final String wikiText = wikiTokenizer.wikiLinkText();
196                 return wikiText.startsWith("Kategorie:");
197             }
198             @Override
199             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
200                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
201                     return null;
202                 }
203                 final int hashPos = wikiLinkDest.indexOf("#");
204                 if (hashPos != -1) {
205                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
206                     if (wikiLinkDest.isEmpty()) {
207                         wikiLinkDest = wikiLinkText;
208                     }
209                 }
210                 return wikiLinkDest;
211             }
212
213             @Override
214             public void addFunctionCallbacks(
215                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
216                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
217             }
218         });
219
220         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
221         isoToLangConfig.put("IT", new LangConfig() {
222             @Override
223             public boolean skipSection(String headingText) {
224                 return itSkipSections.matcher(headingText).matches();
225             }
226
227             @Override
228             public EntryTypeName sectionNameToEntryType(String sectionName) {
229                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
230                     return EntryTypeName.SYNONYM_MULTI;
231                 }
232                 if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
233                     return EntryTypeName.ANTONYM_MULTI;
234                 }
235                 return null;
236             }
237
238             @Override
239             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
240                 final String wikiText = wikiTokenizer.wikiLinkText();
241                 return wikiText.startsWith("Categoria:");
242             }
243             @Override
244             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
245                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
246                     return null;
247                 }
248                 final int hashPos = wikiLinkDest.indexOf("#");
249                 if (hashPos != -1) {
250                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
251                     if (wikiLinkDest.isEmpty()) {
252                         wikiLinkDest = wikiLinkText;
253                     }
254                 }
255                 return wikiLinkDest;
256             }
257
258             @Override
259             public void addFunctionCallbacks(
260                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
261                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
262             }
263         });
264
265
266         final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
267         isoToLangConfig.put("FR", new LangConfig() {
268             @Override
269             public boolean skipSection(String headingText) {
270                 return frSkipSections.matcher(headingText).matches();
271             }
272
273             @Override
274             public EntryTypeName sectionNameToEntryType(String sectionName) {
275                 if (sectionName.equalsIgnoreCase("Synonymes")) {
276                     return EntryTypeName.SYNONYM_MULTI;
277                 }
278                 if (sectionName.equalsIgnoreCase("Antonymes")) {
279                     return EntryTypeName.ANTONYM_MULTI;
280                 }
281                 return null;
282             }
283
284             @Override
285             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
286                 final String wikiText = wikiTokenizer.wikiLinkText();
287                 return wikiText.startsWith("Catégorie:");
288             }
289             @Override
290             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
291                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
292                     return null;
293                 }
294                 final int hashPos = wikiLinkDest.indexOf("#");
295                 if (hashPos != -1) {
296                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
297                     if (wikiLinkDest.isEmpty()) {
298                         wikiLinkDest = wikiLinkText;
299                     }
300                 }
301                 return wikiLinkDest;
302             }
303
304             @Override
305             public void addFunctionCallbacks(
306                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
307                 FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
308             }
309         });
310     }
311
312     final IndexBuilder titleIndexBuilder;
313     final IndexBuilder defIndexBuilder;
314     final String skipLangIso;
315     final LangConfig langConfig;
316     final String webUrlTemplate;
317
318
319     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
320                                     final String webUrlTemplate) {
321         this.titleIndexBuilder = titleIndexBuilder;
322         this.defIndexBuilder = defIndexBuilder;
323         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
324         this.langConfig = isoToLangConfig.get(wiktionaryIso);
325         this.skipLangIso = skipLangIso;
326         this.webUrlTemplate = webUrlTemplate;
327     }
328
329     IndexedEntry indexedEntry = null;
330
331     @Override
332     public void parseSection(String heading, String text) {
333         assert entrySource != null;
334         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
335         indexedEntry = new IndexedEntry(htmlEntry);
336
337         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
338             this);
339         langConfig.addFunctionCallbacks(callback.functionCallbacks);
340
341         callback.builder = new StringBuilder();
342         callback.indexedEntry = indexedEntry;
343         callback.dispatch(text, null);
344
345         if (webUrlTemplate != null) {
346             final String webUrl = String.format(webUrlTemplate, title);
347             String asciiWebUrl = null;
348             // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
349             try {
350                 asciiWebUrl = URI.create(webUrl).toASCIIString();
351             } catch (Exception e) {
352             }
353             if (asciiWebUrl != null) {
354                 callback.builder.append("<p> <a href=\"");
355                 callback.builder.append(asciiWebUrl);
356                 callback.builder.append("\">");
357                 callback.builder.append(escapeHtmlLiteral(webUrl));
358                 callback.builder.append("</a>");
359             }
360         }
361         htmlEntry.html = callback.builder.toString();
362         indexedEntry.isValid = true;
363
364         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
365         tokenData.hasMainEntry = true;
366
367         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
368         tokenData.htmlEntries.add(htmlEntry);
369         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
370         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
371
372         indexedEntry = null;
373     }
374
375     @Override
376     void removeUselessArgs(Map<String, String> namedArgs) {
377     }
378
379     @Override
380     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
381         if (lang == null || lang.equals(skipLangIso)) {
382             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
383         }
384     }
385
386     public static String escapeHtmlLiteral(final String plainText) {
387         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
388         if (StringUtil.isAscii(htmlEscaped)) {
389             return htmlEscaped;
390         } else {
391             return StringUtil.escapeUnicodeToPureHtml(plainText);
392         }
393
394     }
395
396
397
398     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
399         public AppendCallback(WholeSectionToHtmlParser parser) {
400             super(parser);
401         }
402
403         @Override
404         public void onPlainText(String plainText) {
405             super.onPlainText(escapeHtmlLiteral(plainText));
406         }
407
408         @Override
409         public void onWikiLink(WikiTokenizer wikiTokenizer) {
410             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
411                 // Skips wikilinks like: [[en::dick]]
412                 return;
413             }
414             if (langConfig.skipWikiLink(wikiTokenizer)) {
415                 return;
416             }
417             String linkDest;
418             if (wikiTokenizer.wikiLinkDest() != null) {
419                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
420             } else {
421                 linkDest = wikiTokenizer.wikiLinkText();
422             }
423             if (sectionEntryTypeName != null) {
424                 // TODO: inside a definition, this could be the wrong language.
425                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
426             }
427             if (!StringUtil.isNullOrEmpty(linkDest)) {
428                 builder.append("<a href=\"");
429                 builder.append(HtmlEntry.formatQuickdicUrl("", linkDest));
430                 builder.append("\">");
431                 super.onWikiLink(wikiTokenizer);
432                 builder.append("</a>");
433             } else {
434                 super.onWikiLink(wikiTokenizer);
435             }
436         }
437
438         @Override
439         public void onFunction(WikiTokenizer wikiTokenizer, String name,
440                                List<String> args, Map<String, String> namedArgs) {
441             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
442                 namedArgs.remove("lang");
443             }
444             super.onFunction(wikiTokenizer, name, args, namedArgs);
445         }
446
447         @Override
448         public void onHtml(WikiTokenizer wikiTokenizer) {
449             super.onHtml(wikiTokenizer);
450         }
451
452         @Override
453         public void onNewline(WikiTokenizer wikiTokenizer) {
454         }
455
456         EntryTypeName sectionEntryTypeName;
457         IndexBuilder currentIndexBuilder;
458
459         @Override
460         public void onHeading(WikiTokenizer wikiTokenizer) {
461             final String headingText = wikiTokenizer.headingWikiText();
462             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
463             final int depth = wikiTokenizer.headingDepth();
464             if (langConfig.skipSection(headingText)) {
465                 //System.out.println("Skipping section:" + headingText);
466                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
467                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
468                         // System.out.println("Resume on: " + wikiTokenizer.token());
469                         wikiTokenizer.returnToLineStart();
470                         return;
471                     } else {
472                         // System.out.println("Skipped: " + wikiTokenizer.token());
473                     }
474                 }
475                 return;
476             }
477             builder.append("\n<h");
478             builder.append(depth);
479             builder.append('>');
480             dispatch(headingText, null);
481             builder.append("</h");
482             builder.append(depth);
483             builder.append(">\n");
484         }
485
486         final List<Character> listPrefixStack = new ArrayList<>();
487
488         @Override
489         public void onListItem(WikiTokenizer wikiTokenizer) {
490             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
491                 builder.append("\n");
492             }
493             final String prefix = wikiTokenizer.listItemPrefix();
494             while (listPrefixStack.size() < prefix.length()) {
495                 builder.append('<');
496                 builder.append(WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size())));
497                 builder.append('>');
498                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
499             }
500             builder.append("<li>");
501             dispatch(wikiTokenizer.listItemWikiText(), null);
502             builder.append("</li>\n");
503
504             WikiTokenizer nextToken = wikiTokenizer.nextToken();
505             boolean returnToLineStart = false;
506             if (nextToken != null && nextToken.isNewline()) {
507                 nextToken = nextToken.nextToken();
508                 returnToLineStart = true;
509             }
510             final String nextListHeader;
511             if (nextToken == null || !nextToken.isListItem()) {
512                 nextListHeader = "";
513             } else {
514                 nextListHeader = nextToken.listItemPrefix();
515             }
516             if (returnToLineStart) {
517                 wikiTokenizer.returnToLineStart();
518             }
519             while (listPrefixStack.size() > nextListHeader.length()) {
520                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
521                 builder.append("</");
522                 builder.append(WikiTokenizer.getListTag(prefixChar));
523                 builder.append(">\n");
524             }
525         }
526
527         boolean boldOn = false;
528         boolean italicOn = false;
529
530         @Override
531         public void onMarkup(WikiTokenizer wikiTokenizer) {
532             if ("'''".equals(wikiTokenizer.token())) {
533                 if (!boldOn) {
534                     builder.append("<b>");
535                 } else {
536                     builder.append("</b>");
537                 }
538                 boldOn = !boldOn;
539             } else if ("''".equals(wikiTokenizer.token())) {
540                 if (!italicOn) {
541                     builder.append("<em>");
542                 } else {
543                     builder.append("</em>");
544                 }
545                 italicOn = !italicOn;
546             } else {
547                 assert false;
548             }
549         }
550
551     }
552
553 }