]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
e0763620b7f1289bc01bee7da86d850001417d2c
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import java.net.URI;
5 import java.util.ArrayList;
6 import java.util.LinkedHashMap;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.regex.Pattern;
10
11 import org.apache.commons.text.StringEscapeUtils;
12
13 import com.hughes.android.dictionary.engine.EntryTypeName;
14 import com.hughes.android.dictionary.engine.HtmlEntry;
15 import com.hughes.android.dictionary.engine.IndexBuilder;
16 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
17 import com.hughes.android.dictionary.engine.IndexedEntry;
18 import com.hughes.android.dictionary.parser.WikiTokenizer;
19 import com.hughes.util.StringUtil;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31             Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51                     // We need to put it in the other index, too (probably)
52                     return null;
53                 }
54                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
55                     return null;
56                 }
57                 return null;
58             }
59
60             @Override
61             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62                 final String wikiText = wikiTokenizer.wikiLinkText();
63                 if (wikiText.startsWith("Category:")) {
64                     return true;
65                 }
66                 return false;
67             }
68             @Override
69             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
70                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
71                     return null;
72                 }
73                 final int hashPos = wikiLinkDest.indexOf("#");
74                 if (hashPos != -1) {
75                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
76                     if (wikiLinkDest.isEmpty()) {
77                         wikiLinkDest = wikiLinkText;
78                     }
79                 }
80                 return wikiLinkDest;
81             }
82
83             @Override
84             public void addFunctionCallbacks(
85                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
87             }
88         });
89
90         final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
91         isoToLangConfig.put("ES", new LangConfig() {
92             @Override
93             public boolean skipSection(String headingText) {
94                 return esSkipSections.matcher(headingText).matches();
95             }
96
97             @Override
98             public EntryTypeName sectionNameToEntryType(String sectionName) {
99                 if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
100                     return EntryTypeName.SYNONYM_MULTI;
101                 }
102                 if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
103                     return EntryTypeName.ANTONYM_MULTI;
104                 }
105                 return null;
106             }
107
108             @Override
109             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
110                 final String wikiText = wikiTokenizer.wikiLinkText();
111                 if (wikiText.startsWith("Categoría:")) {
112                     return true;
113                 }
114                 return false;
115             }
116             @Override
117             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
118                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
119                     return null;
120                 }
121                 final int hashPos = wikiLinkDest.indexOf("#");
122                 if (hashPos != -1) {
123                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
124                     if (wikiLinkDest.isEmpty()) {
125                         wikiLinkDest = wikiLinkText;
126                     }
127                 }
128                 return wikiLinkDest;
129             }
130
131             @Override
132             public void addFunctionCallbacks(
133                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
134                 // TODO: need Spanish variant
135             }
136         });
137
138         final Pattern ptSkipSections = Pattern.compile(".*Tradução.*");
139         isoToLangConfig.put("PT", new LangConfig() {
140             @Override
141             public boolean skipSection(String headingText) {
142                 return esSkipSections.matcher(headingText).matches();
143             }
144
145             @Override
146             public EntryTypeName sectionNameToEntryType(String sectionName) {
147                 if (sectionName.equalsIgnoreCase("Sinônimo") || sectionName.equalsIgnoreCase("Sinônimos")) {
148                     return EntryTypeName.SYNONYM_MULTI;
149                 }
150                 if (sectionName.equalsIgnoreCase("Antônimo") || sectionName.equalsIgnoreCase("Antônimos")) {
151                     return EntryTypeName.ANTONYM_MULTI;
152                 }
153                 return null;
154             }
155
156             @Override
157             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
158                 final String wikiText = wikiTokenizer.wikiLinkText();
159                 if (wikiText.startsWith("Categoria:")) {
160                     return true;
161                 }
162                 return false;
163             }
164             @Override
165             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
166                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
167                     return null;
168                 }
169                 final int hashPos = wikiLinkDest.indexOf("#");
170                 if (hashPos != -1) {
171                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
172                     if (wikiLinkDest.isEmpty()) {
173                         wikiLinkDest = wikiLinkText;
174                     }
175                 }
176                 return wikiLinkDest;
177             }
178
179             @Override
180             public void addFunctionCallbacks(
181                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
182                 // TODO: need Portuguese variant
183             }
184         });
185
186         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
187         isoToLangConfig.put("DE", new LangConfig() {
188             @Override
189             public boolean skipSection(String headingText) {
190                 return deSkipSections.matcher(headingText).matches();
191             }
192
193             @Override
194             public EntryTypeName sectionNameToEntryType(String sectionName) {
195                 if (sectionName.equalsIgnoreCase("Synonyme")) {
196                     return EntryTypeName.SYNONYM_MULTI;
197                 }
198                 if (sectionName.equalsIgnoreCase("Gegenwörter")) {
199                     return EntryTypeName.ANTONYM_MULTI;
200                 }
201                 return null;
202             }
203
204             @Override
205             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
206                 final String wikiText = wikiTokenizer.wikiLinkText();
207                 if (wikiText.startsWith("Kategorie:")) {
208                     return true;
209                 }
210                 return false;
211             }
212             @Override
213             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
214                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
215                     return null;
216                 }
217                 final int hashPos = wikiLinkDest.indexOf("#");
218                 if (hashPos != -1) {
219                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
220                     if (wikiLinkDest.isEmpty()) {
221                         wikiLinkDest = wikiLinkText;
222                     }
223                 }
224                 return wikiLinkDest;
225             }
226
227             @Override
228             public void addFunctionCallbacks(
229                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
230                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
231             }
232         });
233
234         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
235         isoToLangConfig.put("IT", new LangConfig() {
236             @Override
237             public boolean skipSection(String headingText) {
238                 return itSkipSections.matcher(headingText).matches();
239             }
240
241             @Override
242             public EntryTypeName sectionNameToEntryType(String sectionName) {
243                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
244                     return EntryTypeName.SYNONYM_MULTI;
245                 }
246                 if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
247                     return EntryTypeName.ANTONYM_MULTI;
248                 }
249                 return null;
250             }
251
252             @Override
253             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
254                 final String wikiText = wikiTokenizer.wikiLinkText();
255                 if (wikiText.startsWith("Categoria:")) {
256                     return true;
257                 }
258                 return false;
259             }
260             @Override
261             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
262                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
263                     return null;
264                 }
265                 final int hashPos = wikiLinkDest.indexOf("#");
266                 if (hashPos != -1) {
267                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
268                     if (wikiLinkDest.isEmpty()) {
269                         wikiLinkDest = wikiLinkText;
270                     }
271                 }
272                 return wikiLinkDest;
273             }
274
275             @Override
276             public void addFunctionCallbacks(
277                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
278                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
279             }
280         });
281
282
283         final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
284         isoToLangConfig.put("FR", new LangConfig() {
285             @Override
286             public boolean skipSection(String headingText) {
287                 return frSkipSections.matcher(headingText).matches();
288             }
289
290             @Override
291             public EntryTypeName sectionNameToEntryType(String sectionName) {
292                 if (sectionName.equalsIgnoreCase("Synonymes")) {
293                     return EntryTypeName.SYNONYM_MULTI;
294                 }
295                 if (sectionName.equalsIgnoreCase("Antonymes")) {
296                     return EntryTypeName.ANTONYM_MULTI;
297                 }
298                 return null;
299             }
300
301             @Override
302             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
303                 final String wikiText = wikiTokenizer.wikiLinkText();
304                 if (wikiText.startsWith("Catégorie:")) {
305                     return true;
306                 }
307                 return false;
308             }
309             @Override
310             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
311                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
312                     return null;
313                 }
314                 final int hashPos = wikiLinkDest.indexOf("#");
315                 if (hashPos != -1) {
316                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
317                     if (wikiLinkDest.isEmpty()) {
318                         wikiLinkDest = wikiLinkText;
319                     }
320                 }
321                 return wikiLinkDest;
322             }
323
324             @Override
325             public void addFunctionCallbacks(
326                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
327                 FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
328             }
329         });
330     }
331
332     final IndexBuilder titleIndexBuilder;
333     final IndexBuilder defIndexBuilder;
334     final String skipLangIso;
335     final LangConfig langConfig;
336     final String webUrlTemplate;
337
338
339     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
340                                     final String webUrlTemplate) {
341         this.titleIndexBuilder = titleIndexBuilder;
342         this.defIndexBuilder = defIndexBuilder;
343         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
344         this.langConfig = isoToLangConfig.get(wiktionaryIso);
345         this.skipLangIso = skipLangIso;
346         this.webUrlTemplate = webUrlTemplate;
347     }
348
349     IndexedEntry indexedEntry = null;
350
351     @Override
352     public void parseSection(String heading, String text) {
353         assert entrySource != null;
354         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
355         indexedEntry = new IndexedEntry(htmlEntry);
356
357         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
358             this);
359         langConfig.addFunctionCallbacks(callback.functionCallbacks);
360
361         callback.builder = new StringBuilder();
362         callback.indexedEntry = indexedEntry;
363         callback.dispatch(text, null);
364
365         if (webUrlTemplate != null) {
366             final String webUrl = String.format(webUrlTemplate, title);
367             // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
368             try {
369                 callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
370             } catch (Exception e) {
371             }
372         }
373         htmlEntry.html = callback.builder.toString();
374         indexedEntry.isValid = true;
375
376         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
377         tokenData.hasMainEntry = true;
378
379         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
380         tokenData.htmlEntries.add(htmlEntry);
381         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
382         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
383
384         indexedEntry = null;
385     }
386
387     @Override
388     void removeUselessArgs(Map<String, String> namedArgs) {
389     }
390
391     @Override
392     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
393         if (lang == null || lang.equals(skipLangIso)) {
394             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
395         }
396     }
397
398     public static String escapeHtmlLiteral(final String plainText) {
399         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
400         if (StringUtil.isAscii(htmlEscaped)) {
401             return htmlEscaped;
402         } else {
403             return StringUtil.escapeUnicodeToPureHtml(plainText);
404         }
405
406     }
407
408
409
410     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
411         public AppendCallback(WholeSectionToHtmlParser parser) {
412             super(parser);
413         }
414
415         @Override
416         public void onPlainText(String plainText) {
417             super.onPlainText(escapeHtmlLiteral(plainText));
418         }
419
420         @Override
421         public void onWikiLink(WikiTokenizer wikiTokenizer) {
422             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
423                 // Skips wikilinks like: [[en::dick]]
424                 return;
425             }
426             if (langConfig.skipWikiLink(wikiTokenizer)) {
427                 return;
428             }
429             String linkDest;
430             if (wikiTokenizer.wikiLinkDest() != null) {
431                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
432             } else {
433                 linkDest = wikiTokenizer.wikiLinkText();
434             }
435             if (sectionEntryTypeName != null) {
436                 // TODO: inside a definition, this could be the wrong language.
437                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
438             }
439             if (!StringUtil.isNullOrEmpty(linkDest)) {
440                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
441                 super.onWikiLink(wikiTokenizer);
442                 builder.append(String.format("</a>"));
443             } else {
444                 super.onWikiLink(wikiTokenizer);
445             }
446         }
447
448         @Override
449         public void onFunction(WikiTokenizer wikiTokenizer, String name,
450                                List<String> args, Map<String, String> namedArgs) {
451             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
452                 namedArgs.remove("lang");
453             }
454             super.onFunction(wikiTokenizer, name, args, namedArgs);
455         }
456
457         @Override
458         public void onHtml(WikiTokenizer wikiTokenizer) {
459             super.onHtml(wikiTokenizer);
460         }
461
462         @Override
463         public void onNewline(WikiTokenizer wikiTokenizer) {
464         }
465
466         EntryTypeName sectionEntryTypeName;
467         IndexBuilder currentIndexBuilder;
468
469         @Override
470         public void onHeading(WikiTokenizer wikiTokenizer) {
471             final String headingText = wikiTokenizer.headingWikiText();
472             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
473             final int depth = wikiTokenizer.headingDepth();
474             if (langConfig.skipSection(headingText)) {
475                 //System.out.println("Skipping section:" + headingText);
476                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
477                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
478                         // System.out.println("Resume on: " + wikiTokenizer.token());
479                         wikiTokenizer.returnToLineStart();
480                         return;
481                     } else {
482                         // System.out.println("Skipped: " + wikiTokenizer.token());
483                     }
484                 }
485                 return;
486             }
487             builder.append(String.format("\n<h%d>", depth));
488             dispatch(headingText, null);
489             builder.append(String.format("</h%d>\n", depth));
490         }
491
492         final List<Character> listPrefixStack = new ArrayList<Character>();
493
494         @Override
495         public void onListItem(WikiTokenizer wikiTokenizer) {
496             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
497                 builder.append("\n");
498             }
499             final String prefix = wikiTokenizer.listItemPrefix();
500             while (listPrefixStack.size() < prefix.length()) {
501                 builder.append(String.format("<%s>",
502                                              WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
503                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
504             }
505             builder.append("<li>");
506             dispatch(wikiTokenizer.listItemWikiText(), null);
507             builder.append("</li>\n");
508
509             WikiTokenizer nextToken = wikiTokenizer.nextToken();
510             boolean returnToLineStart = false;
511             if (nextToken != null && nextToken.isNewline()) {
512                 nextToken = nextToken.nextToken();
513                 returnToLineStart = true;
514             }
515             final String nextListHeader;
516             if (nextToken == null || !nextToken.isListItem()) {
517                 nextListHeader = "";
518             } else {
519                 nextListHeader = nextToken.listItemPrefix();
520             }
521             if (returnToLineStart) {
522                 wikiTokenizer.returnToLineStart();
523             }
524             while (listPrefixStack.size() > nextListHeader.length()) {
525                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
526                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
527             }
528         }
529
530         boolean boldOn = false;
531         boolean italicOn = false;
532
533         @Override
534         public void onMarkup(WikiTokenizer wikiTokenizer) {
535             if ("'''".equals(wikiTokenizer.token())) {
536                 if (!boldOn) {
537                     builder.append("<b>");
538                 } else {
539                     builder.append("</b>");
540                 }
541                 boldOn = !boldOn;
542             } else if ("''".equals(wikiTokenizer.token())) {
543                 if (!italicOn) {
544                     builder.append("<em>");
545                 } else {
546                     builder.append("</em>");
547                 }
548                 italicOn = !italicOn;
549             } else {
550                 assert false;
551             }
552         }
553
554     }
555
556 }