]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
e861b9ddabd3da127833f83b0db3ada33c77f97e
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11
12 import org.apache.commons.lang3.StringEscapeUtils;
13
14 import java.net.URI;
15 import java.util.ArrayList;
16 import java.util.LinkedHashMap;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.regex.Pattern;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41             
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51                     // We need to put it in the other index, too (probably)
52                     return null;
53                 }
54                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
55                     return null;
56                 }
57                 return null;
58             }
59             
60             @Override
61             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62                 final String wikiText = wikiTokenizer.wikiLinkText();
63                 if (wikiText.startsWith("Category:")) {
64                     return true;
65                 }
66                 return false;
67             }
68             @Override
69             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
70                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
71                     return null;
72                 }
73                 final int hashPos = wikiLinkDest.indexOf("#");
74                 if (hashPos != -1) {
75                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
76                     if (wikiLinkDest.isEmpty()) {
77                         wikiLinkDest = wikiLinkText;
78                     }
79                 }
80                 return wikiLinkDest;
81             }
82
83             @Override
84             public void addFunctionCallbacks(
85                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
87             }
88         });
89         
90         final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
91         isoToLangConfig.put("ES", new LangConfig() {
92             @Override
93             public boolean skipSection(String headingText) {
94                 return esSkipSections.matcher(headingText).matches();
95             }
96
97             @Override
98             public EntryTypeName sectionNameToEntryType(String sectionName) {
99                 if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
100                     return EntryTypeName.SYNONYM_MULTI;
101                 }
102                 if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
103                     return EntryTypeName.ANTONYM_MULTI;
104                 }
105                 return null;
106             }
107
108             @Override
109             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
110                 final String wikiText = wikiTokenizer.wikiLinkText();
111                 if (wikiText.startsWith("Categoría:")) {
112                     return true;
113                 }
114                 return false;
115             }
116             @Override
117             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
118                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
119                     return null;
120                 }
121                 final int hashPos = wikiLinkDest.indexOf("#");
122                 if (hashPos != -1) {
123                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
124                     if (wikiLinkDest.isEmpty()) {
125                         wikiLinkDest = wikiLinkText;
126                     }
127                 }
128                 return wikiLinkDest;
129             }
130
131             @Override
132             public void addFunctionCallbacks(
133                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
134                 // TODO: need Spanish variant
135             }
136         });
137
138         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
139         isoToLangConfig.put("DE", new LangConfig() {
140             @Override
141             public boolean skipSection(String headingText) {
142                 return deSkipSections.matcher(headingText).matches();
143             }
144             
145             @Override
146             public EntryTypeName sectionNameToEntryType(String sectionName) {
147                 if (sectionName.equalsIgnoreCase("Synonyme")) {
148                     return EntryTypeName.SYNONYM_MULTI;
149                 }
150                 if (sectionName.equalsIgnoreCase("Gegenwörter")) {
151                     return EntryTypeName.ANTONYM_MULTI;
152                 }
153                 return null;
154             }
155             
156             @Override
157             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
158                 final String wikiText = wikiTokenizer.wikiLinkText();
159                 if (wikiText.startsWith("Kategorie:")) {
160                     return true;
161                 }
162                 return false;
163             }
164             @Override
165             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
166                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
167                     return null;
168                 }
169                 final int hashPos = wikiLinkDest.indexOf("#");
170                 if (hashPos != -1) {
171                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
172                     if (wikiLinkDest.isEmpty()) {
173                         wikiLinkDest = wikiLinkText;
174                     }
175                 }
176                 return wikiLinkDest;
177             }
178
179             @Override
180             public void addFunctionCallbacks(
181                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
182                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
183             }
184         });
185         
186         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
187         isoToLangConfig.put("IT", new LangConfig() {
188             @Override
189             public boolean skipSection(String headingText) {
190                 return itSkipSections.matcher(headingText).matches();
191             }
192             
193             @Override
194             public EntryTypeName sectionNameToEntryType(String sectionName) {
195                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
196                     return EntryTypeName.SYNONYM_MULTI;
197                 }
198                 if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
199                     return EntryTypeName.ANTONYM_MULTI;
200                 }
201                 return null;
202             }
203             
204             @Override
205             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
206                 final String wikiText = wikiTokenizer.wikiLinkText();
207                 if (wikiText.startsWith("Categoria:")) {
208                     return true;
209                 }
210                 return false;
211             }
212             @Override
213             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
214                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
215                     return null;
216                 }
217                 final int hashPos = wikiLinkDest.indexOf("#");
218                 if (hashPos != -1) {
219                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
220                     if (wikiLinkDest.isEmpty()) {
221                         wikiLinkDest = wikiLinkText;
222                     }
223                 }
224                 return wikiLinkDest;
225             }
226
227             @Override
228             public void addFunctionCallbacks(
229                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
230                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
231             }
232         });
233
234
235         final Pattern frSkipSections = Pattern.compile(".*([Tt]raductions|[Aa]nagrammes).*");
236         isoToLangConfig.put("FR", new LangConfig() {
237             @Override
238             public boolean skipSection(String headingText) {
239                 return frSkipSections.matcher(headingText).matches();
240             }
241             
242             @Override
243             public EntryTypeName sectionNameToEntryType(String sectionName) {
244                 if (sectionName.equalsIgnoreCase("Synonymes")) {
245                     return EntryTypeName.SYNONYM_MULTI;
246                 }
247                 if (sectionName.equalsIgnoreCase("Antonymes")) {
248                     return EntryTypeName.ANTONYM_MULTI;
249                 }
250                 return null;
251             }
252             
253             @Override
254             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
255                 final String wikiText = wikiTokenizer.wikiLinkText();
256                 if (wikiText.startsWith("Catégorie:")) {
257                     return true;
258                 }
259                 return false;
260             }
261             @Override
262             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
263                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
264                     return null;
265                 }
266                 final int hashPos = wikiLinkDest.indexOf("#");
267                 if (hashPos != -1) {
268                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
269                     if (wikiLinkDest.isEmpty()) {
270                         wikiLinkDest = wikiLinkText;
271                     }
272                 }
273                 return wikiLinkDest;
274             }
275
276             @Override
277             public void addFunctionCallbacks(
278                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
279                 FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
280             }
281         });
282     }
283
284     final IndexBuilder titleIndexBuilder;
285     final IndexBuilder defIndexBuilder;
286     final String skipLangIso;
287     final LangConfig langConfig;
288     final String webUrlTemplate;
289     
290
291     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
292             final String webUrlTemplate) {
293         this.titleIndexBuilder = titleIndexBuilder;
294         this.defIndexBuilder = defIndexBuilder;
295         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
296         this.langConfig = isoToLangConfig.get(wiktionaryIso);
297         this.skipLangIso = skipLangIso;
298         this.webUrlTemplate = webUrlTemplate;
299     }
300     
301     IndexedEntry indexedEntry = null;
302
303     @Override
304     public void parseSection(String heading, String text) {
305         assert entrySource != null;
306         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
307         indexedEntry = new IndexedEntry(htmlEntry);
308
309         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
310                 this);
311         langConfig.addFunctionCallbacks(callback.functionCallbacks);
312
313         callback.builder = new StringBuilder();
314         callback.indexedEntry = indexedEntry;
315         callback.dispatch(text, null);
316
317         if (webUrlTemplate != null) {
318             final String webUrl = String.format(webUrlTemplate, title);
319             // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
320             try {
321             callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toASCIIString(), escapeHtmlLiteral(webUrl)));
322             } catch (Exception e)
323             {}
324         }
325         htmlEntry.html = callback.builder.toString();
326         indexedEntry.isValid = true;
327
328         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
329         tokenData.hasMainEntry = true;
330
331         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
332         tokenData.htmlEntries.add(htmlEntry);
333         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
334         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
335         
336         indexedEntry = null;
337     }
338
339     @Override
340     void removeUselessArgs(Map<String, String> namedArgs) {
341     }
342     
343     @Override
344     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
345         if (lang == null || lang.equals(skipLangIso)) {
346             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
347         }
348     }
349     
350     public static String escapeHtmlLiteral(final String plainText) {
351         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
352         if (StringUtil.isAscii(htmlEscaped)) {
353             return htmlEscaped;
354         } else { 
355             return StringUtil.escapeUnicodeToPureHtml(plainText);
356         }
357
358     }
359
360
361
362     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
363         public AppendCallback(WholeSectionToHtmlParser parser) {
364             super(parser);
365         }
366
367         @Override
368         public void onPlainText(String plainText) {
369             super.onPlainText(escapeHtmlLiteral(plainText));
370         }
371
372         @Override
373         public void onWikiLink(WikiTokenizer wikiTokenizer) {
374             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
375                 // Skips wikilinks like: [[en::dick]]
376                 return;
377             }
378             if (langConfig.skipWikiLink(wikiTokenizer)) {
379                 return;
380             }
381             String linkDest;
382             if (wikiTokenizer.wikiLinkDest() != null) {
383                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
384             } else {
385                 linkDest = wikiTokenizer.wikiLinkText();
386             }
387             if (sectionEntryTypeName != null) {
388                 // TODO: inside a definition, this could be the wrong language.
389                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
390             }
391             if (!StringUtil.isNullOrEmpty(linkDest)) {
392                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
393                 super.onWikiLink(wikiTokenizer);
394                 builder.append(String.format("</a>"));
395             } else {
396                 super.onWikiLink(wikiTokenizer);
397             }
398         }
399
400         @Override
401         public void onFunction(WikiTokenizer wikiTokenizer, String name,
402                 List<String> args, Map<String, String> namedArgs) {
403             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
404                 namedArgs.remove("lang");
405             }
406             super.onFunction(wikiTokenizer, name, args, namedArgs);
407         }
408
409         @Override
410         public void onHtml(WikiTokenizer wikiTokenizer) {
411             super.onHtml(wikiTokenizer);
412         }
413
414         @Override
415         public void onNewline(WikiTokenizer wikiTokenizer) {
416         }
417         
418         EntryTypeName sectionEntryTypeName;
419         IndexBuilder currentIndexBuilder;
420
421         @Override
422         public void onHeading(WikiTokenizer wikiTokenizer) {
423             final String headingText = wikiTokenizer.headingWikiText();
424             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
425             final int depth = wikiTokenizer.headingDepth();
426             if (langConfig.skipSection(headingText)) {
427                 //System.out.println("Skipping section:" + headingText);
428                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
429                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
430                         // System.out.println("Resume on: " + wikiTokenizer.token());
431                         wikiTokenizer.returnToLineStart();
432                         return;
433                     } else {
434                         // System.out.println("Skipped: " + wikiTokenizer.token());
435                     }
436                 }
437                 return;
438             }
439             builder.append(String.format("\n<h%d>", depth));
440             dispatch(headingText, null);
441             builder.append(String.format("</h%d>\n", depth));
442         }
443
444         final List<Character> listPrefixStack = new ArrayList<Character>();
445
446         @Override
447         public void onListItem(WikiTokenizer wikiTokenizer) {
448             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
449                 builder.append("\n");
450             }
451             final String prefix = wikiTokenizer.listItemPrefix();
452             while (listPrefixStack.size() < prefix.length()) {
453                 builder.append(String.format("<%s>",
454                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
455                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
456             }
457             builder.append("<li>");
458             dispatch(wikiTokenizer.listItemWikiText(), null);
459             builder.append("</li>\n");
460
461             WikiTokenizer nextToken = wikiTokenizer.nextToken();
462             boolean returnToLineStart = false;
463             if (nextToken != null && nextToken.isNewline()) {
464                 nextToken = nextToken.nextToken();
465                 returnToLineStart = true;
466             }
467             final String nextListHeader;
468             if (nextToken == null || !nextToken.isListItem()) {
469                 nextListHeader = "";
470             } else {
471                 nextListHeader = nextToken.listItemPrefix();
472             }
473             if (returnToLineStart) {
474                 wikiTokenizer.returnToLineStart();
475             }
476             while (listPrefixStack.size() > nextListHeader.length()) {
477                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
478                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
479             }
480         }
481
482         boolean boldOn = false;
483         boolean italicOn = false;
484
485         @Override
486         public void onMarkup(WikiTokenizer wikiTokenizer) {
487             if ("'''".equals(wikiTokenizer.token())) {
488                 if (!boldOn) {
489                     builder.append("<b>");
490                 } else {
491                     builder.append("</b>");
492                 }
493                 boldOn = !boldOn;
494             } else if ("''".equals(wikiTokenizer.token())) {
495                 if (!italicOn) {
496                     builder.append("<em>");
497                 } else {
498                     builder.append("</em>");
499                 }
500                 italicOn = !italicOn;
501             } else {
502                 assert false;
503             }
504         }
505
506     }
507
508 }