]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Remove dummy code that makes no sense/does not work.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11
12 import org.apache.commons.lang3.StringEscapeUtils;
13
14 import java.net.URI;
15 import java.util.ArrayList;
16 import java.util.LinkedHashMap;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.regex.Pattern;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41             
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51                     // We need to put it in the other index, too (probably)
52                     return null;
53                 }
54                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
55                     return null;
56                 }
57                 return null;
58             }
59             
60             @Override
61             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62                 final String wikiText = wikiTokenizer.wikiLinkText();
63                 if (wikiText.startsWith("Category:")) {
64                     return true;
65                 }
66                 return false;
67             }
68             @Override
69             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
70                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
71                     return null;
72                 }
73                 final int hashPos = wikiLinkDest.indexOf("#");
74                 if (hashPos != -1) {
75                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
76                     if (wikiLinkDest.isEmpty()) {
77                         wikiLinkDest = wikiLinkText;
78                     }
79                 }
80                 return wikiLinkDest;
81             }
82
83             @Override
84             public void addFunctionCallbacks(
85                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
87             }
88         });
89         
90         final Pattern esSkipSections = Pattern.compile(".*(Traducciones|Locuciones).*");
91         isoToLangConfig.put("ES", new LangConfig() {
92             @Override
93             public boolean skipSection(String headingText) {
94                 return esSkipSections.matcher(headingText).matches();
95             }
96
97             @Override
98             public EntryTypeName sectionNameToEntryType(String sectionName) {
99                 if (sectionName.equalsIgnoreCase("sinónimo") || sectionName.equalsIgnoreCase("sinónimos")) {
100                     return EntryTypeName.SYNONYM_MULTI;
101                 }
102                 if (sectionName.equalsIgnoreCase("antónimo") || sectionName.equalsIgnoreCase("antónimos")) {
103                     return EntryTypeName.ANTONYM_MULTI;
104                 }
105                 return null;
106             }
107
108             @Override
109             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
110                 final String wikiText = wikiTokenizer.wikiLinkText();
111                 if (wikiText.startsWith("Categoría:")) {
112                     return true;
113                 }
114                 return false;
115             }
116             @Override
117             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
118                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
119                     return null;
120                 }
121                 final int hashPos = wikiLinkDest.indexOf("#");
122                 if (hashPos != -1) {
123                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
124                     if (wikiLinkDest.isEmpty()) {
125                         wikiLinkDest = wikiLinkText;
126                     }
127                 }
128                 return wikiLinkDest;
129             }
130
131             @Override
132             public void addFunctionCallbacks(
133                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
134                 // TODO: need Spanish variant
135             }
136         });
137
138         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
139         isoToLangConfig.put("DE", new LangConfig() {
140             @Override
141             public boolean skipSection(String headingText) {
142                 return deSkipSections.matcher(headingText).matches();
143             }
144             
145             @Override
146             public EntryTypeName sectionNameToEntryType(String sectionName) {
147                 if (sectionName.equalsIgnoreCase("Synonyme")) {
148                     return EntryTypeName.SYNONYM_MULTI;
149                 }
150                 if (sectionName.equalsIgnoreCase("Gegenwörter")) {
151                     return EntryTypeName.ANTONYM_MULTI;
152                 }
153                 return null;
154             }
155             
156             @Override
157             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
158                 final String wikiText = wikiTokenizer.wikiLinkText();
159                 if (wikiText.startsWith("???Category:")) {
160                     return true;
161                 }
162                 return false;
163             }
164             @Override
165             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
166                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
167                     return null;
168                 }
169                 final int hashPos = wikiLinkDest.indexOf("#");
170                 if (hashPos != -1) {
171                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
172                     if (wikiLinkDest.isEmpty()) {
173                         wikiLinkDest = wikiLinkText;
174                     }
175                 }
176                 return wikiLinkDest;
177             }
178
179             @Override
180             public void addFunctionCallbacks(
181                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
182                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
183             }
184         });
185         
186         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
187         isoToLangConfig.put("IT", new LangConfig() {
188             @Override
189             public boolean skipSection(String headingText) {
190                 return itSkipSections.matcher(headingText).matches();
191             }
192             
193             @Override
194             public EntryTypeName sectionNameToEntryType(String sectionName) {
195                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
196                     return EntryTypeName.SYNONYM_MULTI;
197                 }
198                 if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
199                     return EntryTypeName.ANTONYM_MULTI;
200                 }
201                 return null;
202             }
203             
204             @Override
205             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
206                 final String wikiText = wikiTokenizer.wikiLinkText();
207                 if (wikiText.startsWith("???Category:")) {
208                     return true;
209                 }
210                 return false;
211             }
212             @Override
213             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
214                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
215                     return null;
216                 }
217                 final int hashPos = wikiLinkDest.indexOf("#");
218                 if (hashPos != -1) {
219                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
220                     if (wikiLinkDest.isEmpty()) {
221                         wikiLinkDest = wikiLinkText;
222                     }
223                 }
224                 return wikiLinkDest;
225             }
226
227             @Override
228             public void addFunctionCallbacks(
229                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
230                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
231             }
232         });
233
234
235         final Pattern frSkipSections = Pattern.compile(".*(Traductions|[Aa]nagrammes).*");
236         isoToLangConfig.put("FR", new LangConfig() {
237             @Override
238             public boolean skipSection(String headingText) {
239                 return frSkipSections.matcher(headingText).matches();
240             }
241             
242             @Override
243             public EntryTypeName sectionNameToEntryType(String sectionName) {
244                 if (sectionName.equalsIgnoreCase("Synonymes")) {
245                     return EntryTypeName.SYNONYM_MULTI;
246                 }
247                 return null;
248             }
249             
250             @Override
251             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
252                 return false;
253             }
254             @Override
255             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
256                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
257                     return null;
258                 }
259                 final int hashPos = wikiLinkDest.indexOf("#");
260                 if (hashPos != -1) {
261                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
262                     if (wikiLinkDest.isEmpty()) {
263                         wikiLinkDest = wikiLinkText;
264                     }
265                 }
266                 return wikiLinkDest;
267             }
268
269             @Override
270             public void addFunctionCallbacks(
271                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
272                 FrFunctionCallbacks.addGenericCallbacks(functionCallbacks);
273             }
274         });
275     }
276
277     final IndexBuilder titleIndexBuilder;
278     final IndexBuilder defIndexBuilder;
279     final String skipLangIso;
280     final LangConfig langConfig;
281     final String webUrlTemplate;
282     
283
284     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
285             final String webUrlTemplate) {
286         this.titleIndexBuilder = titleIndexBuilder;
287         this.defIndexBuilder = defIndexBuilder;
288         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
289         this.langConfig = isoToLangConfig.get(wiktionaryIso);
290         this.skipLangIso = skipLangIso;
291         this.webUrlTemplate = webUrlTemplate;
292     }
293     
294     IndexedEntry indexedEntry = null;
295
296     @Override
297     public void parseSection(String heading, String text) {
298         assert entrySource != null;
299         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
300         indexedEntry = new IndexedEntry(htmlEntry);
301
302         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
303                 this);
304         langConfig.addFunctionCallbacks(callback.functionCallbacks);
305
306         callback.builder = new StringBuilder();
307         callback.indexedEntry = indexedEntry;
308         callback.dispatch(text, null);
309
310         if (webUrlTemplate != null) {
311             final String webUrl = String.format(webUrlTemplate, title);
312             // URI.create can raise an exception e.g. if webUrl contains %, just ignore those cases.
313             try {
314             callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", URI.create(webUrl).toString(), escapeHtmlLiteral(webUrl)));
315             } catch (Exception e)
316             {}
317         }
318         htmlEntry.html = callback.builder.toString();
319         indexedEntry.isValid = true;
320
321         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
322         tokenData.hasMainEntry = true;
323
324         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
325         tokenData.htmlEntries.add(htmlEntry);
326         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
327         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
328         
329         indexedEntry = null;
330     }
331
    /**
     * No-op override: this parser leaves the named arguments untouched.
     *
     * @param namedArgs the named template arguments; not modified here
     */
    @Override
    void removeUselessArgs(Map<String, String> namedArgs) {
    }
335     
336     @Override
337     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
338         if (lang == null || lang.equals(skipLangIso)) {
339             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
340         }
341     }
342     
343     public static String escapeHtmlLiteral(final String plainText) {
344         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
345         if (StringUtil.isAscii(htmlEscaped)) {
346             return htmlEscaped;
347         } else { 
348             return StringUtil.escapeUnicodeToPureHtml(plainText);
349         }
350
351     }
352
353
354
355     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
        /**
         * Creates the callback, handing the owning parser to the superclass.
         *
         * @param parser the parser passed on to the superclass constructor
         */
        public AppendCallback(WholeSectionToHtmlParser parser) {
            super(parser);
        }
359
360         @Override
361         public void onPlainText(String plainText) {
362             super.onPlainText(escapeHtmlLiteral(plainText));
363         }
364
365         @Override
366         public void onWikiLink(WikiTokenizer wikiTokenizer) {
367             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
368                 // Skips wikilinks like: [[en::dick]]
369                 return;
370             }
371             if (langConfig.skipWikiLink(wikiTokenizer)) {
372                 return;
373             }
374             String linkDest;
375             if (wikiTokenizer.wikiLinkDest() != null) {
376                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
377             } else {
378                 linkDest = wikiTokenizer.wikiLinkText();
379             }
380             if (sectionEntryTypeName != null) {
381                 // TODO: inside a definition, this could be the wrong language.
382                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
383             }
384             if (!StringUtil.isNullOrEmpty(linkDest)) {
385                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
386                 super.onWikiLink(wikiTokenizer);
387                 builder.append(String.format("</a>"));
388             } else {
389                 super.onWikiLink(wikiTokenizer);
390             }
391         }
392
393         @Override
394         public void onFunction(WikiTokenizer wikiTokenizer, String name,
395                 List<String> args, Map<String, String> namedArgs) {
396             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
397                 namedArgs.remove("lang");
398             }
399             super.onFunction(wikiTokenizer, name, args, namedArgs);
400         }
401
        /**
         * Handles an HTML token by delegating unchanged to the superclass.
         *
         * @param wikiTokenizer tokenizer positioned on the HTML token
         */
        @Override
        public void onHtml(WikiTokenizer wikiTokenizer) {
            super.onHtml(wikiTokenizer);
        }
406
        /**
         * No-op override: this method writes nothing to the builder for a newline token.
         *
         * @param wikiTokenizer tokenizer positioned on the newline; unused
         */
        @Override
        public void onNewline(WikiTokenizer wikiTokenizer) {
        }
410         
411         EntryTypeName sectionEntryTypeName;
412         IndexBuilder currentIndexBuilder;
413
414         @Override
415         public void onHeading(WikiTokenizer wikiTokenizer) {
416             final String headingText = wikiTokenizer.headingWikiText();
417             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
418             final int depth = wikiTokenizer.headingDepth();
419             if (langConfig.skipSection(headingText)) {
420                 //System.out.println("Skipping section:" + headingText);
421                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
422                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
423                         // System.out.println("Resume on: " + wikiTokenizer.token());
424                         wikiTokenizer.returnToLineStart();
425                         return;
426                     } else {
427                         // System.out.println("Skipped: " + wikiTokenizer.token());
428                     }
429                 }
430                 return;
431             }
432             builder.append(String.format("\n<h%d>", depth));
433             dispatch(headingText, null);
434             builder.append(String.format("</h%d>\n", depth));
435         }
436
437         final List<Character> listPrefixStack = new ArrayList<Character>();
438
        /**
         * Emits one list item as {@code <li>...</li>}, opening list tags when the
         * item's prefix is longer than the current nesting and closing them when the
         * following list item (if any) has a shorter prefix.
         *
         * @param wikiTokenizer tokenizer positioned on the list item
         */
        @Override
        public void onListItem(WikiTokenizer wikiTokenizer) {
            // Make sure list markup starts on a fresh line of the output.
            if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
                builder.append("\n");
            }
            final String prefix = wikiTokenizer.listItemPrefix();
            // Open one list tag per nesting level that is not on the stack yet. Only
            // the prefix length is compared with the stack, not the characters.
            while (listPrefixStack.size() < prefix.length()) {
                builder.append(String.format("<%s>",
                        WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
                listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
            }
            builder.append("<li>");
            dispatch(wikiTokenizer.listItemWikiText(), null);
            builder.append("</li>\n");

            // Look ahead, past at most one newline, to see whether another list item follows.
            WikiTokenizer nextToken = wikiTokenizer.nextToken();
            boolean returnToLineStart = false;
            if (nextToken != null && nextToken.isNewline()) {
                nextToken = nextToken.nextToken();
                returnToLineStart = true;
            }
            final String nextListHeader;
            if (nextToken == null || !nextToken.isListItem()) {
                nextListHeader = "";
            } else {
                nextListHeader = nextToken.listItemPrefix();
            }
            // Rewind only when the look-ahead went past a newline.
            // NOTE(review): presumably this lets the caller re-read the token that
            // follows the newline - confirm against WikiTokenizer.returnToLineStart.
            if (returnToLineStart) {
                wikiTokenizer.returnToLineStart();
            }
            // Close the list levels that the next item (or the end of the list) no longer uses.
            while (listPrefixStack.size() > nextListHeader.length()) {
                final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
                builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
            }
        }
474
475         boolean boldOn = false;
476         boolean italicOn = false;
477
478         @Override
479         public void onMarkup(WikiTokenizer wikiTokenizer) {
480             if ("'''".equals(wikiTokenizer.token())) {
481                 if (!boldOn) {
482                     builder.append("<b>");
483                 } else {
484                     builder.append("</b>");
485                 }
486                 boldOn = !boldOn;
487             } else if ("''".equals(wikiTokenizer.token())) {
488                 if (!italicOn) {
489                     builder.append("<em>");
490                 } else {
491                     builder.append("</em>");
492                 }
493                 italicOn = !italicOn;
494             } else {
495                 assert false;
496             }
497         }
498
499     }
500
501 }