]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Added simple parsing logic for DE and IT wiktionaries.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11 import com.sun.xml.internal.rngom.util.Uri;
12
13 import org.apache.commons.lang3.StringEscapeUtils;
14
15 import java.util.ArrayList;
16 import java.util.LinkedHashMap;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.regex.Pattern;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*(Translations|Anagrams|References).*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41             
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51                     // We need to put it in the other index, too (probably)
52                     return null;
53                 }
54                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
55                     return null;
56                 }
57                 return null;
58             }
59             
60             @Override
61             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62                 final String wikiText = wikiTokenizer.wikiLinkText();
63                 if (wikiText.startsWith("Category:")) {
64                     return true;
65                 }
66                 return false;
67             }
68             @Override
69             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
70                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
71                     return null;
72                 }
73                 final int hashPos = wikiLinkDest.indexOf("#");
74                 if (hashPos != -1) {
75                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
76                     if (wikiLinkDest.isEmpty()) {
77                         wikiLinkDest = wikiLinkText;
78                     }
79                 }
80                 return wikiLinkDest;
81             }
82
83             @Override
84             public void addFunctionCallbacks(
85                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
87             }
88         });
89         
90         final Pattern deSkipSections = Pattern.compile(".*(Übersetzungen|Referenzen|Quellen).*");
91         isoToLangConfig.put("DE", new LangConfig() {
92             @Override
93             public boolean skipSection(String headingText) {
94                 return deSkipSections.matcher(headingText).matches();
95             }
96             
97             @Override
98             public EntryTypeName sectionNameToEntryType(String sectionName) {
99                 if (sectionName.equalsIgnoreCase("Synonyme")) {
100                     return EntryTypeName.SYNONYM_MULTI;
101                 }
102                 if (sectionName.equalsIgnoreCase("Gegenwörter")) {
103                     return EntryTypeName.ANTONYM_MULTI;
104                 }
105                 return null;
106             }
107             
108             @Override
109             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
110                 final String wikiText = wikiTokenizer.wikiLinkText();
111                 if (wikiText.startsWith("???Category:")) {
112                     return true;
113                 }
114                 return false;
115             }
116             @Override
117             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
118                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
119                     return null;
120                 }
121                 final int hashPos = wikiLinkDest.indexOf("#");
122                 if (hashPos != -1) {
123                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
124                     if (wikiLinkDest.isEmpty()) {
125                         wikiLinkDest = wikiLinkText;
126                     }
127                 }
128                 return wikiLinkDest;
129             }
130
131             @Override
132             public void addFunctionCallbacks(
133                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
134                 DeFunctionCallbacks.addGenericCallbacks(functionCallbacks);
135             }
136         });
137         
138         final Pattern itSkipSections = Pattern.compile(".*(Traduzione|Note / Riferimenti).*");
139         isoToLangConfig.put("IT", new LangConfig() {
140             @Override
141             public boolean skipSection(String headingText) {
142                 return itSkipSections.matcher(headingText).matches();
143             }
144             
145             @Override
146             public EntryTypeName sectionNameToEntryType(String sectionName) {
147                 if (sectionName.equalsIgnoreCase("Sinonimi")) {
148                     return EntryTypeName.SYNONYM_MULTI;
149                 }
150                 if (sectionName.equalsIgnoreCase("Antonimi/Contrari")) {
151                     return EntryTypeName.ANTONYM_MULTI;
152                 }
153                 return null;
154             }
155             
156             @Override
157             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
158                 final String wikiText = wikiTokenizer.wikiLinkText();
159                 if (wikiText.startsWith("???Category:")) {
160                     return true;
161                 }
162                 return false;
163             }
164             @Override
165             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
166                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
167                     return null;
168                 }
169                 final int hashPos = wikiLinkDest.indexOf("#");
170                 if (hashPos != -1) {
171                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
172                     if (wikiLinkDest.isEmpty()) {
173                         wikiLinkDest = wikiLinkText;
174                     }
175                 }
176                 return wikiLinkDest;
177             }
178
179             @Override
180             public void addFunctionCallbacks(
181                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
182                 ItFunctionCallbacks.addGenericCallbacks(functionCallbacks);
183             }
184         });
185
186
187         
188         final LangConfig basicLangConfig = new LangConfig() {
189             @Override
190             public boolean skipSection(String headingText) {
191                 return false;
192             }
193             @Override
194             public EntryTypeName sectionNameToEntryType(String sectionName) {
195                 return EntryTypeName.WIKTIONARY_MENTIONED;
196             }
197             @Override
198             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
199                 final String wikiText = wikiTokenizer.wikiLinkText();
200                 if (wikiText.startsWith("Category:")) {
201                     return true;
202                 }
203                 return false;
204             }
205             @Override
206             public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) {
207                 return wikiLinkDest;
208             }
209
210             @Override
211             public void addFunctionCallbacks(
212                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
213             }
214         };
215         isoToLangConfig.put("FR", basicLangConfig);
216     }
217
218     final IndexBuilder titleIndexBuilder;
219     final IndexBuilder defIndexBuilder;
220     final String skipLangIso;
221     final LangConfig langConfig;
222     final String webUrlTemplate;
223     
224
225     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
226             final String webUrlTemplate) {
227         this.titleIndexBuilder = titleIndexBuilder;
228         this.defIndexBuilder = defIndexBuilder;
229         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
230         this.langConfig = isoToLangConfig.get(wiktionaryIso);
231         this.skipLangIso = skipLangIso;
232         this.webUrlTemplate = webUrlTemplate;
233     }
234     
235     IndexedEntry indexedEntry = null;
236
237     @Override
238     public void parseSection(String heading, String text) {
239         assert entrySource != null;
240         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
241         indexedEntry = new IndexedEntry(htmlEntry);
242
243         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
244                 this);
245         langConfig.addFunctionCallbacks(callback.functionCallbacks);
246
247         callback.builder = new StringBuilder();
248         callback.indexedEntry = indexedEntry;
249         callback.dispatch(text, null);
250
251         if (webUrlTemplate != null) {
252             final String webUrl = String.format(webUrlTemplate, title);
253             callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", Uri.escapeDisallowedChars(webUrl), escapeHtmlLiteral(webUrl)));
254         }
255         htmlEntry.html = callback.builder.toString();
256         indexedEntry.isValid = true;
257
258         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
259         tokenData.hasMainEntry = true;
260
261         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
262         tokenData.htmlEntries.add(htmlEntry);
263         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
264         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
265         
266         indexedEntry = null;
267     }
268
269     @Override
270     void removeUselessArgs(Map<String, String> namedArgs) {
271     }
272     
273     @Override
274     public void addLinkToCurrentEntry(String token, final String lang, EntryTypeName entryTypeName) {
275         if (lang == null || lang.equals(skipLangIso)) {
276             titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
277         }
278     }
279     
280     public static String escapeHtmlLiteral(final String plainText) {
281         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
282         if (StringUtil.isAscii(htmlEscaped)) {
283             return htmlEscaped;
284         } else { 
285             return StringUtil.escapeToPureHtmlUnicode(plainText);
286         }
287
288     }
289
290
291
292     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
293         public AppendCallback(WholeSectionToHtmlParser parser) {
294             super(parser);
295         }
296
297         @Override
298         public void onPlainText(String plainText) {
299             super.onPlainText(escapeHtmlLiteral(plainText));
300         }
301
302         @Override
303         public void onWikiLink(WikiTokenizer wikiTokenizer) {
304             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
305                 // Skips wikilinks like: [[en::dick]]
306                 return;
307             }
308             if (langConfig.skipWikiLink(wikiTokenizer)) {
309                 return;
310             }
311             String linkDest;
312             if (wikiTokenizer.wikiLinkDest() != null) {
313                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
314             } else {
315                 linkDest = wikiTokenizer.wikiLinkText();
316             }
317             if (sectionEntryTypeName != null) {
318                 // TODO: inside a definition, this could be the wrong language.
319                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
320             }
321             if (!StringUtil.isNullOrEmpty(linkDest)) {
322                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
323                 super.onWikiLink(wikiTokenizer);
324                 builder.append(String.format("</a>"));
325             } else {
326                 super.onWikiLink(wikiTokenizer);
327             }
328         }
329
330         @Override
331         public void onFunction(WikiTokenizer wikiTokenizer, String name,
332                 List<String> args, Map<String, String> namedArgs) {
333             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
334                 namedArgs.remove("lang");
335             }
336             super.onFunction(wikiTokenizer, name, args, namedArgs);
337         }
338
339         @Override
340         public void onHtml(WikiTokenizer wikiTokenizer) {
341             super.onHtml(wikiTokenizer);
342         }
343
344         @Override
345         public void onNewline(WikiTokenizer wikiTokenizer) {
346         }
347         
348         EntryTypeName sectionEntryTypeName;
349         IndexBuilder currentIndexBuilder;
350
351         @Override
352         public void onHeading(WikiTokenizer wikiTokenizer) {
353             final String headingText = wikiTokenizer.headingWikiText();
354             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
355             final int depth = wikiTokenizer.headingDepth();
356             if (langConfig.skipSection(headingText)) {
357                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
358                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
359                         wikiTokenizer.returnToLineStart();
360                         return;
361                     }
362                 }
363                 return;
364             }
365             builder.append(String.format("\n<h%d>", depth));
366             dispatch(headingText, null);
367             builder.append(String.format("</h%d>\n", depth));
368         }
369
370         final List<Character> listPrefixStack = new ArrayList<Character>();
371
372         @Override
373         public void onListItem(WikiTokenizer wikiTokenizer) {
374             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
375                 builder.append("\n");
376             }
377             final String prefix = wikiTokenizer.listItemPrefix();
378             while (listPrefixStack.size() < prefix.length()) {
379                 builder.append(String.format("<%s>",
380                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
381                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
382             }
383             builder.append("<li>");
384             dispatch(wikiTokenizer.listItemWikiText(), null);
385             builder.append("</li>\n");
386
387             WikiTokenizer nextToken = wikiTokenizer.nextToken();
388             boolean returnToLineStart = false;
389             if (nextToken != null && nextToken.isNewline()) {
390                 nextToken = nextToken.nextToken();
391                 returnToLineStart = true;
392             }
393             final String nextListHeader;
394             if (nextToken == null || !nextToken.isListItem()) {
395                 nextListHeader = "";
396             } else {
397                 nextListHeader = nextToken.listItemPrefix();
398             }
399             if (returnToLineStart) {
400                 wikiTokenizer.returnToLineStart();
401             }
402             while (listPrefixStack.size() > nextListHeader.length()) {
403                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
404                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
405             }
406         }
407
408         boolean boldOn = false;
409         boolean italicOn = false;
410
411         @Override
412         public void onMarkup(WikiTokenizer wikiTokenizer) {
413             if ("'''".equals(wikiTokenizer.token())) {
414                 if (!boldOn) {
415                     builder.append("<b>");
416                 } else {
417                     builder.append("</b>");
418                 }
419                 boldOn = !boldOn;
420             } else if ("''".equals(wikiTokenizer.token())) {
421                 if (!italicOn) {
422                     builder.append("<em>");
423                 } else {
424                     builder.append("</em>");
425                 }
426                 italicOn = !italicOn;
427             } else {
428                 assert false;
429             }
430         }
431
432     }
433
434 }