]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Fixed trailing ,s in italian verb tenses.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11 import com.sun.xml.internal.rngom.util.Uri;
12
13 import org.apache.commons.lang3.StringEscapeUtils;
14
15 import java.util.ArrayList;
16 import java.util.LinkedHashMap;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.regex.Pattern;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41             
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51                     // We need to put it in the other index, too.
52                     return null;
53                 }
54                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
55                     return null;
56                 }
57                 return null;
58             }
59             
60             @Override
61             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62                 final String wikiText = wikiTokenizer.wikiLinkText();
63                 if (wikiText.startsWith("Category:")) {
64                     return true;
65                 }
66                 return false;
67             }
68             @Override
69             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
70                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
71                     return null;
72                 }
73                 final int hashPos = wikiLinkDest.indexOf("#");
74                 if (hashPos != -1) {
75                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
76                     if (wikiLinkDest.isEmpty()) {
77                         wikiLinkDest = wikiLinkText;
78                     }
79                 }
80                 return wikiLinkDest;
81             }
82
83             @Override
84             public void addFunctionCallbacks(
85                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
87             }
88         });
89         
90         final LangConfig basicLangConfig = new LangConfig() {
91             @Override
92             public boolean skipSection(String headingText) {
93                 return false;
94             }
95             @Override
96             public EntryTypeName sectionNameToEntryType(String sectionName) {
97                 return EntryTypeName.WIKTIONARY_MENTIONED;
98             }
99             @Override
100             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
101                 final String wikiText = wikiTokenizer.wikiLinkText();
102                 if (wikiText.startsWith("Category:")) {
103                     return true;
104                 }
105                 return false;
106             }
107             @Override
108             public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) {
109                 return wikiLinkDest;
110             }
111
112             @Override
113             public void addFunctionCallbacks(
114                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
115             }
116         };
117         isoToLangConfig.put("FR", basicLangConfig);
118         isoToLangConfig.put("DE", basicLangConfig);
119         isoToLangConfig.put("IT", basicLangConfig);
120     }
121
122     final IndexBuilder titleIndexBuilder;
123     final IndexBuilder defIndexBuilder;
124     final String skipLangIso;
125     final LangConfig langConfig;
126     final String webUrlTemplate;
127     
128
129     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso,
130             final String webUrlTemplate) {
131         this.titleIndexBuilder = titleIndexBuilder;
132         this.defIndexBuilder = defIndexBuilder;
133         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
134         this.langConfig = isoToLangConfig.get(wiktionaryIso);
135         this.skipLangIso = skipLangIso;
136         this.webUrlTemplate = webUrlTemplate;
137     }
138     
139     IndexedEntry indexedEntry = null;
140
141     @Override
142     public void parseSection(String heading, String text) {
143         assert entrySource != null;
144         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
145         indexedEntry = new IndexedEntry(htmlEntry);
146
147         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
148                 this);
149         langConfig.addFunctionCallbacks(callback.functionCallbacks);
150
151         callback.builder = new StringBuilder();
152         callback.indexedEntry = indexedEntry;
153         callback.dispatch(text, null);
154
155         if (webUrlTemplate != null) {
156             final String webUrl = String.format(webUrlTemplate, title);
157             callback.builder.append(String.format("<p> <a href=\"%s\">%s</a>", Uri.escapeDisallowedChars(webUrl), escapeHtmlLiteral(webUrl)));
158         }
159         htmlEntry.html = callback.builder.toString();
160         indexedEntry.isValid = true;
161
162         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
163
164         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
165         tokenData.htmlEntries.add(htmlEntry);
166         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
167         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
168         
169         indexedEntry = null;
170     }
171
172     @Override
173     void removeUselessArgs(Map<String, String> namedArgs) {
174     }
175     
176     @Override
177     public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
178         titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
179     }
180     
181     public static String escapeHtmlLiteral(final String plainText) {
182         final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
183         if (StringUtil.isAscii(htmlEscaped)) {
184             return htmlEscaped;
185         } else { 
186             return StringUtil.escapeToPureHtmlUnicode(plainText);
187         }
188
189     }
190
191
192
193     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
194         public AppendCallback(WholeSectionToHtmlParser parser) {
195             super(parser);
196         }
197
198         @Override
199         public void onPlainText(String plainText) {
200             super.onPlainText(escapeHtmlLiteral(plainText));
201         }
202
203         @Override
204         public void onWikiLink(WikiTokenizer wikiTokenizer) {
205             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
206                 // Skips wikilinks like: [[en::dick]]
207                 return;
208             }
209             if (langConfig.skipWikiLink(wikiTokenizer)) {
210                 return;
211             }
212             String linkDest;
213             if (wikiTokenizer.wikiLinkDest() != null) {
214                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
215             } else {
216                 linkDest = wikiTokenizer.wikiLinkText();
217             }
218             if (sectionEntryTypeName != null) {
219                 // TODO: inside a definition, this could be the wrong language.
220                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
221             }
222             if (linkDest != null) {
223                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
224                 super.onWikiLink(wikiTokenizer);
225                 builder.append(String.format("</a>"));
226             } else {
227                 super.onWikiLink(wikiTokenizer);
228             }
229         }
230
231         @Override
232         public void onFunction(WikiTokenizer wikiTokenizer, String name,
233                 List<String> args, Map<String, String> namedArgs) {
234             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
235                 namedArgs.remove("lang");
236             }
237             super.onFunction(wikiTokenizer, name, args, namedArgs);
238         }
239
240         @Override
241         public void onHtml(WikiTokenizer wikiTokenizer) {
242             super.onHtml(wikiTokenizer);
243         }
244
245         @Override
246         public void onNewline(WikiTokenizer wikiTokenizer) {
247         }
248         
249         EntryTypeName sectionEntryTypeName;
250         IndexBuilder currentIndexBuilder;
251
252         @Override
253         public void onHeading(WikiTokenizer wikiTokenizer) {
254             final String headingText = wikiTokenizer.headingWikiText();
255             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
256             final int depth = wikiTokenizer.headingDepth();
257             if (langConfig.skipSection(headingText)) {
258                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
259                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
260                         wikiTokenizer.returnToLineStart();
261                         return;
262                     }
263                 }
264                 return;
265             }
266             builder.append(String.format("\n<h%d>", depth));
267             dispatch(headingText, null);
268             builder.append(String.format("</h%d>\n", depth));
269         }
270
271         final List<Character> listPrefixStack = new ArrayList<Character>();
272
273         @Override
274         public void onListItem(WikiTokenizer wikiTokenizer) {
275             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
276                 builder.append("\n");
277             }
278             final String prefix = wikiTokenizer.listItemPrefix();
279             while (listPrefixStack.size() < prefix.length()) {
280                 builder.append(String.format("<%s>",
281                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
282                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
283             }
284             builder.append("<li>");
285             dispatch(wikiTokenizer.listItemWikiText(), null);
286             builder.append("</li>\n");
287
288             WikiTokenizer nextToken = wikiTokenizer.nextToken();
289             boolean returnToLineStart = false;
290             if (nextToken != null && nextToken.isNewline()) {
291                 nextToken = nextToken.nextToken();
292                 returnToLineStart = true;
293             }
294             final String nextListHeader;
295             if (nextToken == null || !nextToken.isListItem()) {
296                 nextListHeader = "";
297             } else {
298                 nextListHeader = nextToken.listItemPrefix();
299             }
300             if (returnToLineStart) {
301                 wikiTokenizer.returnToLineStart();
302             }
303             while (listPrefixStack.size() > nextListHeader.length()) {
304                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
305                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
306             }
307         }
308
309         boolean boldOn = false;
310         boolean italicOn = false;
311
312         @Override
313         public void onMarkup(WikiTokenizer wikiTokenizer) {
314             if ("'''".equals(wikiTokenizer.token())) {
315                 if (!boldOn) {
316                     builder.append("<b>");
317                 } else {
318                     builder.append("</b>");
319                 }
320                 boldOn = !boldOn;
321             } else if ("''".equals(wikiTokenizer.token())) {
322                 if (!italicOn) {
323                     builder.append("<em>");
324                 } else {
325                     builder.append("</em>");
326                 }
327                 italicOn = !italicOn;
328             } else {
329                 assert false;
330             }
331         }
332
333     }
334
335 }