gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Synonyms, antonyms.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11
12 import org.apache.commons.lang3.StringEscapeUtils;
13
14 import java.util.ArrayList;
15 import java.util.LinkedHashMap;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.regex.Pattern;
19
20 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
21
22     public static final String NAME = "WholeSectionToHtmlParser";
23
24     interface LangConfig {
25         boolean skipSection(final String name);
26         EntryTypeName sectionNameToEntryType(String sectionName);
27         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
28         String adjustWikiLink(String wikiLinkDest);
29         void addFunctionCallbacks(
30                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
31     }
32     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
33     static {
34         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
35         isoToLangConfig.put("EN", new LangConfig() {
36             @Override
37             public boolean skipSection(String headingText) {
38                 return enSkipSections.matcher(headingText).matches();
39             }
40             
41             @Override
42             public EntryTypeName sectionNameToEntryType(String sectionName) {
43                 if (sectionName.equalsIgnoreCase("Synonyms")) {
44                     return EntryTypeName.SYNONYM_MULTI;
45                 }
46                 if (sectionName.equalsIgnoreCase("Antonyms")) {
47                     return EntryTypeName.ANTONYM_MULTI;
48                 }
49                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
50                     // We need to put it in the other index, too.
51                     return null;
52                 }
53                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
54                     return null;
55                 }
56                 return null;
57             }
58
59             @Override
60             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
61                 final String wikiText = wikiTokenizer.wikiLinkText();
62                 if (wikiText.startsWith("Category:")) {
63                     return true;
64                 }
65                 return false;
66             }
67             @Override
68             public String adjustWikiLink(String wikiLinkDest) {
69                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
70                     return null;
71                 }
72                 return wikiLinkDest;
73             }
74
75             @Override
76             public void addFunctionCallbacks(
77                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
78                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
79             }});
80         
81         final LangConfig basicLangConfig = new LangConfig() {
82             @Override
83             public boolean skipSection(String headingText) {
84                 return false;
85             }
86             @Override
87             public EntryTypeName sectionNameToEntryType(String sectionName) {
88                 return EntryTypeName.WIKTIONARY_MENTIONED;
89             }
90             @Override
91             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
92                 final String wikiText = wikiTokenizer.wikiLinkText();
93                 if (wikiText.startsWith("Category:")) {
94                     return true;
95                 }
96                 return false;
97             }
98             @Override
99             public String adjustWikiLink(String wikiLinkDest) {
100                 return wikiLinkDest;
101             }
102
103             @Override
104             public void addFunctionCallbacks(
105                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
106             }
107         };
108         isoToLangConfig.put("FR", basicLangConfig);
109         isoToLangConfig.put("DE", basicLangConfig);
110         isoToLangConfig.put("IT", basicLangConfig);
111     }
112
113     final IndexBuilder titleIndexBuilder;
114     final String skipLangIso;
115     final LangConfig langConfig;
116
117     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
118         this.titleIndexBuilder = titleIndexBuilder;
119         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
120         this.langConfig = isoToLangConfig.get(wiktionaryIso);
121         this.skipLangIso = skipLangIso;
122     }
123     
124     IndexedEntry indexedEntry = null;
125
126     @Override
127     public void parseSection(String heading, String text) {
128         assert entrySource != null;
129         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
130         indexedEntry = new IndexedEntry(htmlEntry);
131
132         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
133                 this);
134         langConfig.addFunctionCallbacks(callback.functionCallbacks);
135
136         callback.builder = new StringBuilder();
137         callback.indexedEntry = indexedEntry;
138         callback.dispatch(text, null);
139
140         htmlEntry.html = callback.builder.toString();
141         indexedEntry.isValid = true;
142
143         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
144
145         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
146         tokenData.htmlEntries.add(htmlEntry);
147         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
148         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
149         
150         indexedEntry = null;
151     }
152
153     @Override
154     void removeUselessArgs(Map<String, String> namedArgs) {
155     }
156     
157     @Override
158     public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
159         titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
160     }
161
162
163
164     static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
165
166     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
167         public AppendCallback(WholeSectionToHtmlParser parser) {
168             super(parser);
169         }
170
171         @Override
172         public void onPlainText(String plainText) {
173             final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
174             if (ALL_ASCII.matcher(htmlEscaped).matches()) {
175                 super.onPlainText(htmlEscaped);
176             } else { 
177                 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
178             }
179         }
180
181         @Override
182         public void onWikiLink(WikiTokenizer wikiTokenizer) {
183             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
184                 // Skips wikilinks like: [[en::dick]]
185                 return;
186             }
187             if (langConfig.skipWikiLink(wikiTokenizer)) {
188                 return;
189             }
190             String linkDest;
191             if (wikiTokenizer.wikiLinkDest() != null) {
192                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
193             } else {
194                 linkDest = wikiTokenizer.wikiLinkText();
195             }
196             if (sectionEntryTypeName != null) {
197                 // TODO: inside a definition, this could be the wrong language.
198                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
199             }
200             if (linkDest != null) {
201                 builder.append(String.format("<a href=\"%s\">", linkDest));
202                 super.onWikiLink(wikiTokenizer);
203                 builder.append(String.format("</a>"));
204             } else {
205                 super.onWikiLink(wikiTokenizer);
206             }
207         }
208
209         @Override
210         public void onFunction(WikiTokenizer wikiTokenizer, String name,
211                 List<String> args, Map<String, String> namedArgs) {
212             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
213                 namedArgs.remove("lang");
214             }
215             super.onFunction(wikiTokenizer, name, args, namedArgs);
216         }
217
218         @Override
219         public void onHtml(WikiTokenizer wikiTokenizer) {
220             super.onHtml(wikiTokenizer);
221         }
222
223         @Override
224         public void onNewline(WikiTokenizer wikiTokenizer) {
225         }
226         
227         EntryTypeName sectionEntryTypeName;
228
229         @Override
230         public void onHeading(WikiTokenizer wikiTokenizer) {
231             final String headingText = wikiTokenizer.headingWikiText();
232             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
233             final int depth = wikiTokenizer.headingDepth();
234             if (langConfig.skipSection(headingText)) {
235                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
236                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
237                         wikiTokenizer.returnToLineStart();
238                         return;
239                     }
240                 }
241                 return;
242             }
243             builder.append(String.format("\n<h%d>", depth));
244             dispatch(headingText, null);
245             builder.append(String.format("</h%d>\n", depth));
246         }
247
248         final List<Character> listPrefixStack = new ArrayList<Character>();
249
250         @Override
251         public void onListItem(WikiTokenizer wikiTokenizer) {
252             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
253                 builder.append("\n");
254             }
255             final String prefix = wikiTokenizer.listItemPrefix();
256             while (listPrefixStack.size() < prefix.length()) {
257                 builder.append(String.format("<%s>",
258                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
259                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
260             }
261             builder.append("<li>");
262             dispatch(wikiTokenizer.listItemWikiText(), null);
263             builder.append("</li>\n");
264
265             WikiTokenizer nextToken = wikiTokenizer.nextToken();
266             boolean returnToLineStart = false;
267             if (nextToken != null && nextToken.isNewline()) {
268                 nextToken = nextToken.nextToken();
269                 returnToLineStart = true;
270             }
271             final String nextListHeader;
272             if (nextToken == null || !nextToken.isListItem()) {
273                 nextListHeader = "";
274             } else {
275                 nextListHeader = nextToken.listItemPrefix();
276             }
277             if (returnToLineStart) {
278                 wikiTokenizer.returnToLineStart();
279             }
280             while (listPrefixStack.size() > nextListHeader.length()) {
281                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
282                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
283             }
284         }
285
286         boolean boldOn = false;
287         boolean italicOn = false;
288
289         @Override
290         public void onMarkup(WikiTokenizer wikiTokenizer) {
291             if ("'''".equals(wikiTokenizer.token())) {
292                 if (!boldOn) {
293                     builder.append("<b>");
294                 } else {
295                     builder.append("</b>");
296                 }
297                 boldOn = !boldOn;
298             } else if ("''".equals(wikiTokenizer.token())) {
299                 if (!italicOn) {
300                     builder.append("<em>");
301                 } else {
302                     builder.append("</em>");
303                 }
304                 italicOn = !italicOn;
305             } else {
306                 assert false;
307             }
308         }
309
310     }
311
312 }