gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Format links properly.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.HtmlDisplayActivity;
5 import com.hughes.android.dictionary.engine.EntryTypeName;
6 import com.hughes.android.dictionary.engine.HtmlEntry;
7 import com.hughes.android.dictionary.engine.IndexBuilder;
8 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
9 import com.hughes.android.dictionary.engine.IndexedEntry;
10 import com.hughes.android.dictionary.parser.WikiTokenizer;
11 import com.hughes.util.StringUtil;
12
13 import org.apache.commons.lang3.StringEscapeUtils;
14
15 import java.util.ArrayList;
16 import java.util.LinkedHashMap;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.regex.Pattern;
20
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
22
23     public static final String NAME = "WholeSectionToHtmlParser";
24
25     interface LangConfig {
26         boolean skipSection(final String name);
27         EntryTypeName sectionNameToEntryType(String sectionName);
28         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29         String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30         void addFunctionCallbacks(
31                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
32     }
33     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
34     static {
35         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
36         isoToLangConfig.put("EN", new LangConfig() {
37             @Override
38             public boolean skipSection(String headingText) {
39                 return enSkipSections.matcher(headingText).matches();
40             }
41             
42             @Override
43             public EntryTypeName sectionNameToEntryType(String sectionName) {
44                 if (sectionName.equalsIgnoreCase("Synonyms")) {
45                     return EntryTypeName.SYNONYM_MULTI;
46                 }
47                 if (sectionName.equalsIgnoreCase("Antonyms")) {
48                     return EntryTypeName.ANTONYM_MULTI;
49                 }
50                 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51                     // We need to put it in the other index, too.
52                     return null;
53                 }
54                 if (sectionName.equalsIgnoreCase("Derived Terms")) {
55                     return null;
56                 }
57                 return null;
58             }
59             
60             @Override
61             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62                 final String wikiText = wikiTokenizer.wikiLinkText();
63                 if (wikiText.startsWith("Category:")) {
64                     return true;
65                 }
66                 return false;
67             }
68             @Override
69             public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
70                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
71                     return null;
72                 }
73                 final int hashPos = wikiLinkDest.indexOf("#");
74                 if (hashPos != -1) {
75                     wikiLinkDest = wikiLinkDest.substring(0, hashPos);
76                     if (wikiLinkDest.isEmpty()) {
77                         wikiLinkDest = wikiLinkText;
78                     }
79                 }
80                 return wikiLinkDest;
81             }
82
83             @Override
84             public void addFunctionCallbacks(
85                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
87             }
88         });
89         
90         final LangConfig basicLangConfig = new LangConfig() {
91             @Override
92             public boolean skipSection(String headingText) {
93                 return false;
94             }
95             @Override
96             public EntryTypeName sectionNameToEntryType(String sectionName) {
97                 return EntryTypeName.WIKTIONARY_MENTIONED;
98             }
99             @Override
100             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
101                 final String wikiText = wikiTokenizer.wikiLinkText();
102                 if (wikiText.startsWith("Category:")) {
103                     return true;
104                 }
105                 return false;
106             }
107             @Override
108             public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) {
109                 return wikiLinkDest;
110             }
111
112             @Override
113             public void addFunctionCallbacks(
114                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
115             }
116         };
117         isoToLangConfig.put("FR", basicLangConfig);
118         isoToLangConfig.put("DE", basicLangConfig);
119         isoToLangConfig.put("IT", basicLangConfig);
120     }
121
122     final IndexBuilder titleIndexBuilder;
123     final IndexBuilder defIndexBuilder;
124     final String skipLangIso;
125     final LangConfig langConfig;
126     
127
128     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
129         this.titleIndexBuilder = titleIndexBuilder;
130         this.defIndexBuilder = defIndexBuilder;
131         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
132         this.langConfig = isoToLangConfig.get(wiktionaryIso);
133         this.skipLangIso = skipLangIso;
134     }
135     
136     IndexedEntry indexedEntry = null;
137
138     @Override
139     public void parseSection(String heading, String text) {
140         assert entrySource != null;
141         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
142         indexedEntry = new IndexedEntry(htmlEntry);
143
144         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
145                 this);
146         langConfig.addFunctionCallbacks(callback.functionCallbacks);
147
148         callback.builder = new StringBuilder();
149         callback.indexedEntry = indexedEntry;
150         callback.dispatch(text, null);
151
152         htmlEntry.html = callback.builder.toString();
153         indexedEntry.isValid = true;
154
155         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
156
157         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
158         tokenData.htmlEntries.add(htmlEntry);
159         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
160         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
161         
162         indexedEntry = null;
163     }
164
165     @Override
166     void removeUselessArgs(Map<String, String> namedArgs) {
167     }
168     
169     @Override
170     public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
171         titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
172     }
173
174
175
176     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
177         public AppendCallback(WholeSectionToHtmlParser parser) {
178             super(parser);
179         }
180
181         @Override
182         public void onPlainText(String plainText) {
183             final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
184             if (StringUtil.isAscii(htmlEscaped)) {
185                 super.onPlainText(htmlEscaped);
186             } else { 
187                 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
188             }
189         }
190
191         @Override
192         public void onWikiLink(WikiTokenizer wikiTokenizer) {
193             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
194                 // Skips wikilinks like: [[en::dick]]
195                 return;
196             }
197             if (langConfig.skipWikiLink(wikiTokenizer)) {
198                 return;
199             }
200             String linkDest;
201             if (wikiTokenizer.wikiLinkDest() != null) {
202                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
203             } else {
204                 linkDest = wikiTokenizer.wikiLinkText();
205             }
206             if (sectionEntryTypeName != null) {
207                 // TODO: inside a definition, this could be the wrong language.
208                 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
209             }
210             if (linkDest != null) {
211                 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
212                 super.onWikiLink(wikiTokenizer);
213                 builder.append(String.format("</a>"));
214             } else {
215                 super.onWikiLink(wikiTokenizer);
216             }
217         }
218
219         @Override
220         public void onFunction(WikiTokenizer wikiTokenizer, String name,
221                 List<String> args, Map<String, String> namedArgs) {
222             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
223                 namedArgs.remove("lang");
224             }
225             super.onFunction(wikiTokenizer, name, args, namedArgs);
226         }
227
228         @Override
229         public void onHtml(WikiTokenizer wikiTokenizer) {
230             super.onHtml(wikiTokenizer);
231         }
232
233         @Override
234         public void onNewline(WikiTokenizer wikiTokenizer) {
235         }
236         
237         EntryTypeName sectionEntryTypeName;
238         IndexBuilder currentIndexBuilder;
239
240         @Override
241         public void onHeading(WikiTokenizer wikiTokenizer) {
242             final String headingText = wikiTokenizer.headingWikiText();
243             sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
244             final int depth = wikiTokenizer.headingDepth();
245             if (langConfig.skipSection(headingText)) {
246                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
247                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
248                         wikiTokenizer.returnToLineStart();
249                         return;
250                     }
251                 }
252                 return;
253             }
254             builder.append(String.format("\n<h%d>", depth));
255             dispatch(headingText, null);
256             builder.append(String.format("</h%d>\n", depth));
257         }
258
259         final List<Character> listPrefixStack = new ArrayList<Character>();
260
261         @Override
262         public void onListItem(WikiTokenizer wikiTokenizer) {
263             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
264                 builder.append("\n");
265             }
266             final String prefix = wikiTokenizer.listItemPrefix();
267             while (listPrefixStack.size() < prefix.length()) {
268                 builder.append(String.format("<%s>",
269                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
270                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
271             }
272             builder.append("<li>");
273             dispatch(wikiTokenizer.listItemWikiText(), null);
274             builder.append("</li>\n");
275
276             WikiTokenizer nextToken = wikiTokenizer.nextToken();
277             boolean returnToLineStart = false;
278             if (nextToken != null && nextToken.isNewline()) {
279                 nextToken = nextToken.nextToken();
280                 returnToLineStart = true;
281             }
282             final String nextListHeader;
283             if (nextToken == null || !nextToken.isListItem()) {
284                 nextListHeader = "";
285             } else {
286                 nextListHeader = nextToken.listItemPrefix();
287             }
288             if (returnToLineStart) {
289                 wikiTokenizer.returnToLineStart();
290             }
291             while (listPrefixStack.size() > nextListHeader.length()) {
292                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
293                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
294             }
295         }
296
297         boolean boldOn = false;
298         boolean italicOn = false;
299
300         @Override
301         public void onMarkup(WikiTokenizer wikiTokenizer) {
302             if ("'''".equals(wikiTokenizer.token())) {
303                 if (!boldOn) {
304                     builder.append("<b>");
305                 } else {
306                     builder.append("</b>");
307                 }
308                 boldOn = !boldOn;
309             } else if ("''".equals(wikiTokenizer.token())) {
310                 if (!italicOn) {
311                     builder.append("<em>");
312                 } else {
313                     builder.append("</em>");
314                 }
315                 italicOn = !italicOn;
316             } else {
317                 assert false;
318             }
319         }
320
321     }
322
323 }