gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
it-noun.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11
12 import org.apache.commons.lang3.StringEscapeUtils;
13
14 import java.util.ArrayList;
15 import java.util.LinkedHashMap;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.regex.Pattern;
19
20 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
21
22     public static final String NAME = "WholeSectionToHtmlParser";
23
24     interface LangConfig {
25         boolean skipSection(final String name);
26         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
27         String adjustWikiLink(String wikiLinkDest);
28         void addFunctionCallbacks(
29                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
30     }
31     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
32     static {
33         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
34         isoToLangConfig.put("EN", new LangConfig() {
35             @Override
36             public boolean skipSection(String headingText) {
37                 return enSkipSections.matcher(headingText).matches();
38             }
39
40             @Override
41             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
42                 final String wikiText = wikiTokenizer.wikiLinkText();
43                 if (wikiText.startsWith("Category:")) {
44                     return true;
45                 }
46                 return false;
47             }
48             @Override
49             public String adjustWikiLink(String wikiLinkDest) {
50                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
51                     return null;
52                 }
53                 return wikiLinkDest;
54             }
55
56             @Override
57             public void addFunctionCallbacks(
58                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
59                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
60             }});
61         
62         final LangConfig basicLangConfig = new LangConfig() {
63             @Override
64             public boolean skipSection(String headingText) {
65                 return false;
66             }
67
68             @Override
69             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
70                 final String wikiText = wikiTokenizer.wikiLinkText();
71                 if (wikiText.startsWith("Category:")) {
72                     return true;
73                 }
74                 return false;
75             }
76             @Override
77             public String adjustWikiLink(String wikiLinkDest) {
78                 return wikiLinkDest;
79             }
80
81             @Override
82             public void addFunctionCallbacks(
83                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
84             }
85         };
86         isoToLangConfig.put("FR", basicLangConfig);
87         isoToLangConfig.put("DE", basicLangConfig);
88         isoToLangConfig.put("IT", basicLangConfig);
89     }
90
91     final IndexBuilder titleIndexBuilder;
92     final String skipLangIso;
93     final LangConfig langConfig;
94
95     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
96         this.titleIndexBuilder = titleIndexBuilder;
97         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
98         this.langConfig = isoToLangConfig.get(wiktionaryIso);
99         this.skipLangIso = skipLangIso;
100     }
101     
102     IndexedEntry indexedEntry = null;
103
104     @Override
105     public void parseSection(String heading, String text) {
106         assert entrySource != null;
107         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
108         indexedEntry = new IndexedEntry(htmlEntry);
109
110         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
111                 this);
112         langConfig.addFunctionCallbacks(callback.functionCallbacks);
113
114         callback.builder = new StringBuilder();
115         callback.indexedEntry = indexedEntry;
116         callback.dispatch(text, null);
117
118         htmlEntry.html = callback.builder.toString();
119         indexedEntry.isValid = true;
120
121         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
122
123         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
124         tokenData.htmlEntries.add(htmlEntry);
125         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
126         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
127         
128         indexedEntry = null;
129     }
130
131     @Override
132     void removeUselessArgs(Map<String, String> namedArgs) {
133     }
134     
135     @Override
136     public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
137         titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
138     }
139
140
141
142     static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
143
144     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
145         public AppendCallback(WholeSectionToHtmlParser parser) {
146             super(parser);
147         }
148
149         @Override
150         public void onPlainText(String plainText) {
151             final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
152             if (ALL_ASCII.matcher(htmlEscaped).matches()) {
153                 super.onPlainText(htmlEscaped);
154             } else { 
155                 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
156             }
157         }
158
159         @Override
160         public void onWikiLink(WikiTokenizer wikiTokenizer) {
161             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
162                 // Skips wikilinks like: [[en::dick]]
163                 return;
164             }
165             if (langConfig.skipWikiLink(wikiTokenizer)) {
166                 return;
167             }
168             String linkDest;
169             if (wikiTokenizer.wikiLinkDest() != null) {
170                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
171             } else {
172                 linkDest = wikiTokenizer.wikiLinkText();
173             }
174             if (linkDest != null) {
175                 builder.append(String.format("<a href=\"%s\">", linkDest));
176                 super.onWikiLink(wikiTokenizer);
177                 builder.append(String.format("</a>"));
178             } else {
179                 super.onWikiLink(wikiTokenizer);
180             }
181         }
182
183         @Override
184         public void onFunction(WikiTokenizer wikiTokenizer, String name,
185                 List<String> args, Map<String, String> namedArgs) {
186             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
187                 namedArgs.remove("lang");
188             }
189             super.onFunction(wikiTokenizer, name, args, namedArgs);
190         }
191
192         @Override
193         public void onHtml(WikiTokenizer wikiTokenizer) {
194             super.onHtml(wikiTokenizer);
195         }
196
197         @Override
198         public void onNewline(WikiTokenizer wikiTokenizer) {
199         }
200
201         @Override
202         public void onHeading(WikiTokenizer wikiTokenizer) {
203             final String headingText = wikiTokenizer.headingWikiText();
204             final int depth = wikiTokenizer.headingDepth();
205             if (langConfig.skipSection(headingText)) {
206                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
207                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
208                         wikiTokenizer.returnToLineStart();
209                         return;
210                     }
211                 }
212                 return;
213             }
214             builder.append(String.format("\n<h%d>", depth));
215             dispatch(headingText, null);
216             builder.append(String.format("</h%d>\n", depth));
217         }
218
219         final List<Character> listPrefixStack = new ArrayList<Character>();
220
221         @Override
222         public void onListItem(WikiTokenizer wikiTokenizer) {
223             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
224                 builder.append("\n");
225             }
226             final String prefix = wikiTokenizer.listItemPrefix();
227             while (listPrefixStack.size() < prefix.length()) {
228                 builder.append(String.format("<%s>",
229                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
230                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
231             }
232             builder.append("<li>");
233             dispatch(wikiTokenizer.listItemWikiText(), null);
234             builder.append("</li>\n");
235
236             WikiTokenizer nextToken = wikiTokenizer.nextToken();
237             boolean returnToLineStart = false;
238             if (nextToken != null && nextToken.isNewline()) {
239                 nextToken = nextToken.nextToken();
240                 returnToLineStart = true;
241             }
242             final String nextListHeader;
243             if (nextToken == null || !nextToken.isListItem()) {
244                 nextListHeader = "";
245             } else {
246                 nextListHeader = nextToken.listItemPrefix();
247             }
248             if (returnToLineStart) {
249                 wikiTokenizer.returnToLineStart();
250             }
251             while (listPrefixStack.size() > nextListHeader.length()) {
252                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
253                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
254             }
255         }
256
257         boolean boldOn = false;
258         boolean italicOn = false;
259
260         @Override
261         public void onMarkup(WikiTokenizer wikiTokenizer) {
262             if ("'''".equals(wikiTokenizer.token())) {
263                 if (!boldOn) {
264                     builder.append("<b>");
265                 } else {
266                     builder.append("</b>");
267                 }
268                 boldOn = !boldOn;
269             } else if ("''".equals(wikiTokenizer.token())) {
270                 if (!italicOn) {
271                     builder.append("<em>");
272                 } else {
273                     builder.append("</em>");
274                 }
275                 italicOn = !italicOn;
276             } else {
277                 assert false;
278             }
279         }
280
281     }
282
283 }