gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
it-conj (most of the way), unicode handling in strings.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
9 import com.hughes.util.StringUtil;
10
11 import org.apache.commons.lang3.StringEscapeUtils;
12 import org.apache.commons.lang3.StringUtils;
13
14 import java.util.ArrayList;
15 import java.util.LinkedHashMap;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.regex.Pattern;
19
20 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
21
22     public static final String NAME = "WholeSectionToHtmlParser";
23
24     interface LangConfig {
25         boolean skipSection(final String name);
26         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
27         String adjustWikiLink(String wikiLinkDest);
28         void addFunctionCallbacks(
29                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
30     }
31     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
32     static {
33         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
34         isoToLangConfig.put("EN", new LangConfig() {
35             @Override
36             public boolean skipSection(String headingText) {
37                 return enSkipSections.matcher(headingText).matches();
38             }
39
40             @Override
41             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
42                 final String wikiText = wikiTokenizer.wikiLinkText();
43                 if (wikiText.startsWith("Category:")) {
44                     return true;
45                 }
46                 return false;
47             }
48             @Override
49             public String adjustWikiLink(String wikiLinkDest) {
50                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
51                     return null;
52                 }
53                 return wikiLinkDest;
54             }
55
56             @Override
57             public void addFunctionCallbacks(
58                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
59                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
60             }});
61     }
62
63     final IndexBuilder titleIndexBuilder;
64     final String skipLangIso;
65     final LangConfig langConfig;
66
67     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
68         this.titleIndexBuilder = titleIndexBuilder;
69         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
70         this.langConfig = isoToLangConfig.get(wiktionaryIso);
71         this.skipLangIso = skipLangIso;
72     }
73
74     @Override
75     void parseSection(String heading, String text) {
76         HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
77         IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
78
79         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
80                 this);
81         langConfig.addFunctionCallbacks(callback.functionCallbacks);
82
83         callback.builder = new StringBuilder();
84         callback.indexedEntry = indexedEntry;
85         callback.dispatch(text, null);
86
87         htmlEntry.html = callback.builder.toString();
88         indexedEntry.isValid = true;
89
90         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
91
92         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
93         tokenData.htmlEntries.add(htmlEntry);
94         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
95         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
96     }
97
98     @Override
99     void removeUselessArgs(Map<String, String> namedArgs) {
100     }
101     
102     static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
103
104     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
105         public AppendCallback(WholeSectionToHtmlParser parser) {
106             super(parser);
107         }
108
109         @Override
110         public void onPlainText(String plainText) {
111             final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
112             if (ALL_ASCII.matcher(htmlEscaped).matches()) {
113                 super.onPlainText(htmlEscaped);
114             } else { 
115                 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
116             }
117         }
118
119         @Override
120         public void onWikiLink(WikiTokenizer wikiTokenizer) {
121             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
122                 // Skips wikilinks like: [[en::dick]]
123                 return;
124             }
125             if (langConfig.skipWikiLink(wikiTokenizer)) {
126                 return;
127             }
128             String linkDest;
129             if (wikiTokenizer.wikiLinkDest() != null) {
130                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
131             } else {
132                 linkDest = wikiTokenizer.wikiLinkText();
133             }
134             if (linkDest != null) {
135                 builder.append(String.format("<a href=\"%s\">", linkDest));
136                 super.onWikiLink(wikiTokenizer);
137                 builder.append(String.format("</a>"));
138             } else {
139                 super.onWikiLink(wikiTokenizer);
140             }
141         }
142
143         @Override
144         public void onFunction(WikiTokenizer wikiTokenizer, String name,
145                 List<String> args, Map<String, String> namedArgs) {
146             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
147                 namedArgs.remove("lang");
148             }
149             super.onFunction(wikiTokenizer, name, args, namedArgs);
150         }
151
152         @Override
153         public void onHtml(WikiTokenizer wikiTokenizer) {
154             super.onHtml(wikiTokenizer);
155         }
156
157         @Override
158         public void onNewline(WikiTokenizer wikiTokenizer) {
159         }
160
161         @Override
162         public void onHeading(WikiTokenizer wikiTokenizer) {
163             final String headingText = wikiTokenizer.headingWikiText();
164             final int depth = wikiTokenizer.headingDepth();
165             if (langConfig.skipSection(headingText)) {
166                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
167                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
168                         wikiTokenizer.returnToLineStart();
169                         return;
170                     }
171                 }
172                 return;
173             }
174             builder.append(String.format("\n<h%d>", depth));
175             dispatch(headingText, null);
176             builder.append(String.format("</h%d>\n", depth));
177         }
178
179         final List<Character> listPrefixStack = new ArrayList<Character>();
180
181         @Override
182         public void onListItem(WikiTokenizer wikiTokenizer) {
183             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
184                 builder.append("\n");
185             }
186             final String prefix = wikiTokenizer.listItemPrefix();
187             while (listPrefixStack.size() < prefix.length()) {
188                 builder.append(String.format("<%s>",
189                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
190                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
191             }
192             builder.append("<li>");
193             dispatch(wikiTokenizer.listItemWikiText(), null);
194             builder.append("</li>\n");
195
196             WikiTokenizer nextToken = wikiTokenizer.nextToken();
197             boolean returnToLineStart = false;
198             if (nextToken != null && nextToken.isNewline()) {
199                 nextToken = nextToken.nextToken();
200                 returnToLineStart = true;
201             }
202             final String nextListHeader;
203             if (nextToken == null || !nextToken.isListItem()) {
204                 nextListHeader = "";
205             } else {
206                 nextListHeader = nextToken.listItemPrefix();
207             }
208             if (returnToLineStart) {
209                 wikiTokenizer.returnToLineStart();
210             }
211             while (listPrefixStack.size() > nextListHeader.length()) {
212                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
213                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
214             }
215         }
216
217         boolean boldOn = false;
218         boolean italicOn = false;
219
220         @Override
221         public void onMarkup(WikiTokenizer wikiTokenizer) {
222             if ("'''".equals(wikiTokenizer.token())) {
223                 if (!boldOn) {
224                     builder.append("<b>");
225                 } else {
226                     builder.append("</b>");
227                 }
228                 boldOn = !boldOn;
229             } else if ("''".equals(wikiTokenizer.token())) {
230                 if (!italicOn) {
231                     builder.append("<em>");
232                 } else {
233                     builder.append("</em>");
234                 }
235                 italicOn = !italicOn;
236             } else {
237                 assert false;
238             }
239         }
240
241     }
242
243 }