2 package com.hughes.android.dictionary.parser.wiktionary;
4 import com.hughes.android.dictionary.HtmlDisplayActivity;
5 import com.hughes.android.dictionary.engine.EntryTypeName;
6 import com.hughes.android.dictionary.engine.HtmlEntry;
7 import com.hughes.android.dictionary.engine.IndexBuilder;
8 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
9 import com.hughes.android.dictionary.engine.IndexedEntry;
10 import com.hughes.android.dictionary.parser.WikiTokenizer;
11 import com.hughes.util.StringUtil;
13 import org.apache.commons.lang3.StringEscapeUtils;
15 import java.util.ArrayList;
16 import java.util.LinkedHashMap;
17 import java.util.List;
19 import java.util.regex.Pattern;
21 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
// Public name constant for this parser; its value is the simple class name.
23 public static final String NAME = "WholeSectionToHtmlParser";
25 interface LangConfig {
26 boolean skipSection(final String name);
27 EntryTypeName sectionNameToEntryType(String sectionName);
28 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
29 String adjustWikiLink(String wikiLinkDest, final String wikiLinkText);
30 void addFunctionCallbacks(
31 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
// Registry of language configurations keyed by ISO code (e.g. "EN"); the constructor
// looks its wiktionaryIso argument up in this map.
33 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
// Headings to skip for English. It is applied with Matcher.matches(), which must cover the
// whole heading, and '|' binds loosest, so this accepts: any heading ending in
// "Translations", the exact heading "Anagrams", or any heading starting with "References".
// NOTE(review): if "contains any of the three words" was intended, the alternation needs
// grouping, i.e. ".*(Translations|Anagrams|References).*" - confirm before changing.
35 final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
// English configuration.
36 isoToLangConfig.put("EN", new LangConfig() {
// A section is skipped when its complete heading text matches enSkipSections.
38 public boolean skipSection(String headingText) {
39 return enSkipSections.matcher(headingText).matches();
// Maps English headings to entry types; heading comparisons ignore case.
43 public EntryTypeName sectionNameToEntryType(String sectionName) {
// "Synonyms" sections index their links as SYNONYM_MULTI.
44 if (sectionName.equalsIgnoreCase("Synonyms")) {
45 return EntryTypeName.SYNONYM_MULTI;
// "Antonyms" sections index their links as ANTONYM_MULTI.
47 if (sectionName.equalsIgnoreCase("Antonyms")) {
48 return EntryTypeName.ANTONYM_MULTI;
// Part-of-speech headings are recognised via the pattern owned by EnParser.
50 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
51 // We need to put it in the other index, too.
54 if (sectionName.equalsIgnoreCase("Derived Terms")) {
// Link filter: inspects the link text for a "Category:" prefix.
// NOTE(review): presumably such links are the ones skipped - confirm.
61 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
62 final String wikiText = wikiTokenizer.wikiLinkText();
63 if (wikiText.startsWith("Category:")) {
// Normalises an explicit link destination before it is used as an anchor target.
69 public String adjustWikiLink(String wikiLinkDest, String wikiLinkText) {
// Destinations starting with "w:" or "Image:" are special-cased.
// NOTE(review): presumably they yield no link target - confirm.
70 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
// Keep only the part of the destination that precedes '#'.
73 final int hashPos = wikiLinkDest.indexOf("#");
75 wikiLinkDest = wikiLinkDest.substring(0, hashPos);
// Nothing left before the '#': fall back to the link's display text.
76 if (wikiLinkDest.isEmpty()) {
77 wikiLinkDest = wikiLinkText;
// Registers the generic template handlers supplied by EnFunctionCallbacks.
84 public void addFunctionCallbacks(
85 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
86 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
// Shared configuration for the languages registered at the bottom of this block.
90 final LangConfig basicLangConfig = new LangConfig() {
92 public boolean skipSection(String headingText) {
// Every section maps to WIKTIONARY_MENTIONED, whatever its heading.
96 public EntryTypeName sectionNameToEntryType(String sectionName) {
97 return EntryTypeName.WIKTIONARY_MENTIONED;
// Link filter: inspects the link text for a "Category:" prefix.
// NOTE(review): presumably such links are the ones skipped - confirm.
100 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
101 final String wikiText = wikiTokenizer.wikiLinkText();
102 if (wikiText.startsWith("Category:")) {
108 public String adjustWikiLink(String wikiLinkDest, final String wikiLinkText) {
113 public void addFunctionCallbacks(
114 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
// French, German and Italian all use the same basic configuration instance.
117 isoToLangConfig.put("FR", basicLangConfig);
118 isoToLangConfig.put("DE", basicLangConfig);
119 isoToLangConfig.put("IT", basicLangConfig);
// Index that receives the HTML entries and the wiki-link tokens produced by this parser.
122 final IndexBuilder titleIndexBuilder;
// Second index builder handed to the constructor.
123 final IndexBuilder defIndexBuilder;
// Language code compared, ignoring case, with a template's "lang" argument in onFunction;
// a matching argument is removed before the template is processed.
124 final String skipLangIso;
// Language-specific hooks, looked up in isoToLangConfig by the constructor.
125 final LangConfig langConfig;
128 public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final IndexBuilder defIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
129 this.titleIndexBuilder = titleIndexBuilder;
130 this.defIndexBuilder = defIndexBuilder;
131 assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
132 this.langConfig = isoToLangConfig.get(wiktionaryIso);
133 this.skipLangIso = skipLangIso;
// Entry currently being built: replaced at the start of every parseSection call and used as
// the target whenever a token is added to the title index.
136 IndexedEntry indexedEntry = null;
// Renders one section's wiki text to HTML and files the result under the current title.
// The heading argument is not read by the statements below; text is the wiki markup that
// gets dispatched.
139 public void parseSection(String heading, String text) {
140 assert entrySource != null;
141 final HtmlEntry htmlEntry = new HtmlEntry(entrySource, title);
142 indexedEntry = new IndexedEntry(htmlEntry);
// Create the rendering callback, then let the language configuration register its
// template handlers on it.
144 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
146 langConfig.addFunctionCallbacks(callback.functionCallbacks);
// Run the wiki text through the callback; the HTML accumulates in callback.builder.
148 callback.builder = new StringBuilder();
149 callback.indexedEntry = indexedEntry;
150 callback.dispatch(text, null);
// Store the rendered HTML on the entry and mark the indexed entry as valid.
152 htmlEntry.html = callback.builder.toString();
153 indexedEntry.isValid = true;
// Attach the finished entry to the dictionary and to the token data for this title.
155 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
157 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
158 tokenData.htmlEntries.add(htmlEntry);
159 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
160 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
166 void removeUselessArgs(Map<String, String> namedArgs) {
170 public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
171 titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
176 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
177 public AppendCallback(WholeSectionToHtmlParser parser) {
182 public void onPlainText(String plainText) {
183 final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
184 if (StringUtil.isAscii(htmlEscaped)) {
185 super.onPlainText(htmlEscaped);
187 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
// Handles a wiki link: filters unwanted links, indexes the link text when the current
// section has an entry type, and wraps the rendered link in an anchor when a destination
// is available.
192 public void onWikiLink(WikiTokenizer wikiTokenizer) {
// Links whose text ends with ":" followed by the current title are special-cased.
193 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
194 // Skips wikilinks like: [[en::dick]]
// Language-specific link filter.
197 if (langConfig.skipWikiLink(wikiTokenizer)) {
// Pick the destination: an explicit one goes through the language's adjustWikiLink,
// otherwise the link text itself is used.
201 if (wikiTokenizer.wikiLinkDest() != null) {
202 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest(), wikiTokenizer.wikiLinkText());
204 linkDest = wikiTokenizer.wikiLinkText();
// Index the link text under the entry type of the enclosing section, if there is one.
206 if (sectionEntryTypeName != null) {
207 // TODO: inside a definition, this could be the wrong language.
208 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
// With a destination, surround whatever the superclass renders with <a href=...> and </a>.
210 if (linkDest != null) {
211 builder.append(String.format("<a href=\"%s\">", HtmlEntry.formatQuickdicUrl("", linkDest)));
212 super.onWikiLink(wikiTokenizer);
213 builder.append(String.format("</a>"));
215 super.onWikiLink(wikiTokenizer);
220 public void onFunction(WikiTokenizer wikiTokenizer, String name,
221 List<String> args, Map<String, String> namedArgs) {
222 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
223 namedArgs.remove("lang");
225 super.onFunction(wikiTokenizer, name, args, namedArgs);
229 public void onHtml(WikiTokenizer wikiTokenizer) {
230 super.onHtml(wikiTokenizer);
234 public void onNewline(WikiTokenizer wikiTokenizer) {
// Entry type for the section being rendered; assigned in onHeading from the language
// configuration. onWikiLink indexes link text only while this is non-null.
237 EntryTypeName sectionEntryTypeName;
// NOTE(review): no read or write of this field appears in the nearby callbacks - confirm
// whether it is still needed.
238 IndexBuilder currentIndexBuilder;
// Handles a heading token: records the entry type for the section it opens, then either
// moves past a skipped section or emits the heading as an <hN> element.
241 public void onHeading(WikiTokenizer wikiTokenizer) {
242 final String headingText = wikiTokenizer.headingWikiText();
243 sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
244 final int depth = wikiTokenizer.headingDepth();
// Skipped section: advance token by token (re-assigning the parameter) until a heading
// whose depth is <= this heading's depth appears, then rewind to the start of that line.
// NOTE(review): presumably so that heading is processed again by the caller - confirm.
245 if (langConfig.skipSection(headingText)) {
246 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
247 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
248 wikiTokenizer.returnToLineStart();
// Kept section: the heading text is itself dispatched, so wiki markup inside the heading
// is rendered between the opening and closing tag.
254 builder.append(String.format("\n<h%d>", depth));
255 dispatch(headingText, null);
256 builder.append(String.format("</h%d>\n", depth));
// Stack of list-prefix characters for the list tags currently open in the output:
// onListItem pushes one character per tag it opens and pops one per tag it closes.
259 final List<Character> listPrefixStack = new ArrayList<Character>();
// Renders one wiki list item as <li>...</li>. List tags are opened while the item's prefix
// is longer than the stack of open lists, and closed again when the stack is longer than
// the prefix expected for the following item.
262 public void onListItem(WikiTokenizer wikiTokenizer) {
// Make sure the list markup starts on a fresh line of the output.
263 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
264 builder.append("\n");
266 final String prefix = wikiTokenizer.listItemPrefix();
// Open one list tag for every prefix character that is not yet on the stack; the tag name
// comes from WikiTokenizer.getListTag.
267 while (listPrefixStack.size() < prefix.length()) {
268 builder.append(String.format("<%s>",
269 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
270 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
// The item's own wiki text is dispatched, so markup inside the item is rendered as well.
272 builder.append("<li>");
273 dispatch(wikiTokenizer.listItemWikiText(), null);
274 builder.append("</li>\n");
// Look ahead, past at most one newline token, to find the prefix of the next list item.
276 WikiTokenizer nextToken = wikiTokenizer.nextToken();
277 boolean returnToLineStart = false;
278 if (nextToken != null && nextToken.isNewline()) {
279 nextToken = nextToken.nextToken();
280 returnToLineStart = true;
282 final String nextListHeader;
283 if (nextToken == null || !nextToken.isListItem()) {
286 nextListHeader = nextToken.listItemPrefix();
// A newline was looked past above: rewind the tokenizer to the start of the line.
288 if (returnToLineStart) {
289 wikiTokenizer.returnToLineStart();
// Close every open list that the next item's prefix no longer covers, innermost first.
291 while (listPrefixStack.size() > nextListHeader.length()) {
292 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
293 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
// Markup state used by onMarkup, which emits <b>/</b> for ''' tokens and <em>/</em> for ''
// tokens. italicOn is flipped on every '' token.
// NOTE(review): boldOn presumably mirrors that for ''' tokens - confirm.
297 boolean boldOn = false;
298 boolean italicOn = false;
301 public void onMarkup(WikiTokenizer wikiTokenizer) {
302 if ("'''".equals(wikiTokenizer.token())) {
304 builder.append("<b>");
306 builder.append("</b>");
309 } else if ("''".equals(wikiTokenizer.token())) {
311 builder.append("<em>");
313 builder.append("</em>");
315 italicOn = !italicOn;