2 package com.hughes.android.dictionary.parser.wiktionary;
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
12 import org.apache.commons.lang3.StringEscapeUtils;
14 import java.util.ArrayList;
15 import java.util.LinkedHashMap;
16 import java.util.List;
18 import java.util.regex.Pattern;
20 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
/** String constant holding this parser's name; the text is the same as the class name. */
22 public static final String NAME = "WholeSectionToHtmlParser";
/**
 * Per-language hooks consulted while a section is converted to HTML.
 * Instances are stored in {@code isoToLangConfig}; the constructor looks one up by ISO code.
 */
24 interface LangConfig {
/** Returns whether the section with the given heading text should be skipped (consulted by onHeading). */
25 boolean skipSection(final String name);
/**
 * Maps a section heading to the entry type under which wiki links in that section are indexed.
 * onWikiLink only indexes when the stored result is non-null, so null presumably means
 * "do not index" -- TODO confirm against the implementations.
 */
26 EntryTypeName sectionNameToEntryType(String sectionName);
/** Returns whether the wiki link at the tokenizer's current position should be skipped (consulted by onWikiLink). */
27 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
/**
 * Transforms an explicit wiki link destination before it is used as an href.
 * onWikiLink only emits an anchor when the resulting destination is non-null.
 */
28 String adjustWikiLink(String wikiLinkDest);
/** Lets the language add its template/function handlers to the map owned by the section callback. */
29 void addFunctionCallbacks(
30 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
/**
 * Registry of {@link LangConfig} instances keyed by upper-case ISO code ("EN", "FR", "DE", "IT").
 * Backed by a LinkedHashMap, so iteration follows insertion order.
 */
32 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
// Headings of English sections to drop. It is applied with matches(), so the whole heading
// must match one alternative.
// NOTE(review): '|' binds loosest, so the alternatives are ".*Translations", exactly
// "Anagrams", and "References.*" -- not ".*(Translations|Anagrams|References).*".
// Confirm this asymmetry is intended.
34 final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
// Registers the English configuration under "EN".
35 isoToLangConfig.put("EN", new LangConfig() {
// A section is skipped when its full heading text matches enSkipSections.
37 public boolean skipSection(String heading
Text) {
38 return enSkipSections.matcher(headingText).matches();
// "Synonyms" and "Antonyms" (case-insensitive) map to their *_MULTI entry types.
// The results for part-of-speech headings, "Derived Terms" and the fall-through case
// are on lines missing from this extract.
42 public EntryTypeName sectionNameToEntryType(String sectionName) {
43 if (sectionName.equalsIgnoreCase("Synonyms")) {
44 return EntryTypeName.SYNONYM_MULTI;
46 if (sectionName.equalsIgnoreCase("Antonyms")) {
47 return EntryTypeName.ANTONYM_MULTI;
49 if (EnParser.partOfSpeechHeader.matcher(sectionName).matches()) {
50 // We need to put it in the other index, too.
53 if (sectionName.equalsIgnoreCase("Derived Terms")) {
// Tests whether the link text starts with "Category:"; the returned values are not
// visible here (presumably true for category links -- TODO confirm).
60 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
61 final String wikiText = wikiTokenizer.wikiLinkText();
62 if (wikiText.startsWith("Category:")) {
// Special-cases destinations starting with "w:" or "Image:"; what is returned for them
// and for ordinary destinations is not visible in this extract.
68 public String adjustWikiLink(String wikiLinkDest) {
69 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
// English delegates to the generic callbacks supplied by EnFunctionCallbacks.
76 public void addFunctionCallbacks(
77 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
78 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
// Shared configuration registered below for "FR", "DE" and "IT".
81 final LangConfig basicLangConfig = new LangConfig() {
// Body is not visible in this extract.
83 public boolean skipSection(String headingText) {
// Every section maps to WIKTIONARY_MENTIONED, regardless of the heading.
87 public EntryTypeName sectionNameToEntryType(String sectionName) {
88 return EntryTypeName.WIKTIONARY_MENTIONED;
// Tests whether the link text starts with "Category:"; the returned values are not
// visible here (presumably true for category links -- TODO confirm).
91 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
92 final String wikiText = wikiTokenizer.wikiLinkText();
93 if (wikiText.startsWith("Category:")) {
// Body is not visible in this extract.
99 public String adjustWikiLink(String wikiLinkDest) {
// Body is not visible in this extract; no callback registration is shown for this config.
104 public void addFunctionCallbacks(
105 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
// The same instance is shared by all three languages.
108 isoToLangConfig.put("FR", basicLangConfig);
109 isoToLangConfig.put("DE", basicLangConfig);
110 isoToLangConfig.put("IT", basicLangConfig);
/** Index builder that receives the HTML entry for each title and the tokens linked to it. */
113 final IndexBuilder titleIndexBuilder;
/** Language code compared (case-insensitively) with a template's "lang" argument in onFunction; on a match the argument is removed. */
114 final String skipLangIso;
/** Language-specific hooks, looked up from isoToLangConfig in the constructor. */
115 final LangConfig langConfig;
/**
 * Creates a parser that writes into the given index builder.
 *
 * @param titleIndexBuilder index builder that receives the generated entries
 * @param wiktionaryIso key into isoToLangConfig selecting the language hooks (registered keys are upper-case)
 * @param skipLangIso language code whose "lang" template argument onFunction strips
 */
117 public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
118 this.titleIndexBuilder = titleIndexBuilder;
// NOTE(review): this check only runs when assertions are enabled; otherwise an unknown
// ISO code leaves langConfig null (Map.get returns null) and the failure surfaces later.
119 assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
120 this.langConfig = isoToLangConfig.get(wiktionaryIso);
121 this.skipLangIso = skipLangIso;
/** Entry for the section currently being parsed; replaced at the start of each parseSection call and null before the first. */
124 IndexedEntry indexedEntry = null;
/**
 * Renders one section's wiki text to HTML and attaches the result to the title's token data.
 * Reads {@code entrySource} and {@code title}, which are declared outside this block
 * (presumably inherited parser state -- TODO confirm).
 *
 * @param heading section heading; not referenced in the visible lines of this method
 * @param text wiki text of the section
 */
127 public void parseSection(String heading, String text) {
128 assert entrySource != null;
// The entry is labelled with the HTML-escaped page title.
129 final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
130 indexedEntry = new IndexedEntry(htmlEntry);
// NOTE(review): the constructor arguments are on a line missing from this extract.
132 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
// Let the language configuration register its function/template handlers.
134 langConfig.addFunctionCallbacks(callback.functionCallbacks);
// Fresh output buffer for this section; the callback appends HTML to it during dispatch.
136 callback.builder = new StringBuilder();
137 callback.indexedEntry = indexedEntry;
138 callback.dispatch(text, null);
140 htmlEntry.html = callback.builder.toString();
141 indexedEntry.isValid = true;
// Attach the rendered entry to the token data for the title and to the dictionary.
143 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
145 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
146 tokenData.htmlEntries.add(htmlEntry);
147 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
148 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
// Takes a template's named arguments. The body is not present in this extract, so what
// (if anything) it removes cannot be stated from here -- TODO confirm against the full file.
154 void removeUselessArgs(Map<String, String> namedArgs) {
/**
 * Indexes the entry currently being parsed under an additional token.
 *
 * @param token text under which the current entry should be findable
 * @param entryTypeName entry type passed through to the index builder
 */
158 public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
159 titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
/** Matches, as a whole-string test, text made up solely of ASCII characters (the empty string included). */
164 static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
166 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
// Constructor taking the owning parser. Its body is not present in this extract
// (presumably it forwards the parser to the superclass -- TODO confirm).
167 public AppendCallback(WholeSectionToHtmlParser parser) {
/**
 * Forwards plain text to the superclass in an HTML-safe form.
 * If the escapeHtml3 result is pure ASCII it is forwarded as is. The other visible call
 * forwards StringUtil.escapeToPureHtmlUnicode applied to the raw text; the line joining
 * the two branches is missing from this extract (presumably an else).
 */
172 public void onPlainText(String plainText) {
173 final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
174 if (ALL_ASCII.matcher(htmlEscaped).matches()) {
175 super.onPlainText(htmlEscaped);
177 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
/**
 * Handles a [[wiki link]]: optionally indexes its text for the current section and wraps the
 * superclass output in an HTML anchor. Several lines (early exits, the declaration of
 * {@code linkDest}, else-branches) are missing from this extract.
 */
182 public void onWikiLink(WikiTokenizer wikiTokenizer) {
// Links whose text ends with ":<title>" are special-cased; the action taken is not visible here.
183 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
184 // Skips wikilinks like: [[en::dick]]
// Language-specific skip rule; the action taken is not visible here.
187 if (langConfig.skipWikiLink(wikiTokenizer)) {
// Destination: an explicit destination goes through the language's adjustWikiLink; the other
// visible assignment uses the link text itself (presumably the else branch).
191 if (wikiTokenizer.wikiLinkDest() != null) {
192 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
194 linkDest = wikiTokenizer.wikiLinkText();
// Inside a section that maps to an entry type, index the current entry under the link text.
196 if (sectionEntryTypeName != null) {
197 // TODO: inside a definition, this could be the wrong language.
198 titleIndexBuilder.addEntryWithString(indexedEntry, wikiTokenizer.wikiLinkText(), sectionEntryTypeName);
// With a destination the superclass output is wrapped in <a href=...>; the second
// super.onWikiLink call covers the remaining case.
// NOTE(review): linkDest is inserted into the href attribute without HTML escaping in the
// visible lines -- a '"' in the destination would break the attribute. Confirm it is
// escaped elsewhere.
200 if (linkDest != null) {
201 builder.append(String.format("<a href=\"%s\">", linkDest));
202 super.onWikiLink(wikiTokenizer);
203 builder.append(String.format("</a>"));
205 super.onWikiLink(wikiTokenizer);
/**
 * Drops a template's "lang" named argument when it equals skipLangIso (ignoring case), then
 * hands the call to the superclass. The removal mutates the caller's namedArgs map.
 * A missing "lang" argument is harmless (equalsIgnoreCase(null) is false); a null
 * skipLangIso would throw NullPointerException here.
 */
210 public void onFunction(WikiTokenizer wikiTokenizer, String name,
211 List<String> args, Map<String, String> namedArgs) {
212 if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
213 namedArgs.remove("lang");
215 super.onFunction(wikiTokenizer, name, args, namedArgs);
/** Passes embedded HTML straight to the superclass handler; no extra processing is visible here. */
219 public void onHtml(WikiTokenizer wikiTokenizer) {
220 super.onHtml(wikiTokenizer);
// Newline handler. The body is not present in this extract, so its behavior cannot be
// stated from here -- TODO confirm against the full file.
224 public void onNewline(WikiTokenizer wikiTokenizer) {
/** Entry type of the section currently being processed; set by onHeading and read by onWikiLink, which indexes link text only when this is non-null. */
227 EntryTypeName sectionEntryTypeName;
/**
 * Handles a heading: records the section's entry type, then either skips the whole section or
 * emits an {@code <hN>} element whose content is the dispatched heading text.
 * Some control-flow lines (loop exit, return) are missing from this extract.
 */
230 public void onHeading(WikiTokenizer wikiTokenizer) {
231 final String headingText = wikiTokenizer.headingWikiText();
// Remember the entry type so wiki links in this section get indexed under it.
232 sectionEntryTypeName = langConfig.sectionNameToEntryType(headingText);
233 final int depth = wikiTokenizer.headingDepth();
234 if (langConfig.skipSection(headingText)) {
// Advance through tokens (reassigning the parameter) until the next heading of the same
// or a shallower depth, i.e. one not nested under the skipped heading, then rewind to
// the start of that line.
235 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
236 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
237 wikiTokenizer.returnToLineStart();
// Heading depth is used directly as the HTML heading level.
243 builder.append(String.format("\n<h%d>", depth));
244 dispatch(headingText, null);
245 builder.append(String.format("</h%d>\n", depth));
/** Prefix characters of the list levels currently open in the output, outermost first; maintained by onListItem. */
248 final List<Character> listPrefixStack = new ArrayList<Character>();
/**
 * Emits one list item as {@code <li>...</li>}, opening list tags when the item is nested deeper
 * than the currently open lists and closing them when the following item is shallower.
 * NOTE(review): in the visible lines only nesting depth is compared, not the prefix characters,
 * so a change of list type at the same depth does not close and reopen the list -- confirm intended.
 */
251 public void onListItem(WikiTokenizer wikiTokenizer) {
// Start the item on its own output line.
252 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
253 builder.append("\n");
255 final String prefix = wikiTokenizer.listItemPrefix();
// Open one list element per prefix character that is not yet on the stack.
256 while (listPrefixStack.size() < prefix.length()) {
257 builder.append(String.format("<%s>",
258 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
259 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
261 builder.append("<li>");
262 dispatch(wikiTokenizer.listItemWikiText(), null);
263 builder.append("</li>\n");
// Look ahead: if a newline follows, peek one token further and remember to rewind afterwards.
265 WikiTokenizer nextToken = wikiTokenizer.nextToken();
266 boolean returnToLineStart = false;
267 if (nextToken != null && nextToken.isNewline()) {
268 nextToken = nextToken.nextToken();
269 returnToLineStart = true;
// Prefix of the following list item. The value used when there is no following list item
// is on a line missing from this extract.
271 final String nextListHeader;
272 if (nextToken == null || !nextToken.isListItem()) {
275 nextListHeader = nextToken.listItemPrefix();
277 if (returnToLineStart) {
278 wikiTokenizer.returnToLineStart();
// Close every list level deeper than the next item's nesting depth.
280 while (listPrefixStack.size() > nextListHeader.length()) {
281 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
282 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
// Markup toggle state used by onMarkup: italicOn is flipped there for the '' token; boldOn is
// presumably the counterpart for ''' (its toggle line is not visible in this extract).
286 boolean boldOn = false;
287 boolean italicOn = false;
290 public void onMarkup(WikiTokenizer wikiTokenizer) {
291 if ("'''".equals(wikiTokenizer.token())) {
293 builder.append("<b>");
295 builder.append("</b>");
298 } else if ("''".equals(wikiTokenizer.token())) {
300 builder.append("<em>");
302 builder.append("</em>");
304 italicOn = !italicOn;