2 package com.hughes.android.dictionary.parser.wiktionary;
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import org.apache.commons.lang3.StringEscapeUtils;
12 import java.util.ArrayList;
13 import java.util.LinkedHashMap;
14 import java.util.List;
16 import java.util.regex.Pattern;
18 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
// Language-specific filtering hooks used while a section is converted to HTML.
// AppendCallback consults skipSection from onHeading and skipWikiLink from onWikiLink.
20 interface LangConfig {
// Returns true when the section with the given heading text should be skipped.
21 boolean skipSection(final String name);
// Returns true when the wiki link at the tokenizer's current token should be skipped.
22 boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
// Registry of LangConfig implementations keyed by the ISO string passed to the constructor,
// which looks its configuration up here.
24 static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
// Headings to skip for "EN". The pattern is applied with matches(), so it must cover the whole
// heading text; because '|' binds loosest, the alternatives are: any text ending in
// "Translations", exactly "Anagrams", or any text starting with "References".
26 final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
// Register the configuration under the "EN" key.
27 isoToLangConfig.put("EN", new LangConfig() {
// Skips a section when its full heading text matches enSkipSections.
29 public boolean skipSection(String headingText) {
30 return enSkipSections.matcher(headingText).matches();
// Decides whether a wiki link is skipped, based on the link text reported by the tokenizer.
34 public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
35 final String wikiText = wikiTokenizer.wikiLinkText();
// NOTE(review): the result of this branch is not visible in this view; presumably links whose
// text starts with "Category:" are skipped -- confirm.
36 if (wikiText.startsWith("Category:")) {
// Name constant for this parser.
43 public static final String NAME = "WholeSectionToHtmlParser";
// Index builder that parseSection uses to file each generated HTML entry under the page title.
45 final IndexBuilder titleIndexBuilder;
// Language-specific skip rules, looked up from isoToLangConfig in the constructor.
46 final LangConfig langConfig;
// Creates a parser that files entries into titleIndexBuilder and filters content with the
// LangConfig registered in isoToLangConfig under wiktionaryIso.
48 public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso) {
49 this.titleIndexBuilder = titleIndexBuilder;
// NOTE(review): this check only runs when assertions are enabled (-ea). Without it, an
// unregistered ISO makes the lookup below return null and langConfig stays null.
50 assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
51 this.langConfig = isoToLangConfig.get(wiktionaryIso);
// Converts the wiki text of one section into an HtmlEntry and attaches it to the title index.
// The heading parameter is not read by any of the lines visible here.
55 void parseSection(String heading, String text) {
// The entry is labelled with the HTML-escaped page title. entrySource and title are not
// declared in this view -- presumably inherited from AbstractWiktionaryParser; confirm.
56 HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
57 IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
// NOTE(review): the constructor arguments continue on a line that is not visible in this view.
59 final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
// Give the callback a fresh output buffer and the entry it works on, then run it over the
// section text; the callback handlers append the generated HTML to callback.builder.
62 callback.builder = new StringBuilder();
63 callback.indexedEntry = indexedEntry;
64 callback.dispatch(text, null);
// Store the accumulated HTML on the entry and mark the indexed entry valid.
66 htmlEntry.html = callback.builder.toString();
67 indexedEntry.isValid = true;
// File the entry under the token data for the page title.
69 final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
71 htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
72 tokenData.htmlEntries.add(htmlEntry);
73 // titleIndexBuilder.addEntryWithString(indexedEntry, title,
74 // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
// Hook that receives the named arguments of a wiki function.
// NOTE(review): no body statements are visible in this view -- confirm whether it is intentionally empty.
78 void removeUselessArgs(Map<String, String> namedArgs) {
// Callback whose handlers append HTML for each wiki token to the inherited builder,
// applying the enclosing parser's langConfig to headings and wiki links.
81 class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
// NOTE(review): the constructor body is not visible in this view; presumably it passes
// parser to the superclass -- confirm.
82 public AppendCallback(WholeSectionToHtmlParser parser) {
// Escapes the plain text with StringEscapeUtils.escapeHtml3 before delegating to the superclass handler.
87 public void onPlainText(String plainText) {
88 super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
// Handles a wiki link: two filters are checked first, and the link is passed to the
// superclass handler otherwise.
// NOTE(review): the bodies of both filter branches are not visible in this view; presumably
// each one returns without emitting anything -- confirm.
92 public void onWikiLink(WikiTokenizer wikiTokenizer) {
// Filter 1: link text that ends with ":" followed by the current page title.
93 if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
94 // Skips wikilinks like: [[en::dick]]
// Filter 2: links rejected by the language configuration.
97 if (langConfig.skipWikiLink(wikiTokenizer)) {
100 super.onWikiLink(wikiTokenizer);
// Handles a wiki function by delegating unchanged to the superclass handler.
104 public void onFunction(WikiTokenizer wikiTokenizer, String name,
105 List<String> args, Map<String, String> namedArgs) {
106 super.onFunction(wikiTokenizer, name, args, namedArgs);
// Handles an HTML token by delegating unchanged to the superclass handler.
110 public void onHtml(WikiTokenizer wikiTokenizer) {
111 super.onHtml(wikiTokenizer);
// Handles a newline token.
// NOTE(review): no body statements are visible in this view -- confirm whether newlines are intentionally ignored.
115 public void onNewline(WikiTokenizer wikiTokenizer) {
// Handles a heading token. When langConfig.skipSection accepts the heading text, the tokens
// that follow are consumed up to the next heading of the same or a shallower depth;
// otherwise the heading is written out as an <hN> element, with N taken from its depth.
119 public void onHeading(WikiTokenizer wikiTokenizer) {
120 final String headingText = wikiTokenizer.headingWikiText();
121 final int depth = wikiTokenizer.headingDepth();
122 if (langConfig.skipSection(headingText)) {
// Advance token by token; the loop also ends when nextToken() returns null.
123 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
// A heading whose depth is <= the skipped heading's depth marks the end of the skipped section.
124 if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
// NOTE(review): presumably this rewinds so that heading is processed normally afterwards;
// the statements that leave the loop and the method are not visible in this view -- confirm.
125 wikiTokenizer.returnToLineStart();
// Emit the heading: opening tag, the heading's own wiki text run through dispatch, closing tag.
131 builder.append(String.format("\n<h%d>", depth));
132 dispatch(headingText, null);
133 builder.append(String.format("</h%d>\n", depth));
// Prefix characters of the list elements opened by onListItem and not yet closed; innermost last.
136 final List<Character> listPrefixStack = new ArrayList<Character>();
// Handles one list item: opens list elements until the nesting matches the item's prefix,
// writes the item as <li>...</li>, then peeks at the next token to decide how many open
// lists to close.
139 public void onListItem(WikiTokenizer wikiTokenizer) {
// Start the list markup on a fresh line unless the output is empty or already ends with a newline.
140 if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
141 builder.append("\n");
143 final String prefix = wikiTokenizer.listItemPrefix();
// Open one list element per prefix character not yet on the stack; the tag name comes
// from WikiTokenizer.getListTag for that character.
144 while (listPrefixStack.size() < prefix.length()) {
145 builder.append(String.format("<%s>",
146 WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
147 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
// The item's wiki text is run through dispatch so markup inside it is converted as well.
149 builder.append("<li>");
150 dispatch(wikiTokenizer.listItemWikiText(), null);
151 builder.append("</li>\n");
// Look ahead to the next token, stepping over one newline token if present.
153 WikiTokenizer nextToken = wikiTokenizer.nextToken();
154 boolean returnToLineStart = false;
155 if (nextToken != null && nextToken.isNewline()) {
156 nextToken = nextToken.nextToken();
157 returnToLineStart = true;
159 final String nextListHeader;
// NOTE(review): the assignment for the "no following list item" case is not visible in this
// view; presumably nextListHeader becomes an empty string so every open list is closed -- confirm.
160 if (nextToken == null || !nextToken.isListItem()) {
163 nextListHeader = nextToken.listItemPrefix();
// Only rewind when the look-ahead stepped over a newline.
165 if (returnToLineStart) {
166 wikiTokenizer.returnToLineStart();
// Close every open list that is nested deeper than the next item's prefix length.
168 while (listPrefixStack.size() > nextListHeader.length()) {
169 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
170 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
// State for onMarkup; presumably each flag is true while the matching tag is open -- confirm.
174 boolean boldOn = false;
175 boolean italicOn = false;
// Translates wiki quote markup into HTML tags: the ''' token produces <b> or </b>, and the
// '' token produces <em> or </em>.
// NOTE(review): the conditions choosing between the opening and closing tag are not visible
// in this view, and the method may continue past the last visible line; presumably the
// choice follows boldOn / italicOn -- confirm.
178 public void onMarkup(WikiTokenizer wikiTokenizer) {
179 if ("'''".equals(wikiTokenizer.token())) {
181 builder.append("<b>");
183 builder.append("</b>");
186 } else if ("''".equals(wikiTokenizer.token())) {
188 builder.append("<em>");
190 builder.append("</em>");
// Flip the italic state after emitting the tag.
192 italicOn = !italicOn;