gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Basic general functions in WholeSectionParser.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
9
10 import org.apache.commons.lang3.StringEscapeUtils;
11
12 import java.util.ArrayList;
13 import java.util.LinkedHashMap;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.regex.Pattern;
17
18 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
19
20     public static final String NAME = "WholeSectionToHtmlParser";
21
22     interface LangConfig {
23         boolean skipSection(final String name);
24         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
25         String adjustWikiLink(String wikiLinkDest);
26         void addFunctionCallbacks(
27                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
28     }
29     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
30     static {
31         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
32         isoToLangConfig.put("EN", new LangConfig() {
33             @Override
34             public boolean skipSection(String headingText) {
35                 return enSkipSections.matcher(headingText).matches();
36             }
37
38             @Override
39             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
40                 final String wikiText = wikiTokenizer.wikiLinkText();
41                 if (wikiText.startsWith("Category:")) {
42                     return true;
43                 }
44                 return false;
45             }
46             @Override
47             public String adjustWikiLink(String wikiLinkDest) {
48                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
49                     return null;
50                 }
51                 return wikiLinkDest;
52             }
53
54             @Override
55             public void addFunctionCallbacks(
56                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
57                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
58             }});
59     }
60
61     final IndexBuilder titleIndexBuilder;
62     final String skipLangIso;
63     final LangConfig langConfig;
64
65     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
66         this.titleIndexBuilder = titleIndexBuilder;
67         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
68         this.langConfig = isoToLangConfig.get(wiktionaryIso);
69         this.skipLangIso = skipLangIso;
70     }
71
72     @Override
73     void parseSection(String heading, String text) {
74         HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
75         IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
76
77         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
78                 this);
79         langConfig.addFunctionCallbacks(callback.functionCallbacks);
80
81         callback.builder = new StringBuilder();
82         callback.indexedEntry = indexedEntry;
83         callback.dispatch(text, null);
84
85         htmlEntry.html = callback.builder.toString();
86         indexedEntry.isValid = true;
87
88         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
89
90         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
91         tokenData.htmlEntries.add(htmlEntry);
92         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
93         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
94     }
95
96     @Override
97     void removeUselessArgs(Map<String, String> namedArgs) {
98     }
99
100     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
101         public AppendCallback(WholeSectionToHtmlParser parser) {
102             super(parser);
103         }
104
105         @Override
106         public void onPlainText(String plainText) {
107             super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
108         }
109
110         @Override
111         public void onWikiLink(WikiTokenizer wikiTokenizer) {
112             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
113                 // Skips wikilinks like: [[en::dick]]
114                 return;
115             }
116             if (langConfig.skipWikiLink(wikiTokenizer)) {
117                 return;
118             }
119             String linkDest;
120             if (wikiTokenizer.wikiLinkDest() != null) {
121                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
122             } else {
123                 linkDest = wikiTokenizer.wikiLinkText();
124             }
125             if (linkDest != null) {
126                 builder.append(String.format("<a href=\"%s\">", linkDest));
127                 super.onWikiLink(wikiTokenizer);
128                 builder.append(String.format("</a>"));
129             } else {
130                 super.onWikiLink(wikiTokenizer);
131             }
132         }
133
134         @Override
135         public void onFunction(WikiTokenizer wikiTokenizer, String name,
136                 List<String> args, Map<String, String> namedArgs) {
137             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
138                 namedArgs.remove("lang");
139             }
140             super.onFunction(wikiTokenizer, name, args, namedArgs);
141         }
142
143         @Override
144         public void onHtml(WikiTokenizer wikiTokenizer) {
145             super.onHtml(wikiTokenizer);
146         }
147
148         @Override
149         public void onNewline(WikiTokenizer wikiTokenizer) {
150         }
151
152         @Override
153         public void onHeading(WikiTokenizer wikiTokenizer) {
154             final String headingText = wikiTokenizer.headingWikiText();
155             final int depth = wikiTokenizer.headingDepth();
156             if (langConfig.skipSection(headingText)) {
157                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
158                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
159                         wikiTokenizer.returnToLineStart();
160                         return;
161                     }
162                 }
163                 return;
164             }
165             builder.append(String.format("\n<h%d>", depth));
166             dispatch(headingText, null);
167             builder.append(String.format("</h%d>\n", depth));
168         }
169
170         final List<Character> listPrefixStack = new ArrayList<Character>();
171
172         @Override
173         public void onListItem(WikiTokenizer wikiTokenizer) {
174             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
175                 builder.append("\n");
176             }
177             final String prefix = wikiTokenizer.listItemPrefix();
178             while (listPrefixStack.size() < prefix.length()) {
179                 builder.append(String.format("<%s>",
180                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
181                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
182             }
183             builder.append("<li>");
184             dispatch(wikiTokenizer.listItemWikiText(), null);
185             builder.append("</li>\n");
186
187             WikiTokenizer nextToken = wikiTokenizer.nextToken();
188             boolean returnToLineStart = false;
189             if (nextToken != null && nextToken.isNewline()) {
190                 nextToken = nextToken.nextToken();
191                 returnToLineStart = true;
192             }
193             final String nextListHeader;
194             if (nextToken == null || !nextToken.isListItem()) {
195                 nextListHeader = "";
196             } else {
197                 nextListHeader = nextToken.listItemPrefix();
198             }
199             if (returnToLineStart) {
200                 wikiTokenizer.returnToLineStart();
201             }
202             while (listPrefixStack.size() > nextListHeader.length()) {
203                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
204                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
205             }
206         }
207
208         boolean boldOn = false;
209         boolean italicOn = false;
210
211         @Override
212         public void onMarkup(WikiTokenizer wikiTokenizer) {
213             if ("'''".equals(wikiTokenizer.token())) {
214                 if (!boldOn) {
215                     builder.append("<b>");
216                 } else {
217                     builder.append("</b>");
218                 }
219                 boldOn = !boldOn;
220             } else if ("''".equals(wikiTokenizer.token())) {
221                 if (!italicOn) {
222                     builder.append("<em>");
223                 } else {
224                     builder.append("</em>");
225                 }
226                 italicOn = !italicOn;
227             } else {
228                 assert false;
229             }
230         }
231
232     }
233
234 }