gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Put links into HtmlEntry.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.EntryTypeName;
5 import com.hughes.android.dictionary.engine.HtmlEntry;
6 import com.hughes.android.dictionary.engine.IndexBuilder;
7 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
8 import com.hughes.android.dictionary.engine.IndexedEntry;
9 import com.hughes.android.dictionary.parser.WikiTokenizer;
10 import com.hughes.util.StringUtil;
11
12 import org.apache.commons.lang3.StringEscapeUtils;
13
14 import java.util.ArrayList;
15 import java.util.LinkedHashMap;
16 import java.util.List;
17 import java.util.Map;
18 import java.util.regex.Pattern;
19
20 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
21
22     public static final String NAME = "WholeSectionToHtmlParser";
23
24     interface LangConfig {
25         boolean skipSection(final String name);
26         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
27         String adjustWikiLink(String wikiLinkDest);
28         void addFunctionCallbacks(
29                 Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks);
30     }
31     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
32     static {
33         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
34         isoToLangConfig.put("EN", new LangConfig() {
35             @Override
36             public boolean skipSection(String headingText) {
37                 return enSkipSections.matcher(headingText).matches();
38             }
39
40             @Override
41             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
42                 final String wikiText = wikiTokenizer.wikiLinkText();
43                 if (wikiText.startsWith("Category:")) {
44                     return true;
45                 }
46                 return false;
47             }
48             @Override
49             public String adjustWikiLink(String wikiLinkDest) {
50                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
51                     return null;
52                 }
53                 return wikiLinkDest;
54             }
55
56             @Override
57             public void addFunctionCallbacks(
58                     Map<String, FunctionCallback<WholeSectionToHtmlParser>> functionCallbacks) {
59                 EnFunctionCallbacks.addGenericCallbacks(functionCallbacks);
60             }});
61     }
62
63     final IndexBuilder titleIndexBuilder;
64     final String skipLangIso;
65     final LangConfig langConfig;
66
67     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
68         this.titleIndexBuilder = titleIndexBuilder;
69         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
70         this.langConfig = isoToLangConfig.get(wiktionaryIso);
71         this.skipLangIso = skipLangIso;
72     }
73     
74     IndexedEntry indexedEntry = null;
75
76     @Override
77     public void parseSection(String heading, String text) {
78         assert entrySource != null;
79         final HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
80         indexedEntry = new IndexedEntry(htmlEntry);
81
82         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
83                 this);
84         langConfig.addFunctionCallbacks(callback.functionCallbacks);
85
86         callback.builder = new StringBuilder();
87         callback.indexedEntry = indexedEntry;
88         callback.dispatch(text, null);
89
90         htmlEntry.html = callback.builder.toString();
91         indexedEntry.isValid = true;
92
93         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
94
95         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
96         tokenData.htmlEntries.add(htmlEntry);
97         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
98         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
99         
100         indexedEntry = null;
101     }
102
103     @Override
104     void removeUselessArgs(Map<String, String> namedArgs) {
105     }
106     
107     @Override
108     public void addLinkToCurrentEntry(String token, EntryTypeName entryTypeName) {
109         titleIndexBuilder.addEntryWithString(indexedEntry, token, entryTypeName);
110     }
111
112
113
114     static final Pattern ALL_ASCII = Pattern.compile("[\\p{ASCII}]*");
115
116     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
117         public AppendCallback(WholeSectionToHtmlParser parser) {
118             super(parser);
119         }
120
121         @Override
122         public void onPlainText(String plainText) {
123             final String htmlEscaped = StringEscapeUtils.escapeHtml3(plainText);
124             if (ALL_ASCII.matcher(htmlEscaped).matches()) {
125                 super.onPlainText(htmlEscaped);
126             } else { 
127                 super.onPlainText(StringUtil.escapeToPureHtmlUnicode(plainText));
128             }
129         }
130
131         @Override
132         public void onWikiLink(WikiTokenizer wikiTokenizer) {
133             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
134                 // Skips wikilinks like: [[en::dick]]
135                 return;
136             }
137             if (langConfig.skipWikiLink(wikiTokenizer)) {
138                 return;
139             }
140             String linkDest;
141             if (wikiTokenizer.wikiLinkDest() != null) {
142                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
143             } else {
144                 linkDest = wikiTokenizer.wikiLinkText();
145             }
146             if (linkDest != null) {
147                 builder.append(String.format("<a href=\"%s\">", linkDest));
148                 super.onWikiLink(wikiTokenizer);
149                 builder.append(String.format("</a>"));
150             } else {
151                 super.onWikiLink(wikiTokenizer);
152             }
153         }
154
155         @Override
156         public void onFunction(WikiTokenizer wikiTokenizer, String name,
157                 List<String> args, Map<String, String> namedArgs) {
158             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
159                 namedArgs.remove("lang");
160             }
161             super.onFunction(wikiTokenizer, name, args, namedArgs);
162         }
163
164         @Override
165         public void onHtml(WikiTokenizer wikiTokenizer) {
166             super.onHtml(wikiTokenizer);
167         }
168
169         @Override
170         public void onNewline(WikiTokenizer wikiTokenizer) {
171         }
172
173         @Override
174         public void onHeading(WikiTokenizer wikiTokenizer) {
175             final String headingText = wikiTokenizer.headingWikiText();
176             final int depth = wikiTokenizer.headingDepth();
177             if (langConfig.skipSection(headingText)) {
178                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
179                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
180                         wikiTokenizer.returnToLineStart();
181                         return;
182                     }
183                 }
184                 return;
185             }
186             builder.append(String.format("\n<h%d>", depth));
187             dispatch(headingText, null);
188             builder.append(String.format("</h%d>\n", depth));
189         }
190
191         final List<Character> listPrefixStack = new ArrayList<Character>();
192
193         @Override
194         public void onListItem(WikiTokenizer wikiTokenizer) {
195             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
196                 builder.append("\n");
197             }
198             final String prefix = wikiTokenizer.listItemPrefix();
199             while (listPrefixStack.size() < prefix.length()) {
200                 builder.append(String.format("<%s>",
201                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
202                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
203             }
204             builder.append("<li>");
205             dispatch(wikiTokenizer.listItemWikiText(), null);
206             builder.append("</li>\n");
207
208             WikiTokenizer nextToken = wikiTokenizer.nextToken();
209             boolean returnToLineStart = false;
210             if (nextToken != null && nextToken.isNewline()) {
211                 nextToken = nextToken.nextToken();
212                 returnToLineStart = true;
213             }
214             final String nextListHeader;
215             if (nextToken == null || !nextToken.isListItem()) {
216                 nextListHeader = "";
217             } else {
218                 nextListHeader = nextToken.listItemPrefix();
219             }
220             if (returnToLineStart) {
221                 wikiTokenizer.returnToLineStart();
222             }
223             while (listPrefixStack.size() > nextListHeader.length()) {
224                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
225                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
226             }
227         }
228
229         boolean boldOn = false;
230         boolean italicOn = false;
231
232         @Override
233         public void onMarkup(WikiTokenizer wikiTokenizer) {
234             if ("'''".equals(wikiTokenizer.token())) {
235                 if (!boldOn) {
236                     builder.append("<b>");
237                 } else {
238                     builder.append("</b>");
239                 }
240                 boldOn = !boldOn;
241             } else if ("''".equals(wikiTokenizer.token())) {
242                 if (!italicOn) {
243                     builder.append("<em>");
244                 } else {
245                     builder.append("</em>");
246                 }
247                 italicOn = !italicOn;
248             } else {
249                 assert false;
250             }
251         }
252
253     }
254
255 }