]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Skip w: and Image: wikiLinks.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
9
10 import org.apache.commons.lang3.StringEscapeUtils;
11
12 import java.util.ArrayList;
13 import java.util.LinkedHashMap;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.regex.Pattern;
17
18 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
19     
20     interface LangConfig {
21         boolean skipSection(final String name);
22         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
23         String adjustWikiLink(String wikiLinkDest);
24     }
25     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
26     static {
27         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
28         isoToLangConfig.put("EN", new LangConfig() {
29             @Override
30             public boolean skipSection(String headingText) {
31                 return enSkipSections.matcher(headingText).matches();
32             }
33
34             @Override
35             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
36                 final String wikiText = wikiTokenizer.wikiLinkText();
37                 if (wikiText.startsWith("Category:")) {
38                     return true;
39                 }
40                 return false;
41             }
42             @Override
43             public String adjustWikiLink(String wikiLinkDest) {
44                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
45                     return null;
46                 }
47                 return wikiLinkDest;
48             }});
49     }
50
51     public static final String NAME = "WholeSectionToHtmlParser";
52
53     final IndexBuilder titleIndexBuilder;
54     final LangConfig langConfig;
55
56     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso) {
57         this.titleIndexBuilder = titleIndexBuilder;
58         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
59         this.langConfig = isoToLangConfig.get(wiktionaryIso);
60     }
61
62     @Override
63     void parseSection(String heading, String text) {
64         HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
65         IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
66
67         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
68                 this);
69
70         callback.builder = new StringBuilder();
71         callback.indexedEntry = indexedEntry;
72         callback.dispatch(text, null);
73
74         htmlEntry.html = callback.builder.toString();
75         indexedEntry.isValid = true;
76
77         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
78
79         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
80         tokenData.htmlEntries.add(htmlEntry);
81         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
82         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
83     }
84
85     @Override
86     void removeUselessArgs(Map<String, String> namedArgs) {
87     }
88
89     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
90         public AppendCallback(WholeSectionToHtmlParser parser) {
91             super(parser);
92         }
93
94         @Override
95         public void onPlainText(String plainText) {
96             super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
97         }
98
99         @Override
100         public void onWikiLink(WikiTokenizer wikiTokenizer) {
101             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
102                 // Skips wikilinks like: [[en::dick]]
103                 return;
104             }
105             if (langConfig.skipWikiLink(wikiTokenizer)) {
106                 return;
107             }
108             String linkDest;
109             if (wikiTokenizer.wikiLinkDest() != null) {
110                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
111             } else {
112                 linkDest = wikiTokenizer.wikiLinkText();
113             }
114             if (linkDest != null) {
115                 builder.append(String.format("<a href=\"%s\">", linkDest));
116                 super.onWikiLink(wikiTokenizer);
117                 builder.append(String.format("</a>"));
118             } else {
119                 super.onWikiLink(wikiTokenizer);
120             }
121         }
122
123         @Override
124         public void onFunction(WikiTokenizer wikiTokenizer, String name,
125                 List<String> args, Map<String, String> namedArgs) {
126             super.onFunction(wikiTokenizer, name, args, namedArgs);
127         }
128
129         @Override
130         public void onHtml(WikiTokenizer wikiTokenizer) {
131             super.onHtml(wikiTokenizer);
132         }
133
134         @Override
135         public void onNewline(WikiTokenizer wikiTokenizer) {
136         }
137
138         @Override
139         public void onHeading(WikiTokenizer wikiTokenizer) {
140             final String headingText = wikiTokenizer.headingWikiText();
141             final int depth = wikiTokenizer.headingDepth();
142             if (langConfig.skipSection(headingText)) {
143                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
144                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
145                         wikiTokenizer.returnToLineStart();
146                         return;
147                     }
148                 }
149                 return;
150             }
151             builder.append(String.format("\n<h%d>", depth));
152             dispatch(headingText, null);
153             builder.append(String.format("</h%d>\n", depth));
154         }
155
156         final List<Character> listPrefixStack = new ArrayList<Character>();
157
158         @Override
159         public void onListItem(WikiTokenizer wikiTokenizer) {
160             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
161                 builder.append("\n");
162             }
163             final String prefix = wikiTokenizer.listItemPrefix();
164             while (listPrefixStack.size() < prefix.length()) {
165                 builder.append(String.format("<%s>",
166                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
167                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
168             }
169             builder.append("<li>");
170             dispatch(wikiTokenizer.listItemWikiText(), null);
171             builder.append("</li>\n");
172
173             WikiTokenizer nextToken = wikiTokenizer.nextToken();
174             boolean returnToLineStart = false;
175             if (nextToken != null && nextToken.isNewline()) {
176                 nextToken = nextToken.nextToken();
177                 returnToLineStart = true;
178             }
179             final String nextListHeader;
180             if (nextToken == null || !nextToken.isListItem()) {
181                 nextListHeader = "";
182             } else {
183                 nextListHeader = nextToken.listItemPrefix();
184             }
185             if (returnToLineStart) {
186                 wikiTokenizer.returnToLineStart();
187             }
188             while (listPrefixStack.size() > nextListHeader.length()) {
189                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
190                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
191             }
192         }
193
194         boolean boldOn = false;
195         boolean italicOn = false;
196
197         @Override
198         public void onMarkup(WikiTokenizer wikiTokenizer) {
199             if ("'''".equals(wikiTokenizer.token())) {
200                 if (!boldOn) {
201                     builder.append("<b>");
202                 } else {
203                     builder.append("</b>");
204                 }
205                 boldOn = !boldOn;
206             } else if ("''".equals(wikiTokenizer.token())) {
207                 if (!italicOn) {
208                     builder.append("<em>");
209                 } else {
210                     builder.append("</em>");
211                 }
212                 italicOn = !italicOn;
213             } else {
214                 assert false;
215             }
216         }
217
218     }
219
220 }