gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Skip lang=XX for the lang we care about.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
9
10 import org.apache.commons.lang3.StringEscapeUtils;
11
12 import java.util.ArrayList;
13 import java.util.LinkedHashMap;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.regex.Pattern;
17
18 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
19
20     public static final String NAME = "WholeSectionToHtmlParser";
21
22     interface LangConfig {
23         boolean skipSection(final String name);
24         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
25         String adjustWikiLink(String wikiLinkDest);
26     }
27     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
28     static {
29         final Pattern enSkipSections = Pattern.compile(".*Translations|Anagrams|References.*");
30         isoToLangConfig.put("EN", new LangConfig() {
31             @Override
32             public boolean skipSection(String headingText) {
33                 return enSkipSections.matcher(headingText).matches();
34             }
35
36             @Override
37             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
38                 final String wikiText = wikiTokenizer.wikiLinkText();
39                 if (wikiText.startsWith("Category:")) {
40                     return true;
41                 }
42                 return false;
43             }
44             @Override
45             public String adjustWikiLink(String wikiLinkDest) {
46                 if (wikiLinkDest.startsWith("w:") || wikiLinkDest.startsWith("Image:")) {
47                     return null;
48                 }
49                 return wikiLinkDest;
50             }});
51     }
52
53     final IndexBuilder titleIndexBuilder;
54     final String skipLangIso;
55     final LangConfig langConfig;
56
57     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso, final String skipLangIso) {
58         this.titleIndexBuilder = titleIndexBuilder;
59         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
60         this.langConfig = isoToLangConfig.get(wiktionaryIso);
61         this.skipLangIso = skipLangIso;
62     }
63
64     @Override
65     void parseSection(String heading, String text) {
66         HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
67         IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
68
69         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
70                 this);
71
72         callback.builder = new StringBuilder();
73         callback.indexedEntry = indexedEntry;
74         callback.dispatch(text, null);
75
76         htmlEntry.html = callback.builder.toString();
77         indexedEntry.isValid = true;
78
79         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
80
81         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
82         tokenData.htmlEntries.add(htmlEntry);
83         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
84         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
85     }
86
87     @Override
88     void removeUselessArgs(Map<String, String> namedArgs) {
89     }
90
91     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
92         public AppendCallback(WholeSectionToHtmlParser parser) {
93             super(parser);
94         }
95
96         @Override
97         public void onPlainText(String plainText) {
98             super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
99         }
100
101         @Override
102         public void onWikiLink(WikiTokenizer wikiTokenizer) {
103             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
104                 // Skips wikilinks like: [[en::dick]]
105                 return;
106             }
107             if (langConfig.skipWikiLink(wikiTokenizer)) {
108                 return;
109             }
110             String linkDest;
111             if (wikiTokenizer.wikiLinkDest() != null) {
112                 linkDest = langConfig.adjustWikiLink(wikiTokenizer.wikiLinkDest());
113             } else {
114                 linkDest = wikiTokenizer.wikiLinkText();
115             }
116             if (linkDest != null) {
117                 builder.append(String.format("<a href=\"%s\">", linkDest));
118                 super.onWikiLink(wikiTokenizer);
119                 builder.append(String.format("</a>"));
120             } else {
121                 super.onWikiLink(wikiTokenizer);
122             }
123         }
124
125         @Override
126         public void onFunction(WikiTokenizer wikiTokenizer, String name,
127                 List<String> args, Map<String, String> namedArgs) {
128             if (skipLangIso.equalsIgnoreCase(namedArgs.get("lang"))) {
129                 namedArgs.remove("lang");
130             }
131             super.onFunction(wikiTokenizer, name, args, namedArgs);
132         }
133
134         @Override
135         public void onHtml(WikiTokenizer wikiTokenizer) {
136             super.onHtml(wikiTokenizer);
137         }
138
139         @Override
140         public void onNewline(WikiTokenizer wikiTokenizer) {
141         }
142
143         @Override
144         public void onHeading(WikiTokenizer wikiTokenizer) {
145             final String headingText = wikiTokenizer.headingWikiText();
146             final int depth = wikiTokenizer.headingDepth();
147             if (langConfig.skipSection(headingText)) {
148                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
149                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
150                         wikiTokenizer.returnToLineStart();
151                         return;
152                     }
153                 }
154                 return;
155             }
156             builder.append(String.format("\n<h%d>", depth));
157             dispatch(headingText, null);
158             builder.append(String.format("</h%d>\n", depth));
159         }
160
161         final List<Character> listPrefixStack = new ArrayList<Character>();
162
163         @Override
164         public void onListItem(WikiTokenizer wikiTokenizer) {
165             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
166                 builder.append("\n");
167             }
168             final String prefix = wikiTokenizer.listItemPrefix();
169             while (listPrefixStack.size() < prefix.length()) {
170                 builder.append(String.format("<%s>",
171                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
172                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
173             }
174             builder.append("<li>");
175             dispatch(wikiTokenizer.listItemWikiText(), null);
176             builder.append("</li>\n");
177
178             WikiTokenizer nextToken = wikiTokenizer.nextToken();
179             boolean returnToLineStart = false;
180             if (nextToken != null && nextToken.isNewline()) {
181                 nextToken = nextToken.nextToken();
182                 returnToLineStart = true;
183             }
184             final String nextListHeader;
185             if (nextToken == null || !nextToken.isListItem()) {
186                 nextListHeader = "";
187             } else {
188                 nextListHeader = nextToken.listItemPrefix();
189             }
190             if (returnToLineStart) {
191                 wikiTokenizer.returnToLineStart();
192             }
193             while (listPrefixStack.size() > nextListHeader.length()) {
194                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
195                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
196             }
197         }
198
199         boolean boldOn = false;
200         boolean italicOn = false;
201
202         @Override
203         public void onMarkup(WikiTokenizer wikiTokenizer) {
204             if ("'''".equals(wikiTokenizer.token())) {
205                 if (!boldOn) {
206                     builder.append("<b>");
207                 } else {
208                     builder.append("</b>");
209                 }
210                 boldOn = !boldOn;
211             } else if ("''".equals(wikiTokenizer.token())) {
212                 if (!italicOn) {
213                     builder.append("<em>");
214                 } else {
215                     builder.append("</em>");
216                 }
217                 italicOn = !italicOn;
218             } else {
219                 assert false;
220             }
221         }
222
223     }
224
225 }