gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/parser/wiktionary/WholeSectionToHtmlParser.java
Got rid of Category:.
[DictionaryPC.git] / src / com / hughes / android / dictionary / parser / wiktionary / WholeSectionToHtmlParser.java
1
2 package com.hughes.android.dictionary.parser.wiktionary;
3
4 import com.hughes.android.dictionary.engine.HtmlEntry;
5 import com.hughes.android.dictionary.engine.IndexBuilder;
6 import com.hughes.android.dictionary.engine.IndexBuilder.TokenData;
7 import com.hughes.android.dictionary.engine.IndexedEntry;
8 import com.hughes.android.dictionary.parser.WikiTokenizer;
9
10 import org.apache.commons.lang3.StringEscapeUtils;
11
12 import java.util.ArrayList;
13 import java.util.LinkedHashMap;
14 import java.util.List;
15 import java.util.Map;
16 import java.util.regex.Pattern;
17
18 public class WholeSectionToHtmlParser extends AbstractWiktionaryParser {
19     
20     interface LangConfig {
21         boolean skipSection(final String name);
22         boolean skipWikiLink(final WikiTokenizer wikiTokenizer);
23     }
24     static final Map<String,LangConfig> isoToLangConfig = new LinkedHashMap<String,LangConfig>();
25     static {
26         final Pattern enSkipSections = Pattern.compile(".*Translations.*");
27         isoToLangConfig.put("EN", new LangConfig() {
28             @Override
29             public boolean skipSection(String headingText) {
30                 return enSkipSections.matcher(headingText).matches();
31             }
32
33             @Override
34             public boolean skipWikiLink(WikiTokenizer wikiTokenizer) {
35                 final String wikiText = wikiTokenizer.wikiLinkText();
36                 if (wikiText.startsWith("Category:")) {
37                     return true;
38                 }
39                 return false;
40             }});
41     }
42
43     public static final String NAME = "WholeSectionToHtmlParser";
44
45     final IndexBuilder titleIndexBuilder;
46     final LangConfig langConfig;
47
48     public WholeSectionToHtmlParser(final IndexBuilder titleIndexBuilder, final String wiktionaryIso) {
49         this.titleIndexBuilder = titleIndexBuilder;
50         assert isoToLangConfig.containsKey(wiktionaryIso): wiktionaryIso;
51         this.langConfig = isoToLangConfig.get(wiktionaryIso);
52     }
53
54     @Override
55     void parseSection(String heading, String text) {
56         HtmlEntry htmlEntry = new HtmlEntry(entrySource, StringEscapeUtils.escapeHtml3(title));
57         IndexedEntry indexedEntry = new IndexedEntry(htmlEntry);
58
59         final AppendAndIndexWikiCallback<WholeSectionToHtmlParser> callback = new AppendCallback(
60                 this);
61
62         callback.builder = new StringBuilder();
63         callback.indexedEntry = indexedEntry;
64         callback.dispatch(text, null);
65
66         htmlEntry.html = callback.builder.toString();
67         indexedEntry.isValid = true;
68
69         final TokenData tokenData = titleIndexBuilder.getOrCreateTokenData(title);
70
71         htmlEntry.addToDictionary(titleIndexBuilder.index.dict);
72         tokenData.htmlEntries.add(htmlEntry);
73         // titleIndexBuilder.addEntryWithString(indexedEntry, title,
74         // EntryTypeName.WIKTIONARY_TITLE_MULTI_DETAIL);
75     }
76
77     @Override
78     void removeUselessArgs(Map<String, String> namedArgs) {
79     }
80
81     class AppendCallback extends AppendAndIndexWikiCallback<WholeSectionToHtmlParser> {
82         public AppendCallback(WholeSectionToHtmlParser parser) {
83             super(parser);
84         }
85
86         @Override
87         public void onPlainText(String plainText) {
88             super.onPlainText(StringEscapeUtils.escapeHtml3(plainText));
89         }
90
91         @Override
92         public void onWikiLink(WikiTokenizer wikiTokenizer) {
93             if (wikiTokenizer.wikiLinkText().endsWith(":" + title)) {
94                 // Skips wikilinks like: [[en::dick]]
95                 return;
96             }
97             if (langConfig.skipWikiLink(wikiTokenizer)) {
98                 return;
99             }
100             super.onWikiLink(wikiTokenizer);
101         }
102
103         @Override
104         public void onFunction(WikiTokenizer wikiTokenizer, String name,
105                 List<String> args, Map<String, String> namedArgs) {
106             super.onFunction(wikiTokenizer, name, args, namedArgs);
107         }
108
109         @Override
110         public void onHtml(WikiTokenizer wikiTokenizer) {
111             super.onHtml(wikiTokenizer);
112         }
113
114         @Override
115         public void onNewline(WikiTokenizer wikiTokenizer) {
116         }
117
118         @Override
119         public void onHeading(WikiTokenizer wikiTokenizer) {
120             final String headingText = wikiTokenizer.headingWikiText();
121             final int depth = wikiTokenizer.headingDepth();
122             if (langConfig.skipSection(headingText)) {
123                 while ((wikiTokenizer = wikiTokenizer.nextToken()) != null) {
124                     if (wikiTokenizer.isHeading() && wikiTokenizer.headingDepth() <= depth) {
125                         wikiTokenizer.returnToLineStart();
126                         return;
127                     }
128                 }
129                 return;
130             }
131             builder.append(String.format("\n<h%d>", depth));
132             dispatch(headingText, null);
133             builder.append(String.format("</h%d>\n", depth));
134         }
135
136         final List<Character> listPrefixStack = new ArrayList<Character>();
137
138         @Override
139         public void onListItem(WikiTokenizer wikiTokenizer) {
140             if (builder.length() != 0 && builder.charAt(builder.length() - 1) != '\n') {
141                 builder.append("\n");
142             }
143             final String prefix = wikiTokenizer.listItemPrefix();
144             while (listPrefixStack.size() < prefix.length()) {
145                 builder.append(String.format("<%s>",
146                         WikiTokenizer.getListTag(prefix.charAt(listPrefixStack.size()))));
147                 listPrefixStack.add(prefix.charAt(listPrefixStack.size()));
148             }
149             builder.append("<li>");
150             dispatch(wikiTokenizer.listItemWikiText(), null);
151             builder.append("</li>\n");
152
153             WikiTokenizer nextToken = wikiTokenizer.nextToken();
154             boolean returnToLineStart = false;
155             if (nextToken != null && nextToken.isNewline()) {
156                 nextToken = nextToken.nextToken();
157                 returnToLineStart = true;
158             }
159             final String nextListHeader;
160             if (nextToken == null || !nextToken.isListItem()) {
161                 nextListHeader = "";
162             } else {
163                 nextListHeader = nextToken.listItemPrefix();
164             }
165             if (returnToLineStart) {
166                 wikiTokenizer.returnToLineStart();
167             }
168             while (listPrefixStack.size() > nextListHeader.length()) {
169                 final char prefixChar = listPrefixStack.remove(listPrefixStack.size() - 1);
170                 builder.append(String.format("</%s>\n", WikiTokenizer.getListTag(prefixChar)));
171             }
172         }
173
174         boolean boldOn = false;
175         boolean italicOn = false;
176
177         @Override
178         public void onMarkup(WikiTokenizer wikiTokenizer) {
179             if ("'''".equals(wikiTokenizer.token())) {
180                 if (!boldOn) {
181                     builder.append("<b>");
182                 } else {
183                     builder.append("</b>");
184                 }
185                 boldOn = !boldOn;
186             } else if ("''".equals(wikiTokenizer.token())) {
187                 if (!italicOn) {
188                     builder.append("<em>");
189                 } else {
190                     builder.append("</em>");
191                 }
192                 italicOn = !italicOn;
193             } else {
194                 assert false;
195             }
196         }
197
198     }
199
200 }