1 package com.hughes.android.dictionary;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Collections;
7 import java.util.LinkedHashMap;
10 import java.util.TreeMap;
11 import java.util.concurrent.atomic.AtomicInteger;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
15 import javax.xml.parsers.ParserConfigurationException;
16 import javax.xml.parsers.SAXParser;
17 import javax.xml.parsers.SAXParserFactory;
19 import org.xml.sax.Attributes;
20 import org.xml.sax.SAXException;
22 import com.hughes.util.MapUtil;
23 import com.hughes.util.StringUtil;
/**
 * SAX handler that buffers the character data of the {@code title} and
 * {@code text} elements belonging to each {@code page} element it encounters.
 */
25 public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler {
// Dictionary associated with this parser; presumably assigned from the
// constructor argument (the assignment is not visible in this view).
27 final Dictionary dict;
// Receives the character data of the current page's <title> element.
29 StringBuilder titleBuilder;
// Receives the character data of the current page's <text> element.
30 StringBuilder textBuilder;
// Buffer that characters() appends to; null while no captured element is open.
31 StringBuilder currentBuilder = null;
/**
 * Creates a parser for the given dictionary.
 * NOTE(review): the constructor body is not visible in this view; because
 * {@code dict} is a final field without an initializer it is presumably
 * assigned here -- confirm.
 *
 * @param dict the dictionary associated with this parser
 */
33 public WiktionaryXmlParser(final Dictionary dict) {
38 public void startElement(String uri, String localName, String qName,
39 Attributes attributes) {
40 currentBuilder = null;
41 if ("page".equals(qName)) {
42 titleBuilder = new StringBuilder();
43 textBuilder = new StringBuilder();
44 } else if ("title".equals(qName)) {
45 currentBuilder = titleBuilder;
46 } else if ("text".equals(qName)) {
47 currentBuilder = textBuilder;
52 public void characters(char[] ch, int start, int length) throws SAXException {
53 if (currentBuilder != null) {
54 currentBuilder.append(ch, start, length);
/**
 * SAX callback for a closing tag: stops character capture and reacts to the
 * end of a {@code page} element.
 * NOTE(review): the rest of the method header and the body of the
 * {@code page} branch are not visible in this view; the branch presumably
 * hands the buffered page over to endPage() -- confirm.
 */
59 public void endElement(String uri, String localName, String qName)
// Character data following a closing tag belongs to no captured element.
61 currentBuilder = null;
62 if ("page".equals(qName)) {
// A single newline, matched literally.
67 private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL);
// Start of a section heading: "===", optional spaces, then "{{Wortart|".
70 private static final Pattern SECTION_HEADER = Pattern
71 .compile("=== *\\{\\{Wortart\\|");
// The "===" marker that delimits the heading on both sides.
// NOTE(review): the flags argument of the next compile() call is not visible in this view.
73 private static final Pattern WORTART_DELIM = Pattern.compile("===",
// Gender template {{m}}, {{f}} or {{n}}; group 1 is the letter.
75 private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}");
// Two consecutive apostrophes, which endPage() replaces with a double quote.
// NOTE(review): the flags argument of the next compile() call is not visible in this view.
77 private static final Pattern WIKI_QUOTE = Pattern.compile("''",
// Any template {{...}} whose content has no '}'; group 1 is the content.
79 private static final Pattern WIKI_DOUBLE_BRACE = Pattern
80 .compile("\\{\\{([^}]+)\\}\\}");
// Any link [[...]] whose content has no ']'; group 1 is the content.
81 private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
82 .compile("\\[\\[([^\\]]+)\\]\\]");
// A template or a '=' at the position matched by '^'. No MULTILINE flag is
// passed, so '^' only matches at the very start of the input. Not referenced
// by any code visible in this view.
83 private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=");
// Constants of the Field enumeration. Each one pairs a display name with the
// pattern of the template that opens its list in the page text; Wortart and
// Aussprache pass null because endPage() extracts them separately.
// NOTE(review): the enum's declaration line, its name field and the
// assignment of that field are not visible in this view.
86 Wortart("Wortart", null), Aussprache("Aussprache", null), Bedeutungen(
87 "Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")), Synonome(
88 "Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")), Gegenworte(
89 "Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")), Oberbegriffe(
90 "Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")), Unterbegriffe(
91 "Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")), Beispiele(
92 "Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")), Redewendungen(
93 "Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")), CharakteristischeWortkombinationen(
94 "Charakteristische Wortkombinationen", Pattern
95 .compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")), AbgeleiteteBegriffe(
96 "Abgeleitete Begriffe", Pattern
97 .compile("\\{\\{Abgeleitete Begriffe\\}\\}")), Herkunft("Herkunft",
98 Pattern.compile("\\{\\{Herkunft\\}\\}"));
// Pattern of the template that starts this field's list, or null when the
// field is not extracted as a list.
101 final Pattern listPattern;
// Stores the list pattern; the handling of the name argument is not visible here.
103 Field(final String name, final Pattern listPattern) {
105 this.listPattern = listPattern;
// Template {{Wortart|X|Y}}; group 1 is X and group 2 is Y.
109 private static final Pattern WORTART = Pattern
110 .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}");
// Marker that precedes the pronunciation once endPage() has unwrapped wiki links.
// NOTE(review): the flags argument of this compile() call is not visible in
// this view; whether '|' acts as alternation or as a literal character
// depends on it -- confirm.
111 private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:",
// Occurrence counter per error message, kept sorted by message; parse()
// prints this map after parsing.
114 private final Map<String, AtomicInteger> errorCounts = new TreeMap<String, AtomicInteger>();
/**
 * Processes the buffered text of the page that just ended. Quote markers and
 * wiki links are unwrapped, the spans between {@code <!--} and {@code -->}
 * are passed to StringUtil.removeAll, and the remaining text is cut into
 * sections that start at a Wortart heading. For each section the heading,
 * the pronunciation and every list-valued field are extracted and printed to
 * standard output together with whatever is left of the section.
 * NOTE(review): several lines of this method (closing braces, an else branch
 * and a call argument) are not visible in this view; comments below describe
 * only the visible statements.
 */
116 private void endPage() {
118 StringBuilder text = textBuilder;
// Replace every '' with a double quote and every [[x]] with x.
119 text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\""));
120 text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1"));
// Presumably strips everything between "<!--" and "-->" from text; the two
// patterns are compiled anew on every call.
123 StringUtil.removeAll(text, Pattern.compile("<!--", Pattern.LITERAL),
124 Pattern.compile("-->", Pattern.LITERAL));
126 String sectionString;
// Keep cutting sections out of text until StringUtil.remove returns null.
127 while ((sectionString = StringUtil.remove(text, SECTION_HEADER,
128 SECTION_HEADER, false)) != null) {
129 final StringBuilder section = new StringBuilder(sectionString);
// Cut the heading, delimited by "===", out of the section.
// NOTE(review): the last argument of this call is not visible in this view.
// NOTE(review): other callers treat a null result of StringUtil.remove as
// "not found"; wortart is dereferenced below without such a check -- confirm.
131 String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM,
// A heading that spans lines or lacks "eutsch" is counted as invalid; what
// happens to the section afterwards is not visible here (presumably skipped).
133 if (wortart.contains("\n") || !wortart.contains("eutsch")) {
134 MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart,
135 AtomicInteger.class).incrementAndGet();
// Extracted values per field, in insertion order.
139 final LinkedHashMap<Field, List<String>> fieldToValue = new LinkedHashMap<Field, List<String>>();
// Reduce the heading to plain text: drop "===", replace {{Wortart|X|Y}} with
// X, turn {{m}}/{{f}}/{{n}} into {m}/{f}/{n}, unwrap any remaining template,
// drop a leftover "Wortart|" and trim.
141 wortart = wortart.replaceAll("===", "");
142 wortart = WORTART.matcher(wortart).replaceAll("$1");
143 wortart = GENDER.matcher(wortart).replaceAll("{$1}");
144 wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1");
145 wortart = wortart.replaceAll("Wortart\\|", "");
146 wortart = wortart.trim();
147 fieldToValue.put(Field.Wortart, Collections.singletonList(wortart));
// Pronunciation: the span from the AUSSPRACHE marker up to the next newline.
149 String aussprache = StringUtil
150 .remove(section, AUSSPRACHE, NEWLINE, false);
151 if (aussprache != null) {
// Strip the marker, unwrap templates, drop "Lautschrift|" and trim.
152 aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
153 aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
154 aussprache = aussprache.replaceAll("Lautschrift\\|", "");
155 aussprache = aussprache.trim();
156 fieldToValue.put(Field.Aussprache, Collections
157 .singletonList(aussprache));
// Extract the list of every field that declares a list pattern.
160 for (final Field field : Field.values()) {
161 if (field.listPattern != null) {
162 fieldToValue.put(field, extractList(section, field.listPattern));
// Print the page title followed by every non-empty field.
166 System.out.println(titleBuilder);
// NOTE(review): no Aussprache entry is stored when no pronunciation was
// found above, so get(field) can return null here and isEmpty() would then
// throw NullPointerException -- confirm and add a null guard.
167 for (final Field field : Field.values()) {
168 if (fieldToValue.get(field).isEmpty()) {
169 fieldToValue.remove(field);
171 System.out.println(field.name);
172 for (final String line : fieldToValue.get(field)) {
173 System.out.println(" " + line);
// Show the unconsumed remainder of the section.
177 System.out.println("WHAT'S LEFT:");
178 System.out.println(section);
179 System.out.println("------------------------------------------------");
183 // System.out.println(titleBuilder);
// NOTE(review): the lines below look like the interior of a block comment
// holding disabled code; its delimiters are not visible in this view.
185 * final List<String> pronunciations = new ArrayList<String>(); final
186 * CharSequence pronunciationSeq = getSection(text, PRONUNCIATION,
187 * SECTION_START); if (pronunciationSeq != null) { final Matcher
188 * pronunciationMatcher = PRONUNCIATION_EXAMPLE.matcher(pronunciationSeq);
189 * while (pronunciationMatcher.find()) {
190 * pronunciations.add(pronunciationMatcher.group(1)); }
191 * System.out.println("PRONUNCIATIONS:" + pronunciations); }
193 * String[] meanings = null; final CharSequence meaningsSeq =
194 * getSection(text, MEANINGS, SECTION_START); if (meaningsSeq != null) {
195 * meanings = LIST.split(meaningsSeq); meanings[0] = "";
196 * System.out.println("MEANINGS:" + Arrays.toString(meanings)); }
198 * System.out.println(text);
203 private List<String> extractList(final StringBuilder section,
204 final Pattern start) {
205 final List<String> result = new ArrayList<String>();
206 final String linesString = StringUtil.remove(section, start,
207 WIKI_DOUBLE_BRACE, false);
208 if (linesString != null) {
209 String[] lines = linesString.split("\n");
210 for (int i = 1; i < lines.length; ++i) {
211 String bedeutung = lines[i];
212 bedeutung = bedeutung.replaceFirst("^:", "");
213 bedeutung = bedeutung.trim();
214 if (bedeutung.length() > 0) {
215 result.add(bedeutung);
/**
 * Returns the part of {@code input} that begins at the first match of
 * {@code start} and runs up to, but not including, the first match of
 * {@code end} found at or after the end of that start match. When
 * {@code end} does not match there, the section runs to the end of
 * {@code input}.
 * NOTE(review): the rest of the parameter list and the body of the first
 * {@code if} are not visible in this view; {@code end} is presumably a
 * Pattern parameter and the no-match case presumably returns null -- confirm.
 */
222 private static CharSequence getSection(CharSequence input, Pattern start,
224 Matcher startMatcher = start.matcher(input);
225 if (!startMatcher.find()) {
228 Matcher endMatcher = end.matcher(input);
// Look for the end marker only from the end of the start marker onwards.
229 if (!endMatcher.find(startMatcher.end())) {
230 return input.subSequence(startMatcher.start(), input.length());
232 return input.subSequence(startMatcher.start(), endMatcher.start());
235 void parse(final File file) throws ParserConfigurationException,
236 SAXException, IOException {
237 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
238 parser.parse(file, this);
239 System.out.println(errorCounts);