1 package com.hughes.android.dictionary;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Collections;
7 import java.util.LinkedHashMap;
10 import java.util.TreeMap;
11 import java.util.concurrent.atomic.AtomicInteger;
12 import java.util.regex.Pattern;
14 import javax.xml.parsers.ParserConfigurationException;
15 import javax.xml.parsers.SAXParser;
16 import javax.xml.parsers.SAXParserFactory;
18 import org.xml.sax.Attributes;
19 import org.xml.sax.SAXException;
21 import com.hughes.util.MapUtil;
22 import com.hughes.util.StringUtil;
/**
 * SAX handler that collects, for each {@code page} element of the parsed XML,
 * the character data of its {@code title} and {@code text} elements so the
 * page can be processed once it ends. The markup patterns declared in this
 * class use German template names (Wortart, Bedeutungen, ...), so the input is
 * presumably a German Wiktionary dump.
 */
24 public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler {
// Dictionary passed to the constructor.
// NOTE(review): no use of this field is visible in this part of the file.
26 final Dictionary dict;
// Character data of the current page's <title>; replaced by a fresh builder
// whenever a <page> element starts.
28 StringBuilder titleBuilder;
// Character data of the current page's <text>; replaced by a fresh builder
// whenever a <page> element starts.
29 StringBuilder textBuilder;
// Builder that characters() appends to, or null while the current element's
// character data is being ignored.
30 StringBuilder currentBuilder = null;
/**
 * Creates a parser associated with the given dictionary.
 *
 * @param dict the dictionary for this parser. NOTE(review): presumably stored
 *     in the final {@code dict} field; confirm against the constructor body.
 */
32 public WiktionaryXmlParser(final Dictionary dict) {
37 public void startElement(String uri, String localName, String qName,
38 Attributes attributes) {
39 currentBuilder = null;
40 if ("page".equals(qName)) {
41 titleBuilder = new StringBuilder();
42 textBuilder = new StringBuilder();
43 } else if ("title".equals(qName)) {
44 currentBuilder = titleBuilder;
45 } else if ("text".equals(qName)) {
46 currentBuilder = textBuilder;
51 public void characters(char[] ch, int start, int length) throws SAXException {
52 if (currentBuilder != null) {
53 currentBuilder.append(ch, start, length);
/**
 * Stops collecting character data when any element ends and does extra work
 * when the element that ended is a {@code page}.
 *
 * NOTE(review): the statements inside the {@code page} branch are not visible
 * here; presumably they hand the collected title/text to endPage() - confirm.
 *
 * @param uri namespace URI (unused in the visible code)
 * @param localName local element name (unused in the visible code)
 * @param qName qualified element name, compared against "page"
 */
58 public void endElement(String uri, String localName, String qName)
// Character data between elements must not be appended to the previous builder.
60 currentBuilder = null;
61 if ("page".equals(qName)) {
// Literal newline; used by endPage() as the end delimiter of the pronunciation line.
66 private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL);
// Start of a word-class section: "===", optional spaces, then "{{Wortart|".
69 private static final Pattern SECTION_HEADER = Pattern
70 .compile("=== *\\{\\{Wortart\\|");
// The "===" heading marker; used on both sides to cut the Wortart header out
// of a section.
72 private static final Pattern WORTART_DELIM = Pattern.compile("===",
// Gender templates {{m}}, {{f}} and {{n}}; group 1 is the gender letter.
74 private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}");
// Wiki markup '' (two apostrophes), which endPage() turns into a double quote.
76 private static final Pattern WIKI_QUOTE = Pattern.compile("''",
// A {{...}} template with no '}' inside; group 1 is the text between the braces.
78 private static final Pattern WIKI_DOUBLE_BRACE = Pattern
79 .compile("\\{\\{([^}]+)\\}\\}");
// A [[...]] link with no ']' inside; group 1 is the text between the brackets.
80 private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
81 .compile("\\[\\[([^\\]]+)\\]\\]");
// A line that begins with a {{...}} template or with '='; used as the end
// delimiter when a list is extracted from a section.
82 private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE);
// Each constant pairs a display name with the pattern of the {{Name}} template
// that introduces the field's list inside a section. Constants whose pattern
// is null are skipped by the list-extraction loop in endPage().

// Word class; no list pattern because endPage() derives it from the section header.
85 Wortart("Wortart", null),
// Pronunciation; no list pattern because endPage() extracts it from the IPA line.
87 Aussprache("Aussprache", null),
// Meanings.
89 Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")),
// Diminutive forms.
91 Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")),
// Synonyms. NOTE(review): the constant name is misspelled ("Synonome"), while
// the display name and the pattern use "Synonyme"; renaming it would affect
// any code that refers to the constant.
93 Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")),
// Antonyms.
95 Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")),
// Hypernyms (broader terms).
97 Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")),
// Hyponyms (narrower terms).
99 Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")),
// Usage examples.
101 Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")),
// Idioms.
103 Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")),
// Characteristic word combinations (collocations).
105 CharakteristischeWortkombinationen("Charakteristische Wortkombinationen",
106 Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")),
// Derived terms.
108 AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern
109 .compile("\\{\\{Abgeleitete Begriffe\\}\\}")),
// Etymology.
111 Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")),
// Hyphenation; it has a list pattern but a null display name, which endPage()
// checks for before its (currently commented-out) printing code.
113 Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")),
// Pattern marking the start of this field's list in a section, or null when
// the field is not extracted as a list.
118 final Pattern listPattern;
// name: display name of the field (may be null); listPattern: see the field above.
120 Field(final String name, final Pattern listPattern) {
122 this.listPattern = listPattern;
// A complete {{Wortart|...|...}} template; groups 1 and 2 capture the two
// pipe-separated parts, and endPage() keeps only group 1.
126 private static final Pattern WORTART = Pattern
127 .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}");
// Marker that introduces the pronunciation line: ":Hilfe:IPA" or "IPA:".
128 private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:",
// Number of occurrences per error message, kept sorted by message (TreeMap);
// parse() prints this map after parsing.
131 private final Map<String, AtomicInteger> errorCounts = new TreeMap<String, AtomicInteger>();
/**
 * Processes the text collected for the page that just ended.
 *
 * The text is first normalized (quote markup, [[...]] links, HTML comments),
 * then cut into sections that start with a "=== {{Wortart|" header. For each
 * section the word class, the pronunciation and the list-valued fields of
 * {@link Field} are extracted into a map, and the page title is printed to
 * standard output.
 */
133 private void endPage() {
135 StringBuilder text = textBuilder;
// Replace the wiki quote markup '' with a double quote character.
136 text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\""));
// Replace [[link]] with its inner text.
137 text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1"));
// Strip everything between "<!--" and "-->".
// NOTE(review): both patterns are compiled again on every call; they could be
// static final constants like the other patterns in this class.
140 StringUtil.removeAll(text, Pattern.compile("<!--", Pattern.LITERAL),
141 Pattern.compile("-->", Pattern.LITERAL));
143 String sectionString;
// Repeatedly remove one section (delimited by section headers) from the text
// until StringUtil.remove reports that there is none left by returning null.
144 while ((sectionString = StringUtil.remove(text, SECTION_HEADER,
145 SECTION_HEADER, false)) != null) {
146 final StringBuilder section = new StringBuilder(sectionString);
// Cut the "=== ... ===" header out of the section.
// NOTE(review): the result is dereferenced without a null check, although
// other StringUtil.remove results in this method are null-checked - confirm
// that remove cannot return null here.
148 String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM,
// A header that spans several lines or does not contain "eutsch" (presumably
// matching both "Deutsch" and "deutsch") is counted as an error.
150 if (wortart.contains("\n") || !wortart.contains("eutsch")) {
151 MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart,
152 AtomicInteger.class).incrementAndGet();
// Extracted values for this section, in insertion order.
156 final LinkedHashMap<Field, List<String>> fieldToValue = new LinkedHashMap<Field, List<String>>();
// Reduce the header to a plain label: drop the "===" markers, keep only the
// first part of a {{Wortart|a|b}} template, turn {{m}}/{{f}}/{{n}} into
// {m}/{f}/{n}, unwrap any remaining {{...}} and remove a leftover "Wortart|".
158 wortart = wortart.replaceAll("===", "");
159 wortart = WORTART.matcher(wortart).replaceAll("$1");
160 wortart = GENDER.matcher(wortart).replaceAll("{$1}");
161 wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1");
162 wortart = wortart.replaceAll("Wortart\\|", "");
163 wortart = wortart.trim();
164 fieldToValue.put(Field.Wortart, Collections.singletonList(wortart));
// Pronunciation: the text from the IPA marker up to the next newline, if present.
166 String aussprache = StringUtil
167 .remove(section, AUSSPRACHE, NEWLINE, false);
168 if (aussprache != null) {
// Drop the marker itself, unwrap {{...}} templates, and remove the
// "Lautschrift|" prefix together with an optional stress mark that follows it.
169 aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
170 aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
171 aussprache = aussprache.replaceAll("Lautschrift\\|ˈ?", "");
172 aussprache = aussprache.trim();
173 fieldToValue.put(Field.Aussprache, Collections
174 .singletonList(aussprache));
// Extract the list of every field that declares a list pattern.
177 for (final Field field : Field.values()) {
178 if (field.listPattern != null) {
179 fieldToValue.put(field, extractList(section, field.listPattern));
// Print the page title to standard output.
183 System.out.println(titleBuilder);
// Drop fields that ended up without any value.
184 for (final Field field : Field.values()) {
185 if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) {
186 fieldToValue.remove(field);
// Only commented-out debug printing is visible for fields with a display name.
188 if (field.name != null) {
189 // System.out.println(field.name);
190 // for (final String line : fieldToValue.get(field)) {
191 // System.out.println(" " + line);
196 // System.out.println("WHAT'S LEFT:");
197 // System.out.println(section);
198 // System.out.println("------------------------------------------------");
204 private List<String> extractList(final StringBuilder section,
205 final Pattern start) {
206 final List<String> result = new ArrayList<String>();
207 final String linesString = StringUtil.remove(section, start,
208 WIKI_NEW_SECTION, false);
209 if (linesString != null) {
210 String[] lines = linesString.split("\n");
211 for (int i = 1; i < lines.length; ++i) {
212 String bedeutung = lines[i];
213 bedeutung = bedeutung.replaceFirst("^:+", "");
214 bedeutung = bedeutung.trim();
215 if (bedeutung.length() > 0) {
216 result.add(bedeutung);
/**
 * Parses the given XML file with a newly created SAX parser, using this object
 * as the handler, and then prints the accumulated error counts to standard
 * output.
 *
 * NOTE(review): the parser comes from a default SAXParserFactory with no
 * further configuration; if the file can come from an untrusted source,
 * consider disabling DTDs / external entities on the factory.
 *
 * @param file the XML file to parse
 * @throws ParserConfigurationException if the SAX parser cannot be created
 * @throws SAXException if parsing fails
 * @throws IOException if the file cannot be read
 */
223 void parse(final File file) throws ParserConfigurationException,
224 SAXException, IOException {
225 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
226 parser.parse(file, this);
227 System.out.println(errorCounts);