1 package com.hughes.android.dictionary;
4 import java.io.IOException;
5 import java.util.ArrayList;
6 import java.util.Collections;
7 import java.util.LinkedHashMap;
10 import java.util.TreeMap;
11 import java.util.concurrent.atomic.AtomicInteger;
12 import java.util.regex.Pattern;
14 import javax.xml.parsers.ParserConfigurationException;
15 import javax.xml.parsers.SAXParser;
16 import javax.xml.parsers.SAXParserFactory;
18 import org.xml.sax.Attributes;
19 import org.xml.sax.SAXException;
21 import com.hughes.android.dictionary.engine.Dictionary;
22 import com.hughes.util.MapUtil;
23 import com.hughes.util.StringUtil;
/**
 * SAX handler that buffers the character data of the {@code title} and
 * {@code text} elements of each {@code page} element and then post-processes
 * the buffered text (see {@code endPage}).
 */
25 public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler {
// Dictionary handed to the constructor. It is not read by any code visible in
// this file excerpt.
27 final Dictionary dict;
// Character data of the current page's <title>; re-created at every <page> start.
29 StringBuilder titleBuilder;
// Character data of the current page's <text>; re-created at every <page> start.
30 StringBuilder textBuilder;
// Buffer that characters() appends to; null while outside <title>/<text>.
31 StringBuilder currentBuilder = null;
/**
 * Creates a parser for the given dictionary.
 * NOTE(review): the constructor body is not visible here; presumably it stores
 * {@code dict} in the field of the same name -- confirm.
 *
 * @param dict the dictionary associated with this parser
 */
33 public WiktionaryXmlParser(final Dictionary dict) {
38 public void startElement(String uri, String localName, String qName,
39 Attributes attributes) {
40 currentBuilder = null;
41 if ("page".equals(qName)) {
42 titleBuilder = new StringBuilder();
43 textBuilder = new StringBuilder();
44 } else if ("title".equals(qName)) {
45 currentBuilder = titleBuilder;
46 } else if ("text".equals(qName)) {
47 currentBuilder = textBuilder;
52 public void characters(char[] ch, int start, int length) throws SAXException {
53 if (currentBuilder != null) {
54 currentBuilder.append(ch, start, length);
/**
 * Stops character capture when any element ends and reacts specially to the
 * end of a {@code page} element.
 * NOTE(review): the body of the {@code page} branch is not visible here;
 * presumably it calls {@code endPage()} -- confirm.
 *
 * @param uri namespace URI (unused in the visible code)
 * @param localName local element name (unused in the visible code)
 * @param qName qualified element name that is compared against "page"
 */
59 public void endElement(String uri, String localName, String qName)
61 currentBuilder = null;
62 if ("page".equals(qName)) {
// A literal newline character; used as the end delimiter of the pronunciation line.
67 private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL);
// "===" followed by optional spaces and "{{Wortart|": marks the start of a section.
70 private static final Pattern SECTION_HEADER = Pattern
71 .compile("=== *\\{\\{Wortart\\|");
// The "===" delimiter around a section heading (the flags argument is not visible here).
73 private static final Pattern WORTART_DELIM = Pattern.compile("===",
// {{m}}, {{f}} or {{n}}; group 1 is the single letter.
75 private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}");
// Two consecutive apostrophes; endPage() replaces them with a double quote
// (the flags argument is not visible here).
77 private static final Pattern WIKI_QUOTE = Pattern.compile("''",
// {{...}}; group 1 is everything between the braces (no '}' allowed inside).
79 private static final Pattern WIKI_DOUBLE_BRACE = Pattern
80 .compile("\\{\\{([^}]+)\\}\\}");
// [[...]]; group 1 is everything between the brackets (no ']' allowed inside).
81 private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
82 .compile("\\[\\[([^\\]]+)\\]\\]");
// A line (MULTILINE) that begins with {{...}} or with '='; extractList() uses
// it as the end delimiter of a list.
83 private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE);
// Constants of the Field enum (its declaration line is not visible here).
// Each constant pairs a label with the pattern that marks the start of its
// list in the section text. A null pattern means the field is not filled by
// extractList(); Wortart and Aussprache are filled explicitly in endPage().
86 Wortart("Wortart", null),
88 Aussprache("Aussprache", null),
90 Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")),
92 Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")),
// NOTE(review): the constant is spelled "Synonome" while its label and pattern
// say "Synonyme".
94 Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")),
96 Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")),
98 Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")),
100 Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")),
102 Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")),
104 Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")),
106 CharakteristischeWortkombinationen("Charakteristische Wortkombinationen",
107 Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")),
109 AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern
110 .compile("\\{\\{Abgeleitete Begriffe\\}\\}")),
112 Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")),
// Null label: this list is still extracted from the section, but endPage()
// only consults the label in a null check.
114 Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")),
// Pattern that marks where this field's list starts, or null.
119 final Pattern listPattern;
// NOTE(review): the declaration and assignment of the `name` field are not
// visible here; endPage() reads field.name, so presumably both exist -- confirm.
121 Field(final String name, final Pattern listPattern) {
123 this.listPattern = listPattern;
// {{Wortart|A|B}}; group 1 is A and group 2 is B. endPage() keeps only group 1.
127 private static final Pattern WORTART = Pattern
128 .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}");
// Marker that introduces the pronunciation line.
// NOTE(review): the flags argument is not visible here. If it is
// Pattern.LITERAL, the '|' is matched literally instead of acting as an
// alternation between ":Hilfe:IPA" and "IPA:" -- confirm which is intended.
129 private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:",
// Number of occurrences per error message, kept sorted by message; printed by parse().
132 private final Map<String, AtomicInteger> errorCounts = new TreeMap<String, AtomicInteger>();
/**
 * Post-processes the text buffered for the current page.
 *
 * The text is first normalised: '' becomes a double quote, [[x]] becomes x,
 * and the <!-- ... --> delimiters are handed to StringUtil.removeAll. Spans
 * delimited by SECTION_HEADER are then taken out of the text one at a time,
 * and for each one a field-to-lines map is assembled from the heading, the
 * pronunciation line and the per-field lists.
 *
 * NOTE(review): in the visible code the assembled map is never stored or
 * returned; the only consumer is commented-out printing.
 */
134 private void endPage() {
136 StringBuilder text = textBuilder;
137 text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\""));
138 text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1"));
// NOTE(review): these two patterns are compiled again for every page; they
// could be static constants like the others.
141 StringUtil.removeAll(text, Pattern.compile("<!--", Pattern.LITERAL),
142 Pattern.compile("-->", Pattern.LITERAL));
144 String sectionString;
// Loop until StringUtil.remove finds no further section and returns null.
145 while ((sectionString = StringUtil.remove(text, SECTION_HEADER,
146 SECTION_HEADER, false)) != null) {
147 final StringBuilder section = new StringBuilder(sectionString);
// Pull the "=== ... ===" heading out of the section.
// NOTE(review): every other StringUtil.remove result in this file is
// null-checked, but this one is dereferenced immediately below; presumably it
// can be null when the heading has no closing "===" -- confirm and guard.
149 String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM,
// A heading that spans a line break or lacks the substring "eutsch" is counted
// as an error under a message that includes the heading itself.
151 if (wortart.contains("\n") || !wortart.contains("eutsch")) {
152 MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart,
153 AtomicInteger.class).incrementAndGet();
// Insertion-ordered map from field to the lines extracted for it.
157 final LinkedHashMap<Field, List<String>> fieldToValue = new LinkedHashMap<Field, List<String>>();
// Reduce the heading to plain text: drop "===", keep the first argument of
// {{Wortart|..|..}}, and collapse {{m}}/{{f}}/{{n}} to single braces before
// the generic {{x}} -> x unwrapping so the gender marker keeps one pair of braces.
159 wortart = wortart.replaceAll("===", "");
160 wortart = WORTART.matcher(wortart).replaceAll("$1");
161 wortart = GENDER.matcher(wortart).replaceAll("{$1}");
162 wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1");
163 wortart = wortart.replaceAll("Wortart\\|", "");
164 wortart = wortart.trim();
165 fieldToValue.put(Field.Wortart, Collections.singletonList(wortart));
// Pronunciation: the span from the AUSSPRACHE marker up to the NEWLINE delimiter.
167 String aussprache = StringUtil
168 .remove(section, AUSSPRACHE, NEWLINE, false);
169 if (aussprache != null) {
// Strip the marker, unwrap {{x}} and remove the "Lautschrift|" prefix together
// with an optional following stress mark.
170 aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
171 aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
172 aussprache = aussprache.replaceAll("Lautschrift\\|ˈ?", "");
173 aussprache = aussprache.trim();
174 fieldToValue.put(Field.Aussprache, Collections
175 .singletonList(aussprache));
// Extract the list of every field that declares a start pattern.
178 for (final Field field : Field.values()) {
179 if (field.listPattern != null) {
180 fieldToValue.put(field, extractList(section, field.listPattern));
// Writes the page title to standard output.
184 System.out.println(titleBuilder);
// Drop fields that are missing or whose list is empty.
185 for (final Field field : Field.values()) {
186 if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) {
187 fieldToValue.remove(field);
189 if (field.name != null) {
190 // System.out.println(field.name);
191 // for (final String line : fieldToValue.get(field)) {
192 // System.out.println(" " + line);
197 // System.out.println("WHAT'S LEFT:");
198 // System.out.println(section);
199 // System.out.println("------------------------------------------------");
205 private List<String> extractList(final StringBuilder section,
206 final Pattern start) {
207 final List<String> result = new ArrayList<String>();
208 final String linesString = StringUtil.remove(section, start,
209 WIKI_NEW_SECTION, false);
210 if (linesString != null) {
211 String[] lines = linesString.split("\n");
212 for (int i = 1; i < lines.length; ++i) {
213 String bedeutung = lines[i];
214 bedeutung = bedeutung.replaceFirst("^:+", "");
215 bedeutung = bedeutung.trim();
216 if (bedeutung.length() > 0) {
217 result.add(bedeutung);
224 void parse(final File file) throws ParserConfigurationException,
225 SAXException, IOException {
226 final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
227 parser.parse(file, this);
228 System.out.println(errorCounts);