package com.hughes.android.dictionary;\r
\r
import java.io.BufferedReader;\r
+import java.io.File;\r
import java.io.FileInputStream;\r
import java.io.FileNotFoundException;\r
import java.io.IOException;\r
import java.io.RandomAccessFile;\r
import java.nio.charset.Charset;\r
import java.util.ArrayList;\r
-import java.util.Arrays;\r
import java.util.Collections;\r
-import java.util.Comparator;\r
-import java.util.HashMap;\r
import java.util.List;\r
import java.util.Map;\r
+import java.util.Random;\r
import java.util.Set;\r
+import java.util.TreeMap;\r
+\r
+import javax.xml.parsers.ParserConfigurationException;\r
+\r
+import org.xml.sax.SAXException;\r
\r
import com.hughes.android.dictionary.Dictionary.IndexEntry;\r
+import com.hughes.android.dictionary.Dictionary.LanguageData;\r
import com.hughes.android.dictionary.Dictionary.Row;\r
+import com.hughes.util.Args;\r
+import com.hughes.util.FileUtil;\r
\r
public class DictionaryBuilder {\r
\r
- static final List<InputFile> inputFiles = Arrays.asList(\r
- new InputFile("c:\\thad\\de-en-chemnitz.txt", Charset.forName("UTF8"), true)\r
- // Thad's extra sauce: \r
-// ,new InputFile("c:\\thad\\de-en-dictcc.txt", Charset.forName("Cp1252"), false)\r
- );\r
- static final String dictOutFilename = "c:\\thad\\de-en.dict";\r
- \r
- static class InputFile {\r
- final String file;\r
- final Charset charset;\r
- final boolean hasMultipleSubentries;\r
- public InputFile(String file, Charset charset, boolean hasMultipleSubentries) {\r
- this.file = file;\r
- this.charset = charset;\r
- this.hasMultipleSubentries = hasMultipleSubentries;\r
+ public static void main(String[] args) throws IOException,\r
+ ClassNotFoundException, ParserConfigurationException, SAXException {\r
+ \r
+ final Map<String,String> keyValueArgs = Args.keyValueArgs(args);\r
+ \r
+ final Language lang1 = Language.lookup(keyValueArgs.remove("lang1"));\r
+ final Language lang2 = Language.lookup(keyValueArgs.remove("lang2"));\r
+ if (lang1 == null || lang2 == null) {\r
+ fatalError("--lang1= and --lang2= must both be specified.");\r
}\r
- }\r
+ \r
+ final String dictOutFilename = keyValueArgs.remove("dictOut");\r
+ if (dictOutFilename == null) {\r
+ fatalError("--dictOut= must be specified.");\r
+ }\r
+ \r
+ String summaryText = keyValueArgs.remove("summaryText");\r
+ if (summaryText == null) {\r
+ fatalError("--summaryText= must be specified.");\r
+ }\r
+ if (summaryText.startsWith("@")) {\r
+ summaryText = FileUtil.readToString(new File(summaryText.substring(1)));\r
+ }\r
+ \r
+ final String maxEntriesString = keyValueArgs.remove("maxEntries");\r
+ final int maxEntries = maxEntriesString == null ? Integer.MAX_VALUE : Integer.parseInt(maxEntriesString);\r
+ \r
+ System.out.println("lang1=" + lang1);\r
+ System.out.println("lang2=" + lang2);\r
+ System.out.println("summaryText=" + summaryText);\r
+ System.out.println("dictOut=" + dictOutFilename); \r
\r
- public static void main(String[] args) throws IOException,\r
- ClassNotFoundException {\r
-\r
- final Dictionary dict = new Dictionary("de-en.txt - a German-English dictionary\n" +\r
- "Version: devel, 2009-08-12\n" +\r
- "Source: http://dict.tu-chemnitz.de/\n" +\r
- "Thanks to Frank Richter.", Language.DE, Language.EN);\r
- System.out.println(Charset.forName("Cp1252"));\r
- for (final InputFile inputFile : inputFiles) {\r
- processInputFile(dict, inputFile);\r
+ final Dictionary dict = new Dictionary(summaryText, lang1, lang2);\r
+\r
+ for (int i = 0; i < 100; ++i) {\r
+ final String prefix = "input" + i;\r
+ if (keyValueArgs.containsKey(prefix)) {\r
+ final File file = new File(keyValueArgs.remove(prefix));\r
+ System.out.println("Processing: " + file);\r
+ String charsetName = keyValueArgs.remove(prefix + "Charset");\r
+ if (charsetName == null) {\r
+ charsetName = "UTF8";\r
+ }\r
+ final Charset charset = Charset.forName(charsetName);\r
+ String inputName = keyValueArgs.remove(prefix + "Name");\r
+ if (inputName == null) {\r
+ fatalError("Must specify human readable name for: " + prefix + "Name");\r
+ }\r
+\r
+ String inputFormat = keyValueArgs.remove(prefix + "Format");\r
+ if ("dictcc".equals(inputFormat)) {\r
+ processLinedInputFile(dict, file, charset, false, maxEntries);\r
+ } else if ("chemnitz".equals(inputFormat)) {\r
+ processLinedInputFile(dict, file, charset, true, maxEntries);\r
+ } else if ("wiktionary".equals(inputFormat)) {\r
+ new WiktionaryXmlParser(dict).parse(file);\r
+ } else {\r
+ fatalError("Invalid or missing input format: " + inputFormat);\r
+ }\r
+ \r
+ dict.sources.add(inputName);\r
+ System.out.println("Done: " + file + "\n\n");\r
+ }\r
+ }\r
+ \r
+ if (!keyValueArgs.isEmpty()) {\r
+ System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);\r
}\r
\r
createIndex(dict, Entry.LANG1);\r
dictOut.setLength(0);\r
dict.write(dictOut);\r
dictOut.close();\r
+ \r
+ final Random random = new Random(0);\r
+ for (byte lang = 0; lang < 2; ++lang) {\r
+ final LanguageData languageData = dict.languageDatas[lang];\r
+ System.out.println("\nRandom words for: " + languageData.language.getSymbol());\r
+ for (int i = 0; i < 10; ++i) {\r
+ final int w = random.nextInt(languageData.sortedIndex.size());\r
+ final IndexEntry entry = languageData.sortedIndex.get(w);\r
+ final List<Row> rows = languageData.rows;\r
+ int r = entry.startRow;\r
+ System.out.println(languageData.rowToString(rows.get(r), false));\r
+ ++r;\r
+ while (r < rows.size() && !rows.get(r).isToken()) {\r
+ System.out.println(" " + languageData.rowToString(rows.get(r), false));\r
+ ++r;\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ /**\r
+ * Prints the given message on stderr and then terminates the JVM with exit status 1.\r
+ *\r
+ * @param string the message to report before exiting\r
+ */\r
+ private static void fatalError(String string) {\r
+ System.err.println(string);\r
+ // System.exit(n) is specified as shorthand for this call.\r
+ Runtime.getRuntime().exit(1);\r
 }\r
\r
- private static void processInputFile(final Dictionary dict, final InputFile inputFile) throws FileNotFoundException, IOException {\r
- final BufferedReader dictionaryIn = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile.file), inputFile.charset));\r
+ private static void processLinedInputFile(final Dictionary dict, final File file,\r
+ final Charset charset, final boolean hasMultipleSubentries,\r
+ final int maxEntries) throws FileNotFoundException, IOException {\r
+ final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));\r
String line;\r
int lineCount = 0;\r
- while ((line = dictionaryIn.readLine()) != null) {\r
-// System.out.println(line);\r
+ while ((line = reader.readLine()) != null && lineCount < maxEntries) {\r
+ if (maxEntries < 200) { \r
+ System.out.println(line);\r
+ }\r
line = line.trim();\r
- if (line.isEmpty() || line.startsWith("#")) {\r
+ if (line.equals("") || line.startsWith("#")) {\r
continue;\r
}\r
\r
- final Entry entry = Entry.parseFromLine(line, inputFile.hasMultipleSubentries);\r
+ final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries);\r
if (entry == null) {\r
System.err.println("Invalid entry: " + line);\r
continue;\r
}\r
lineCount++;\r
}\r
- dictionaryIn.close();\r
+ reader.close();\r
}\r
\r
public static void createIndex(final Dictionary dict, final byte lang) {\r
System.out.println("Creating index: " + lang);\r
\r
- final Map<String, TokenData> tokenDatas = new HashMap<String, TokenData>();\r
- final EntryData entryDatas[] = new EntryData[dict.entries.size()];\r
+ final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);\r
\r
for (int e = 0; e < dict.entries.size(); ++e) {\r
final Entry entry = dict.entries.get(e);\r
final Set<String> tokens = entry.getIndexableTokens(lang);\r
- entryDatas[e] = new EntryData(tokens.size());\r
for (final String token : tokens) {\r
- TokenData tokenData = tokenDatas.get(token);\r
+ TokenData tokenData = tokenToData.get(token);\r
if (tokenData == null) {\r
tokenData = new TokenData(token);\r
- tokenDatas.put(token, tokenData);\r
+ tokenToData.put(token, tokenData);\r
}\r
- tokenData.entries.add(e);\r
+ tokenData.entries.add(new TokenEntryData(lang, token, entry, e));\r
}\r
\r
if (e % 10000 == 0) {\r
// Sort it.\r
\r
System.out.println("Sorting TokenData...");\r
- final List<TokenData> sortedIndex = new ArrayList<TokenData>(tokenDatas\r
+ final List<TokenData> sortedTokenData = new ArrayList<TokenData>(tokenToData\r
.values());\r
- Collections.sort(sortedIndex, new Comparator<TokenData>() {\r
- @Override\r
- public int compare(TokenData tokenData0, TokenData tokenData1) {\r
- return dict.languageDatas[lang].language.sortComparator.compare(tokenData0.token, tokenData1.token);\r
- }});\r
\r
System.out.println("Sorting entries within each TokenData...");\r
- final Comparator<Integer> entryComparator = new Comparator<Integer>() {\r
- @Override\r
- public int compare(Integer o1, Integer o2) {\r
- // TODO: better this\r
- // Relevant (first token match) chemnitz entries first\r
- // first token position in entry\r
- // entry length in chars\r
- return entryDatas[o1].numTokens < entryDatas[o2].numTokens ? -1\r
- : entryDatas[o1].numTokens == entryDatas[o2].numTokens ? 0 : 1;\r
- }\r
- };\r
- for (final TokenData tokenData : tokenDatas.values()) {\r
- Collections.sort(tokenData.entries, entryComparator);\r
+ for (final TokenData tokenData : sortedTokenData) {\r
+ Collections.sort(tokenData.entries);\r
}\r
\r
// Put it all together.\r
System.out.println("Assembling final data structures...");\r
final List<Row> rows = dict.languageDatas[lang].rows;\r
final List<IndexEntry> indexEntries = dict.languageDatas[lang].sortedIndex;\r
- for (int t = 0; t < sortedIndex.size(); ++t) {\r
- final TokenData tokenData = sortedIndex.get(t);\r
+ for (int t = 0; t < sortedTokenData.size(); ++t) {\r
+ final TokenData tokenData = sortedTokenData.get(t);\r
final int startRow = rows.size();\r
final IndexEntry indexEntry = new IndexEntry(tokenData.token, startRow);\r
indexEntries.add(indexEntry);\r
final Row tokenRow = new Row(-(t + 1));\r
rows.add(tokenRow);\r
\r
- for (final Integer e : tokenData.entries) {\r
- final Row entryRow = new Row(e);\r
+ for (final TokenEntryData entryData : tokenData.entries) {\r
+ final Row entryRow = new Row(entryData.entryIndex);\r
rows.add(entryRow);\r
}\r
}\r
\r
}\r
\r
- static final class EntryData {\r
- final int numTokens;\r
+ /**\r
+ * One (token, entry) pairing, carrying the keys used to order the entries\r
+ * listed under the same token. createIndex builds one of these per indexable\r
+ * token of every entry and sorts each token's list with Collections.sort.\r
+ */\r
+ static final class TokenEntryData implements Comparable<TokenEntryData> {\r
+ final String token;\r
+ final Entry entry;\r
+ final int entryIndex;\r
+ \r
+ // Starting value for the three minima below, i.e. "token not found in any\r
+ // subentry". Every minimum therefore stays within [0, 100000], so the\r
+ // subtractions in compareTo cannot overflow an int.\r
+ private static final int bigNoOverflow = 100000;\r
+\r
+ // Each minimum is taken independently over all subentries that contain the\r
+ // token, so the three values may come from different subentries.\r
+ // Smallest character offset at which the token occurs.\r
+ int minSubEntryIndexOf = bigNoOverflow;\r
+ // Length in chars of the shortest subentry containing the token.\r
+ int minSubEntryLength = bigNoOverflow;\r
+ // Position of the first subentry containing the token.\r
+ int minSubEntry = bigNoOverflow;\r
+\r
+ /**\r
+ * Scans every string returned by entry.getAllText(lang) for the token and\r
+ * records the three sort keys.\r
+ *\r
+ * @param lang language side passed through to entry.getAllText\r
+ * @param token the index token this pairing belongs to\r
+ * @param entry the entry that produced the token\r
+ * @param entryIndex position of the entry in the dictionary's entry list\r
+ */\r
+ public TokenEntryData(final byte lang, final String token, final Entry entry, final int entryIndex) {\r
+ this.token = token;\r
+ this.entry = entry;\r
+ this.entryIndex = entryIndex;\r
+ \r
+ final String[] subentries = entry.getAllText(lang);\r
+ for (int s = 0; s < subentries.length; ++s) {\r
+ final String subentry = subentries[s];\r
+ // Plain String.indexOf: an exact, case-sensitive substring match.\r
+ int indexOf = subentry.indexOf(token);\r
+ if (indexOf != -1) {\r
+ minSubEntryIndexOf = Math.min(minSubEntryIndexOf, indexOf); \r
+ minSubEntryLength = Math.min(minSubEntryLength, subentry.length());\r
+ minSubEntry = Math.min(minSubEntry, s);\r
+ }\r
+ }\r
+ }\r
 \r
- public EntryData(int numTokens) {\r
- this.numTokens = numTokens;\r
+ /**\r
+ * Orders pairings of the same token: earliest occurrence of the token\r
+ * first, then shortest containing subentry, then lowest subentry position.\r
+ * Pairings equal on all three keys compare as 0.\r
+ */\r
+ @Override\r
+ public int compareTo(final TokenEntryData that) {\r
+ assert this.token.equals(that.token);\r
+ \r
+ if (this.minSubEntryIndexOf != that.minSubEntryIndexOf) {\r
+ return this.minSubEntryIndexOf - that.minSubEntryIndexOf;\r
+ }\r
+ if (this.minSubEntryLength != that.minSubEntryLength) {\r
+ return this.minSubEntryLength - that.minSubEntryLength;\r
+ }\r
+ return this.minSubEntry - that.minSubEntry;\r
 }\r
 }\r
\r
static final class TokenData {\r
final String token;\r
- final List<Integer> entries = new ArrayList<Integer>();\r
+ final List<TokenEntryData> entries = new ArrayList<TokenEntryData>();\r
\r
int startRow;\r
\r
Entry.parseFromLine("rennen :: run", false));\r
\r
{\r
- final Dictionary dict = new Dictionary("test", Language.DE, Language.EN);\r
+ final Dictionary dict = new Dictionary("test", Language.de, Language.en);\r
dict.entries.addAll(entries);\r
DictionaryBuilder.createIndex(dict, Entry.LANG1);\r
DictionaryBuilder.createIndex(dict, Entry.LANG2);\r
\r
// Hyphenated words get put both multiple listings.\r
\r
- final Dictionary dict = new Dictionary("test", Language.DE, Language.EN);\r
+ final Dictionary dict = new Dictionary("test", Language.de, Language.en);\r
dict.entries.addAll(entries);\r
DictionaryBuilder.createIndex(dict, Entry.LANG1);\r
DictionaryBuilder.createIndex(dict, Entry.LANG2);\r
}\r
\r
public void testGermanSort() {\r
- assertEquals("aüÄ", Language.DE.textNorm("aueAe"));\r
+ assertEquals("aüÄ", Language.de.textNorm("aueAe"));\r
final List<String> words = Arrays.asList(\r
"er-ben",\r
"erben",\r
"Huelle",\r
"Hum"\r
);\r
- assertEquals(0, Language.DE.sortComparator.compare("hülle", "huelle"));\r
- assertEquals(0, Language.DE.sortComparator.compare("huelle", "hülle"));\r
+ assertEquals(0, Language.de.sortComparator.compare("hülle", "huelle"));\r
+ assertEquals(0, Language.de.sortComparator.compare("huelle", "hülle"));\r
\r
- assertEquals(-1, Language.DE.sortComparator.compare("hülle", "Hülle"));\r
- assertEquals(0, Language.DE.findComparator.compare("hülle", "Hülle"));\r
- assertEquals(-1, Language.DE.findComparator.compare("hulle", "Hülle"));\r
+ assertEquals(-1, Language.de.sortComparator.compare("hülle", "Hülle"));\r
+ assertEquals(0, Language.de.findComparator.compare("hülle", "Hülle"));\r
+ assertEquals(-1, Language.de.findComparator.compare("hulle", "Hülle"));\r
\r
\r
for (final String s : words) {\r
- System.out.println(s + "\t" + Language.DE.textNorm(s));\r
+ System.out.println(s + "\t" + Language.de.textNorm(s));\r
}\r
final List<String> sorted = new ArrayList<String>(words);\r
// Collections.shuffle(shuffled, new Random(0));\r
- Collections.sort(sorted, Language.DE.sortComparator);\r
+ Collections.sort(sorted, Language.de.sortComparator);\r
System.out.println(sorted.toString());\r
for (int i = 0; i < words.size(); ++i) {\r
System.out.println(words.get(i) + "\t" + sorted.get(i));\r
"preprocess");\r
\r
final List<String> sorted = new ArrayList<String>(words);\r
- Collections.sort(sorted, Language.EN.sortComparator);\r
+ Collections.sort(sorted, Language.en.sortComparator);\r
for (int i = 0; i < words.size(); ++i) {\r
if (i > 0) {\r
- assertTrue(Language.EN.sortComparator.compare(words.get(i-1), words.get(i)) < 0);\r
+ assertTrue(Language.en.sortComparator.compare(words.get(i-1), words.get(i)) < 0);\r
}\r
System.out.println(words.get(i) + "\t" + sorted.get(i));\r
assertEquals(words.get(i), sorted.get(i));\r
}\r
\r
- assertTrue(Language.EN.sortCollator.compare("pre-print", "preppy") < 0);\r
+ assertTrue(Language.en.sortCollator.compare("pre-print", "preppy") < 0);\r
\r
}\r
+ \r
+ /**\r
+ * Checks that Language.lookup resolves the symbols "de", "en" and "es",\r
+ * after printing every registered language for manual inspection.\r
+ */\r
+ public void testLanguage() {\r
+ final String registered = "languages=" + Language.symbolToLangauge.values();\r
+ System.out.println(registered);\r
+\r
+ final Language german = Language.lookup("de");\r
+ assertEquals(Language.de, german);\r
+\r
+ final Language english = Language.lookup("en");\r
+ assertEquals(Language.en, english);\r
+\r
+ final Language spanish = Language.lookup("es");\r
+ assertEquals("es", spanish.symbol);\r
+ }\r
\r
}\r
--- /dev/null
+package com.hughes.android.dictionary;
+
+import java.io.File;
+
+/**
+ * Contract for a reader that processes one input file on behalf of a
+ * {@link Dictionary}.
+ */
+public interface InputParser {
+
+  /**
+   * Parses the given file.
+   *
+   * @param file the input file to read
+   * @param dest the dictionary handed to the parser (presumably the target the
+   *        parsed entries are added to; no implementation here does so yet)
+   */
+  void parse(File file, Dictionary dest);
+
+  /** Placeholder implementation whose {@code parse} currently does nothing. */
+  public static class LineParser implements InputParser {
+    @Override
+    public void parse(final File file, final Dictionary dest) {
+      // No-op.
+    }
+  }
+
+}
+++ /dev/null
-package com.hughes.android.dictionary;\r
-\r
-public final class StringUtil {\r
-\r
- public static String longestCommonSubstring(final String s1, final String s2) {\r
- for (int i = 0; i < s1.length() && i < s2.length(); i++) {\r
- if (s1.charAt(i) != s2.charAt(i)) {\r
- return s1.substring(0, i);\r
- }\r
- }\r
- return s1.length() < s2.length() ? s1 : s2;\r
- }\r
-\r
-}\r
--- /dev/null
+package com.hughes.android.dictionary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import com.hughes.util.MapUtil;
+import com.hughes.util.StringUtil;
+
+/**
+ * SAX handler for a Wiktionary XML dump whose pages use German field markers
+ * ("Wortart", "Bedeutungen", ...).
+ *
+ * <p>For each {@code <page>} element the title and text are buffered; when the
+ * page ends, the text is cut into "Wortart" sections and the fields recognised
+ * in each section are printed to stdout.
+ *
+ * <p>NOTE(review): the {@link Dictionary} passed to the constructor is stored
+ * but nothing in this class writes to it yet.
+ */
+public class WiktionaryXmlParser extends org.xml.sax.helpers.DefaultHandler {
+
+  final Dictionary dict;
+
+  // Buffers for the page currently being read; re-created at every <page>.
+  StringBuilder titleBuilder;
+  StringBuilder textBuilder;
+  // Buffer that receives character data right now, or null to discard it.
+  StringBuilder currentBuilder = null;
+
+  public WiktionaryXmlParser(final Dictionary dict) {
+    this.dict = dict;
+  }
+
+  /**
+   * Chooses the buffer for the element being opened: {@code <title>} and
+   * {@code <text>} are captured, everything else is discarded. A new
+   * {@code <page>} also resets both page buffers.
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName,
+      Attributes attributes) {
+    currentBuilder = null;
+    if ("page".equals(qName)) {
+      titleBuilder = new StringBuilder();
+      textBuilder = new StringBuilder();
+    } else if ("title".equals(qName)) {
+      currentBuilder = titleBuilder;
+    } else if ("text".equals(qName)) {
+      currentBuilder = textBuilder;
+    }
+  }
+
+  /** Appends character data to the active buffer, if there is one. */
+  @Override
+  public void characters(char[] ch, int start, int length) throws SAXException {
+    if (currentBuilder != null) {
+      currentBuilder.append(ch, start, length);
+    }
+  }
+
+  /** Stops capturing and, at the end of a {@code <page>}, processes the page. */
+  @Override
+  public void endElement(String uri, String localName, String qName)
+      throws SAXException {
+    currentBuilder = null;
+    if ("page".equals(qName)) {
+      endPage();
+    }
+  }
+
+  private static final Pattern NEWLINE = Pattern.compile("\n", Pattern.LITERAL);
+
+  // Start of a section header such as "=== {{Wortart|". No ^ anchor is used,
+  // so the MULTILINE flag is not needed here.
+  private static final Pattern SECTION_HEADER = Pattern
+      .compile("=== *\\{\\{Wortart\\|");
+
+  private static final Pattern WORTART_DELIM = Pattern.compile("===",
+      Pattern.LITERAL);
+  private static final Pattern GENDER = Pattern.compile("\\{\\{([mfn])\\}\\}");
+
+  private static final Pattern WIKI_QUOTE = Pattern.compile("''",
+      Pattern.LITERAL);
+  private static final Pattern WIKI_DOUBLE_BRACE = Pattern
+      .compile("\\{\\{([^}]+)\\}\\}");
+  private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
+      .compile("\\[\\[([^\\]]+)\\]\\]");
+  // NOTE(review): currently unused. It anchors on ^ without MULTILINE, so as
+  // written it can only match at the very start of the input.
+  private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=");
+
+  // Delimiters of wiki comments, compiled once instead of once per page.
+  private static final Pattern COMMENT_START = Pattern.compile("<!--",
+      Pattern.LITERAL);
+  private static final Pattern COMMENT_END = Pattern.compile("-->",
+      Pattern.LITERAL);
+
+  /**
+   * Fields extracted from a section. {@code listPattern} is the marker that
+   * opens the field's list inside the section text; it is null for the two
+   * fields that {@code endPage} extracts by hand.
+   */
+  enum Field {
+    Wortart("Wortart", null),
+    Aussprache("Aussprache", null),
+    Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")),
+    Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")),
+    Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")),
+    Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")),
+    Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")),
+    Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")),
+    Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")),
+    CharakteristischeWortkombinationen("Charakteristische Wortkombinationen",
+        Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")),
+    AbgeleiteteBegriffe("Abgeleitete Begriffe",
+        Pattern.compile("\\{\\{Abgeleitete Begriffe\\}\\}")),
+    Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}"));
+
+    final String name;
+    final Pattern listPattern;
+
+    Field(final String name, final Pattern listPattern) {
+      this.name = name;
+      this.listPattern = listPattern;
+    }
+  }
+
+  private static final Pattern WORTART = Pattern
+      .compile("\\{\\{Wortart\\|([^}]+)\\|([^}]+)\\}\\}");
+  // LITERAL: the '|' is matched as a plain character, not as alternation.
+  private static final Pattern AUSSPRACHE = Pattern.compile(":Hilfe:IPA|IPA:",
+      Pattern.LITERAL);
+
+  // Number of sections skipped, keyed by the reason; printed by parse().
+  private final Map<String, AtomicInteger> errorCounts = new TreeMap<String, AtomicInteger>();
+
+  /**
+   * Processes the buffered text of the page that just ended: strips wiki
+   * quote/link markup and comments, then repeatedly cuts out one "Wortart"
+   * section, extracts its fields and prints them to stdout.
+   *
+   * <p>Assumes {@code StringUtil.remove} cuts the matched span out of the
+   * builder and returns it, or returns null when nothing matches -- TODO
+   * confirm against com.hughes.util.StringUtil.
+   */
+  private void endPage() {
+
+    StringBuilder text = textBuilder;
+    text = new StringBuilder(WIKI_QUOTE.matcher(text).replaceAll("\""));
+    text = new StringBuilder(WIKI_DOUBLE_BRACKET.matcher(text).replaceAll("$1"));
+
+    // Remove comments.
+    StringUtil.removeAll(text, COMMENT_START, COMMENT_END);
+
+    String sectionString;
+    while ((sectionString = StringUtil.remove(text, SECTION_HEADER,
+        SECTION_HEADER, false)) != null) {
+      final StringBuilder section = new StringBuilder(sectionString);
+
+      String wortart = StringUtil.remove(section, WORTART_DELIM, WORTART_DELIM,
+          true);
+      // A missing header is counted like any other invalid one instead of
+      // failing with a NullPointerException.
+      if (wortart == null || wortart.contains("\n")
+          || !wortart.contains("eutsch")) {
+        MapUtil.safeGet(errorCounts, "Invalid wortart: " + wortart,
+            AtomicInteger.class).incrementAndGet();
+        continue;
+      }
+
+      final LinkedHashMap<Field, List<String>> fieldToValue = new LinkedHashMap<Field, List<String>>();
+
+      wortart = wortart.replaceAll("===", "");
+      wortart = WORTART.matcher(wortart).replaceAll("$1");
+      wortart = GENDER.matcher(wortart).replaceAll("{$1}");
+      wortart = WIKI_DOUBLE_BRACE.matcher(wortart).replaceAll("$1");
+      wortart = wortart.replaceAll("Wortart\\|", "");
+      wortart = wortart.trim();
+      fieldToValue.put(Field.Wortart, Collections.singletonList(wortart));
+
+      String aussprache = StringUtil
+          .remove(section, AUSSPRACHE, NEWLINE, false);
+      if (aussprache != null) {
+        aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
+        aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
+        aussprache = aussprache.replaceAll("Lautschrift\\|", "");
+        aussprache = aussprache.trim();
+        fieldToValue.put(Field.Aussprache, Collections
+            .singletonList(aussprache));
+      }
+
+      for (final Field field : Field.values()) {
+        if (field.listPattern != null) {
+          fieldToValue.put(field, extractList(section, field.listPattern));
+        }
+      }
+
+      System.out.println(titleBuilder);
+      for (final Field field : Field.values()) {
+        final List<String> values = fieldToValue.get(field);
+        if (values == null) {
+          // Aussprache is only stored when a pronunciation line was found, so
+          // a field can be absent from the map altogether.
+          continue;
+        }
+        if (values.isEmpty()) {
+          fieldToValue.remove(field);
+        } else {
+          System.out.println(field.name);
+          for (final String line : values) {
+            System.out.println("  " + line);
+          }
+        }
+      }
+      System.out.println("WHAT'S LEFT:");
+      System.out.println(section);
+      System.out.println("------------------------------------------------");
+
+    }
+
+  }
+
+  /**
+   * Cuts the list that begins at {@code start} out of {@code section} and
+   * returns its non-empty lines, each with one leading ':' removed and
+   * trimmed. The first line of the cut span (the one holding the marker) is
+   * skipped.
+   *
+   * @param section section text; the extracted span is removed from it
+   * @param start marker pattern that opens the list
+   * @return the list items, empty when the marker is not present
+   */
+  private List<String> extractList(final StringBuilder section,
+      final Pattern start) {
+    final List<String> result = new ArrayList<String>();
+    final String linesString = StringUtil.remove(section, start,
+        WIKI_DOUBLE_BRACE, false);
+    if (linesString != null) {
+      String[] lines = linesString.split("\n");
+      for (int i = 1; i < lines.length; ++i) {
+        String item = lines[i];
+        item = item.replaceFirst("^:", "");
+        item = item.trim();
+        if (item.length() > 0) {
+          result.add(item);
+        }
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Returns the part of {@code input} from the first match of {@code start}
+   * up to (not including) the next match of {@code end} that begins at or
+   * after the end of the start match, or up to the end of the input when
+   * {@code end} does not match there. Returns null when {@code start} does
+   * not match at all. Not called by the code in this class at present.
+   */
+  private static CharSequence getSection(CharSequence input, Pattern start,
+      Pattern end) {
+    Matcher startMatcher = start.matcher(input);
+    if (!startMatcher.find()) {
+      return null;
+    }
+    Matcher endMatcher = end.matcher(input);
+    if (!endMatcher.find(startMatcher.end())) {
+      return input.subSequence(startMatcher.start(), input.length());
+    }
+    return input.subSequence(startMatcher.start(), endMatcher.start());
+  }
+
+  /**
+   * Runs a SAX parse of {@code file} with this object as the handler, then
+   * prints the per-reason counts of skipped sections.
+   *
+   * @param file the XML dump to read
+   * @throws ParserConfigurationException if no SAX parser can be created
+   * @throws SAXException if the XML cannot be parsed
+   * @throws IOException if the file cannot be read
+   */
+  void parse(final File file) throws ParserConfigurationException,
+      SAXException, IOException {
+    final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
+    parser.parse(file, this);
+    System.out.println(errorCounts);
+  }
+
+}