System.err.println("WARNING: couldn't parse arguments: " + keyValueArgs);\r
}\r
\r
- createIndex(dict, Entry.LANG1);\r
- createIndex(dict, Entry.LANG2);\r
+ createIndex(dict, SimpleEntry.LANG1);\r
+ createIndex(dict, SimpleEntry.LANG2);\r
\r
System.out.println("Writing dictionary.");\r
final RandomAccessFile dictOut = new RandomAccessFile(dictOutFilename, "rw");\r
for (byte lang = 0; lang < 2; ++lang) {\r
final LanguageData languageData = dict.languageDatas[lang];\r
System.out.println("\nRandom words for: " + languageData.language.getSymbol());\r
- for (int i = 0; i < 10; ++i) {\r
+ for (int i = 0; i < 20; ++i) {\r
final int w = random.nextInt(languageData.sortedIndex.size());\r
final IndexEntry entry = languageData.sortedIndex.get(w);\r
final List<Row> rows = languageData.rows;\r
continue;\r
}\r
\r
- final Entry entry = Entry.parseFromLine(line, hasMultipleSubentries);\r
+ final SimpleEntry entry = SimpleEntry.parseFromLine(line, hasMultipleSubentries);\r
if (entry == null) {\r
System.err.println("Invalid entry: " + line);\r
continue;\r
final Map<String, TokenData> tokenToData = new TreeMap<String, TokenData>(dict.languageDatas[lang].language.sortComparator);\r
\r
for (int e = 0; e < dict.entries.size(); ++e) {\r
- final Entry entry = dict.entries.get(e);\r
+ final SimpleEntry entry = dict.entries.get(e);\r
final Set<String> tokens = entry.getIndexableTokens(lang);\r
for (final String token : tokens) {\r
TokenData tokenData = tokenToData.get(token);\r
\r
static final class TokenEntryData implements Comparable<TokenEntryData> {\r
final String token;\r
- final Entry entry;\r
+ final SimpleEntry entry;\r
final int entryIndex;\r
\r
private static final int bigNoOverflow = 100000;\r
int minSubEntryLength = bigNoOverflow;\r
int minSubEntry = bigNoOverflow;\r
\r
- public TokenEntryData(final byte lang, final String token, final Entry entry, final int entryIndex) {\r
+ public TokenEntryData(final byte lang, final String token, final SimpleEntry entry, final int entryIndex) {\r
this.token = token;\r
this.entry = entry;\r
this.entryIndex = entryIndex;\r
file.deleteOnExit();\r
\r
// final Dictionary goldenDict;\r
- final List<Entry> entries = Arrays.asList(\r
- Entry.parseFromLine("der Hund :: the dog", false),\r
- Entry.parseFromLine("Die grosse Katze :: The big cat", false), \r
- Entry.parseFromLine("die Katze :: the cat", false),\r
- Entry.parseFromLine("gross :: big", false),\r
- Entry.parseFromLine("Dieb :: thief", false),\r
- Entry.parseFromLine("rennen :: run", false));\r
+ final List<SimpleEntry> entries = Arrays.asList(\r
+ SimpleEntry.parseFromLine("der Hund :: the dog", false),\r
+ SimpleEntry.parseFromLine("Die grosse Katze :: The big cat", false), \r
+ SimpleEntry.parseFromLine("die Katze :: the cat", false),\r
+ SimpleEntry.parseFromLine("gross :: big", false),\r
+ SimpleEntry.parseFromLine("Dieb :: thief", false),\r
+ SimpleEntry.parseFromLine("rennen :: run", false));\r
\r
{\r
final Dictionary dict = new Dictionary("test", Language.de, Language.en);\r
dict.entries.addAll(entries);\r
- DictionaryBuilder.createIndex(dict, Entry.LANG1);\r
- DictionaryBuilder.createIndex(dict, Entry.LANG2);\r
+ DictionaryBuilder.createIndex(dict, SimpleEntry.LANG1);\r
+ DictionaryBuilder.createIndex(dict, SimpleEntry.LANG2);\r
final RandomAccessFile raf = new RandomAccessFile(file, "rw");\r
dict.write(raf);\r
raf.close();\r
\r
public void testTextNorm() throws IOException {\r
System.out.println("\n\ntestTextNorm");\r
- final List<Entry> entries = Arrays.asList(\r
- Entry.parseFromLine("Hund {m} :: dog", true),\r
- Entry.parseFromLine("'CHRISTOS' :: doh", true),\r
- Entry.parseFromLine("\"Pick-up\"-Presse {f} :: baler", true),\r
- Entry.parseFromLine("(Ach was), echt? [auch ironisch] :: No shit! [also ironic]", true),\r
- Entry.parseFromLine("(akuter) Myokardinfarkt {m} <AMI / MI> :: (acute) myocardial infarction <AMI / MI>", true),\r
- Entry.parseFromLine("(reine) Vermutung {f} :: guesswork", true),\r
- Entry.parseFromLine("(mit) 6:1 vorne liegen :: to be 6-1 up [football]", true),\r
- Entry.parseFromLine("(auf) den Knopf drücken [auch fig.: auslösen] :: to push the button [also fig.: initiate]", false),\r
- Entry.parseFromLine("Adjektiv {n} /Adj./; Eigenschaftswort {n} [gramm.] | Adjektive {pl}; Eigenschaftswoerter {pl} :: adjective /adj./ | adjectives", true),\r
- Entry.parseFromLine("Älteste {m,f}; Ältester :: oldest; eldest", true),\r
- Entry.parseFromLine("\"...\", schloss er an. :: '...,' he added.", true),\r
- Entry.parseFromLine("besonderer | besondere | besonderes :: extra", false),\r
- Entry.parseFromLine("| zu Pferde; zu Pferd | reiten :: horseback | on horseback | go on horseback", true),\r
- Entry.parseFromLine("Hauptaugenmerk {m} | sein Hauptaugenmerk richten auf :: | to focus (one's) attention on", true),\r
- Entry.parseFromLine("σ-Algebra {f} :: σ-field", true)\r
+ final List<SimpleEntry> entries = Arrays.asList(\r
+ SimpleEntry.parseFromLine("Hund {m} :: dog", true),\r
+ SimpleEntry.parseFromLine("'CHRISTOS' :: doh", true),\r
+ SimpleEntry.parseFromLine("\"Pick-up\"-Presse {f} :: baler", true),\r
+ SimpleEntry.parseFromLine("(Ach was), echt? [auch ironisch] :: No shit! [also ironic]", true),\r
+ SimpleEntry.parseFromLine("(akuter) Myokardinfarkt {m} <AMI / MI> :: (acute) myocardial infarction <AMI / MI>", true),\r
+ SimpleEntry.parseFromLine("(reine) Vermutung {f} :: guesswork", true),\r
+ SimpleEntry.parseFromLine("(mit) 6:1 vorne liegen :: to be 6-1 up [football]", true),\r
+ SimpleEntry.parseFromLine("(auf) den Knopf drücken [auch fig.: auslösen] :: to push the button [also fig.: initiate]", false),\r
+ SimpleEntry.parseFromLine("Adjektiv {n} /Adj./; Eigenschaftswort {n} [gramm.] | Adjektive {pl}; Eigenschaftswoerter {pl} :: adjective /adj./ | adjectives", true),\r
+ SimpleEntry.parseFromLine("Älteste {m,f}; Ältester :: oldest; eldest", true),\r
+ SimpleEntry.parseFromLine("\"...\", schloss er an. :: '...,' he added.", true),\r
+ SimpleEntry.parseFromLine("besonderer | besondere | besonderes :: extra", false),\r
+ SimpleEntry.parseFromLine("| zu Pferde; zu Pferd | reiten :: horseback | on horseback | go on horseback", true),\r
+ SimpleEntry.parseFromLine("Hauptaugenmerk {m} | sein Hauptaugenmerk richten auf :: | to focus (one's) attention on", true),\r
+ SimpleEntry.parseFromLine("σ-Algebra {f} :: σ-field", true)\r
);\r
\r
assertFalse(entries.contains(null));\r
\r
final Dictionary dict = new Dictionary("test", Language.de, Language.en);\r
dict.entries.addAll(entries);\r
- DictionaryBuilder.createIndex(dict, Entry.LANG1);\r
- DictionaryBuilder.createIndex(dict, Entry.LANG2);\r
+ DictionaryBuilder.createIndex(dict, SimpleEntry.LANG1);\r
+ DictionaryBuilder.createIndex(dict, SimpleEntry.LANG2);\r
\r
for (int lang = 0; lang <= 1; lang++) {\r
final LanguageData languageData = dict.languageDatas[lang];\r
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
-import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
.compile("\\{\\{([^}]+)\\}\\}");
private static final Pattern WIKI_DOUBLE_BRACKET = Pattern
.compile("\\[\\[([^\\]]+)\\]\\]");
- private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=");
+ private static final Pattern WIKI_NEW_SECTION = Pattern.compile("^\\{\\{([^}]+)\\}\\}|^=", Pattern.MULTILINE);
enum Field {
- Wortart("Wortart", null), Aussprache("Aussprache", null), Bedeutungen(
- "Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")), Synonome(
- "Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")), Gegenworte(
- "Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")), Oberbegriffe(
- "Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")), Unterbegriffe(
- "Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")), Beispiele(
- "Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")), Redewendungen(
- "Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")), CharakteristischeWortkombinationen(
- "Charakteristische Wortkombinationen", Pattern
- .compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")), AbgeleiteteBegriffe(
- "Abgeleitete Begriffe", Pattern
- .compile("\\{\\{Abgeleitete Begriffe\\}\\}")), Herkunft("Herkunft",
- Pattern.compile("\\{\\{Herkunft\\}\\}"));
+ Wortart("Wortart", null),
+
+ Aussprache("Aussprache", null),
+
+ Bedeutungen("Bedeutungen", Pattern.compile("\\{\\{Bedeutungen\\}\\}")),
+
+ Verkleinerungsformen("Verkleinerungsformen", Pattern.compile("\\{\\{Verkleinerungsformen\\}\\}")),
+
+ Synonome("Synonyme", Pattern.compile("\\{\\{Synonyme\\}\\}")),
+
+ Gegenworte("Gegenworte", Pattern.compile("\\{\\{Gegenworte\\}\\}")),
+
+ Oberbegriffe("Oberbegriffe", Pattern.compile("\\{\\{Oberbegriffe\\}\\}")),
+
+ Unterbegriffe("Unterbegriffe", Pattern.compile("\\{\\{Unterbegriffe\\}\\}")),
+
+ Beispiele("Beispiele", Pattern.compile("\\{\\{Beispiele\\}\\}")),
+
+ Redewendungen("Redewendungen", Pattern.compile("\\{\\{Redewendungen\\}\\}")),
+
+ CharakteristischeWortkombinationen("Charakteristische Wortkombinationen",
+ Pattern.compile("\\{\\{Charakteristische Wortkombinationen\\}\\}")),
+
+ AbgeleiteteBegriffe("Abgeleitete Begriffe", Pattern
+ .compile("\\{\\{Abgeleitete Begriffe\\}\\}")),
+
+ Herkunft("Herkunft", Pattern.compile("\\{\\{Herkunft\\}\\}")),
+
+ Silbentrennung(null, Pattern.compile("\\{\\{Silbentrennung\\}\\}")),
+
+ ;
final String name;
final Pattern listPattern;
if (aussprache != null) {
aussprache = AUSSPRACHE.matcher(aussprache).replaceFirst("");
aussprache = WIKI_DOUBLE_BRACE.matcher(aussprache).replaceAll("$1");
- aussprache = aussprache.replaceAll("Lautschrift\\|", "");
+ aussprache = aussprache.replaceAll("Lautschrift\\|ˈ?", "");
aussprache = aussprache.trim();
fieldToValue.put(Field.Aussprache, Collections
.singletonList(aussprache));
System.out.println(titleBuilder);
for (final Field field : Field.values()) {
- if (fieldToValue.get(field).isEmpty()) {
+ if (!fieldToValue.containsKey(field) || fieldToValue.get(field).isEmpty()) {
fieldToValue.remove(field);
} else {
- System.out.println(field.name);
- for (final String line : fieldToValue.get(field)) {
- System.out.println(" " + line);
+ if (field.name != null) {
+// System.out.println(field.name);
+// for (final String line : fieldToValue.get(field)) {
+// System.out.println(" " + line);
+// }
}
}
}
- System.out.println("WHAT'S LEFT:");
- System.out.println(section);
- System.out.println("------------------------------------------------");
+// System.out.println("WHAT'S LEFT:");
+// System.out.println(section);
+// System.out.println("------------------------------------------------");
}
- // System.out.println(titleBuilder);
- /*
- * final List<String> pronunciations = new ArrayList<String>(); final
- * CharSequence pronunciationSeq = getSection(text, PRONUNCIATION,
- * SECTION_START); if (pronunciationSeq != null) { final Matcher
- * pronunciationMatcher = PRONUNCIATION_EXAMPLE.matcher(pronunciationSeq);
- * while (pronunciationMatcher.find()) {
- * pronunciations.add(pronunciationMatcher.group(1)); }
- * System.out.println("PRONUNCIATIONS:" + pronunciations); }
- *
- * String[] meanings = null; final CharSequence meaningsSeq =
- * getSection(text, MEANINGS, SECTION_START); if (meaningsSeq != null) {
- * meanings = LIST.split(meaningsSeq); meanings[0] = "";
- * System.out.println("MEANINGS:" + Arrays.toString(meanings)); }
- *
- * System.out.println(text);
- */
-
}
private List<String> extractList(final StringBuilder section,
final Pattern start) {
final List<String> result = new ArrayList<String>();
final String linesString = StringUtil.remove(section, start,
- WIKI_DOUBLE_BRACE, false);
+ WIKI_NEW_SECTION, false);
if (linesString != null) {
String[] lines = linesString.split("\n");
for (int i = 1; i < lines.length; ++i) {
String bedeutung = lines[i];
- bedeutung = bedeutung.replaceFirst("^:", "");
+ bedeutung = bedeutung.replaceFirst("^:+", "");
bedeutung = bedeutung.trim();
if (bedeutung.length() > 0) {
result.add(bedeutung);
return result;
}
- private static CharSequence getSection(CharSequence input, Pattern start,
- Pattern end) {
- Matcher startMatcher = start.matcher(input);
- if (!startMatcher.find()) {
- return null;
- }
- Matcher endMatcher = end.matcher(input);
- if (!endMatcher.find(startMatcher.end())) {
- return input.subSequence(startMatcher.start(), input.length());
- }
- return input.subSequence(startMatcher.start(), endMatcher.start());
- }
-
void parse(final File file) throws ParserConfigurationException,
SAXException, IOException {
final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();