public static void main(final String[] args) throws Exception {
-
DictionaryBuilder.main(new String[] {
"--dictOut=dictOutputs/DE-EN_chemnitz.quickdic",
"--lang1=DE",
"--input1Format=chemnitz",
});
+
Lang[] langs1 = new Lang[] {
new Lang("^English$", "EN"),
new Lang("^German$", "DE"),
String.format("--lang2=%s", lang2.code),
String.format("--dictInfo=(EN)Wikitionary-based %s-%s dictionary", lang1.code, lang2.code),
- "--input1=dictInputs/enwiktionary-20101015-pages-articles",
+ "--input1=dictInputs/enwiktionary-20110205-pages-articles.xml",
"--input1Name=enwiktionary",
"--input1Format=enwiktionary",
String.format("--input1TranslationPattern1=%s", lang1.nameRegex),
+ // Template names grouped for special handling. The code that consumes this set is not
+ // visible in this hunk; presumably the template's remaining arguments are emitted as
+ // plain text for these names -- TODO confirm against the template handler.
static final Set<String> useRemainingArgTemplates = new LinkedHashSet<String>(Arrays.asList(
"Arab", "Cyrl", "fa-Arab", "italbrac", "Khmr", "ku-Arab", "IPAchar", "Laoo",
"sd-Arab", "Thai", "ttbc", "unicode", "ur-Arab", "yue-yue-j", "zh-ts",
- "zh-tsp", "zh-zh-p"));
- static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList(""));
- static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf"));
+ "zh-tsp", "zh-zh-p", "ug-Arab", "ko-inline", "Jpan", "Kore", "rfscript", "Latinx"));
+ // Templates that are dropped: the template handler returns immediately when the
+ // template name is contained in this set.
+ static final Set<String> ignoreTemplates = new LinkedHashSet<String>(Arrays.asList("audio", "rhymes", "hyphenation", "homophones", "wikipedia", "rel-top", "rel-bottom", "sense", "wikisource1911Enc", "g"));
+ // Templates rendered as the template name followed by "."; the handler asserts that
+ // they carry exactly one positional argument and no named arguments.
+ // NOTE(review): "pf." already ends with a period, so the appended "." would yield
+ // "pf.." -- confirm that this is intended.
+ static final Set<String> grammarTemplates = new LinkedHashSet<String>(Arrays.asList("impf", "pf", "pf.", "indeclinable"));
+ // Holds only the entry "zzzzzzzzzzzzzzz"; its use is not visible in this hunk --
+ // presumably a placeholder that matches no real template; TODO confirm.
static final Set<String> passThroughTemplates = new LinkedHashSet<String>(Arrays.asList("zzzzzzzzzzzzzzz"));
@Override
return;
}
- if (name.equals("audio") || name.equals("rhymes") || name.equals("hyphenation")) {
+ if (ignoreTemplates.contains(name)) {
return;
}
wikiBuilder.append("sg.");
} else if (grammarTemplates.contains(name)) {
- assert positionalArgs.size() == 1 && namedArgs.isEmpty();
+ assert positionalArgs.size() == 1 && namedArgs.isEmpty() : positionalArgs.toString() + namedArgs;
wikiBuilder.append(name).append(".");
} else if (name.equals("l")) {
@Override
public void onUnterminated(String start, String rest) {
- throw new RuntimeException(start + rest);
+ // Write a diagnostic line (title, opening token, remaining text) to stderr
+ // instead of throwing, so the caller keeps running.
+ final String report = String.format("OnUnterminated: %s %s %s\n", title, start, rest);
+ System.err.print(report);
}
@Override
public void onInvalidHeaderEnd(String rest) {
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import com.hughes.util.StringUtil;
+
public class WikiParser {
private static final Pattern markup = Pattern.compile("$|''|\\{\\{|\\[\\[|(==+)\\s*$|<!--|<pre>", Pattern.MULTILINE);
callback.onFormatItalic(italicOn);
rest = rest.substring(2);
} else if (rest.startsWith("{{")) {
- int end = rest.indexOf("}}");
+ int end = StringUtil.nestedIndexOf(rest, 2, "{{", "}}");
if (end == -1) {
callback.onUnterminated("{{", rest);
- return;
+ end = StringUtil.safeIndexOf(rest, "\n") - 2;
}
final String template = rest.substring(2, end).trim();
- //todo: this doesn't work. can't split pipes inside [[asdf|asdf]]
final List<String> templateArray = new ArrayList<String>();
contextSensitivePipeSplit(template, templateArray);
positionalArgs.clear();
int end = rest.indexOf("]]");
if (end == -1) {
callback.onUnterminated("[[", rest);
- return;
+ end = StringUtil.safeIndexOf(rest, "\n") - 2;
}
final String wikiLink = rest.substring(2, end);
final String[] args = pipeSplit.split(wikiLink);
int end = rest.indexOf("-->");
if (end == -1) {
callback.onUnterminated("<!--", rest);
- return;
+ end = StringUtil.safeIndexOf(rest, "\n") - 3;
}
callback.onComment(rest.substring(4, end));
rest = rest.substring(end + 3);
int end = rest.indexOf("</pre>");
if (end == -1) {
callback.onUnterminated("<pre>", rest);
- return;
+ end = StringUtil.safeIndexOf(rest, "\n") - 6;
}
callback.onText(rest.substring(5, end));
rest = rest.substring(end + 6);
}
}
- private static final Pattern openBracketOrPipe = Pattern.compile("($)|(\\[\\[)|(\\s*\\|\\s*)");
+ /**
+  * Splits {@code template} on every '|' that is not enclosed in a [[...]] or
+  * {{...}} group and appends each trimmed piece to {@code result}. The text after
+  * the last top-level '|' (or the whole string when there is none) is always
+  * appended, so at least one element is added per call.
+  *
+  * Openers and closers are only counted, not matched by kind.
+  *
+  * @param template text to split
+  * @param result list that receives the trimmed pieces in order
+  * @throws RuntimeException if a "]]" or "}}" occurs with no opener before it
+  */
private static void contextSensitivePipeSplit(String template, final List<String> result) {
- StringBuilder builder = new StringBuilder();
- while (template.length() > 0) {
- final Matcher matcher = openBracketOrPipe.matcher(template);
- if (matcher.find()) {
- // append to the match.
- builder.append(template.substring(0, matcher.start()));
- if (matcher.group(2) != null) { // [[
- // append to the close ]].
- final int closeIndex = template.indexOf("]]", matcher.end());
- builder.append(template.substring(matcher.start(), closeIndex + 2));
- template = template.substring(closeIndex + 2);
- } else if (matcher.group(3) != null) { // |
- result.add(builder.toString());
- builder = new StringBuilder();
- template = template.substring(matcher.end());
- } else {
- template = template.substring(matcher.start());
- assert template.length() == 0 : template;
+ // Number of [[ / {{ openers not yet closed; a '|' separates pieces only at depth 0.
+ int depth = 0;
+ // Index at which the piece currently being scanned begins.
+ int lastStart = 0;
+ // Scan from index 0. Starting at 1 skipped an opener in the first position, so its
+ // closer drove depth below zero and threw; a leading '|' was not split either.
+ for (int i = 0; i < template.length(); ) {
+ if (template.charAt(i) == '|' && depth == 0) {
+ final String s = template.substring(lastStart, i);
+ result.add(s.trim());
+ ++i;
+ lastStart = i;
+ } else if (template.startsWith("[[", i) || template.startsWith("{{", i)) {
+ ++depth;
+ i += 2;
+ } else if (template.startsWith("]]", i) || template.startsWith("}}", i)) {
+ --depth;
+ if (depth < 0) {
+ throw new RuntimeException("too many closings: " + template);
}
+ i += 2;
} else {
- assert false;
+ ++i;
}
}
- result.add(builder.toString());
+ result.add(template.substring(lastStart).trim());
}
// ------------------------------------------------------------------------
@Override
public void onUnterminated(String start, String rest) {
- throw new RuntimeException(start + rest);
+ // Print the opening token and the remaining text to stderr instead of throwing.
+ final String message = String.format("onUnterminated: %s, %s\n", start, rest);
+ System.err.print(message);
}
@Override
"here's a {{template|this has an = sign|blah=2|blah2=3|" + "\n" +
"blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}} and some more text." + "\n" +
"== Header 2 ==" + "\n" +
+ "{{some-func|blah={{nested-func|n2}}|blah2=asdf}}" + "\n" +
+ "{{unterminated}" + "\n" +
// "==== Header 4 ====" + "\n" +
// "===== Header 5 =====" + "\n" +
"=== {{header-template}} ===" + "\n";
"template:[template, this has an = sign]{blah=2, blah2=3, blah3=3,[[asdf]|[asdf asdf]|[asdf asdf asdf]],blah4=4}\n" +
" and some more text.\n" +
"HEADER Header 2 \n" +
+ " \n" +
+ "template:[some-func]{blah={{nested-func|n2}}, blah2=asdf}\n" +
+ " \n" +
+ "template:[unterminate]{}" + "\n" +
"\n" +
"HEADER \n" +
"template:[header-template]{}\n" +
@Override
public void onUnterminated(String start, String rest) {
- throw new RuntimeException("bad");
+ // Intentionally a no-op: this callback used to throw RuntimeException("bad") and now ignores the event.
}
@Override