parser.parse(new BufferedInputStream(in), this);
}
} catch (Exception e) {
- System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+ System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
throw e;
}
private void endPage() {
final String title = titleBuilder.toString();
lastPageTitle = title;
- if (++pageCount % 1000 == 0) {
+ if (++pageCount % 100000 == 0) {
System.out.println("endPage: " + title + ", count=" + pageCount);
}
- if (title.startsWith("Wiktionary:") ||
+ if (title.startsWith("Unsupported titles/")) return;
+ if (title.contains(":")) {
+ if (title.startsWith("Wiktionary:") ||
title.startsWith("Appendix:") ||
title.startsWith("Help:") ||
title.startsWith("Index:") ||
title.startsWith("Rhymes:") ||
title.startsWith("Category:") ||
title.startsWith("Wikisaurus:") ||
- title.startsWith("Unsupported titles/") ||
title.startsWith("Transwiki:") ||
title.startsWith("File:") ||
title.startsWith("Thread:") ||
title.startsWith("Plantilla:") ||
title.startsWith("Wikcionario:") ||
+ // PT
+ title.startsWith("Ajuda:") ||
+ title.startsWith("Apêndice:") ||
+ title.startsWith("Citações:") ||
+ title.startsWith("Portal:") ||
+ title.startsWith("Predefinição:") ||
+ title.startsWith("Vocabulário:") ||
+ title.startsWith("Wikcionário:") ||
+
// sentinel
false
- ) {
- return;
- }
- if (title.contains(":")) {
+ ) return;
if (!title.startsWith("Sign gloss:")) {
System.err.println("title with colon: " + title);
}
}
String text = textBuilder.toString();
+ // Workaround for the Spanish Wiktionary: rewrite the bare {{ES}} template as an explicit "== {{lengua|es}} ==" heading.
+ text = text.replace("{{ES}}", "== {{lengua|es}} ==");
String translingual = "";
int start = 0;
final Matcher startMatcher = headingStart.matcher(text);