import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
}
private void go() throws Exception {
- final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
// Configure things.
for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
parser.parse(new BufferedInputStream(in), this);
}
} catch (Exception e) {
- System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+ System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
throw e;
}
private void endPage() {
final String title = titleBuilder.toString();
lastPageTitle = title;
- if (++pageCount % 1000 == 0) {
+ if (++pageCount % 100000 == 0) {
System.out.println("endPage: " + title + ", count=" + pageCount);
}
- if (title.startsWith("Wiktionary:") ||
+ if (title.startsWith("Unsupported titles/")) return;
+ if (title.contains(":")) {
+ if (title.startsWith("Wiktionary:") ||
title.startsWith("Appendix:") ||
title.startsWith("Help:") ||
title.startsWith("Index:") ||
title.startsWith("Rhymes:") ||
title.startsWith("Category:") ||
title.startsWith("Wikisaurus:") ||
- title.startsWith("Unsupported titles/") ||
title.startsWith("Transwiki:") ||
title.startsWith("File:") ||
title.startsWith("Thread:") ||
title.startsWith("Template:") ||
title.startsWith("Summary:") ||
title.startsWith("Module:") ||
+ title.startsWith("Reconstruction:") ||
// DE
title.startsWith("Datei:") ||
title.startsWith("Verzeichnis:") ||
title.startsWith("Kategorie:") ||
title.startsWith("Hilfe:") ||
title.startsWith("Reim:") ||
+ title.startsWith("Modul:") ||
// FR:
title.startsWith("Annexe:") ||
title.startsWith("Catégori:") ||
title.startsWith("Aide:") ||
title.startsWith("Fichier:") ||
title.startsWith("Wiktionnaire:") ||
+ title.startsWith("Translations:Wiktionnaire:") ||
+ title.startsWith("Translations:Projet:") ||
title.startsWith("Catégorie:") ||
title.startsWith("Portail:") ||
title.startsWith("utiliusateur:") ||
title.startsWith("Kategorio:") ||
+ title.startsWith("Tutoriel:") ||
// IT
title.startsWith("Wikizionario:") ||
title.startsWith("Appendice:") ||
title.startsWith("Categoria:") ||
title.startsWith("Aiuto:") ||
title.startsWith("Portail:") ||
+ title.startsWith("Modulo:") ||
// ES
title.startsWith("Apéndice:") ||
title.startsWith("Archivo:") ||
title.startsWith("Plantilla:") ||
title.startsWith("Wikcionario:") ||
+ // PT
+ title.startsWith("Ajuda:") ||
+ title.startsWith("Apêndice:") ||
+ title.startsWith("Citações:") ||
+ title.startsWith("Portal:") ||
+ title.startsWith("Predefinição:") ||
+ title.startsWith("Vocabulário:") ||
+ title.startsWith("Wikcionário:") ||
+ title.startsWith("Módulo:") ||
+
// sentinel
false
- ) {
- return;
- }
- if (title.contains(":")) {
- if (!title.startsWith("Sign gloss:")) {
+ ) return;
+ // leave the Flexion: pages in for now and do not warn about them
+ if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
System.err.println("title with colon: " + title);
}
}
String text = textBuilder.toString();
+ // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
+ text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} ==");
String translingual = "";
+ int start = 0;
+ final Matcher startMatcher = headingStart.matcher(text);
- while (text.length() > 0) {
+ while (start < text.length()) {
// Find start.
- final Matcher startMatcher = headingStart.matcher(text);
- if (!startMatcher.find()) {
+ if (!startMatcher.find(start)) {
return;
}
- text = text.substring(startMatcher.end());
+ start = startMatcher.end();
final String heading = startMatcher.group();
- for (final Selector selector : currentSelectors) {
- if (heading.indexOf("Translingual") != -1) {
- // Find end.
- final int depth = startMatcher.group(1).length();
- final Pattern endPattern = getEndPattern(depth);
- final Matcher endMatcher = endPattern.matcher(text);
- if (endMatcher.find()) {
- int end = endMatcher.start();
- translingual = text.substring(0, endMatcher.start());
- text = text.substring(end);
- break;
- }
+ // For Translingual entries just store the text for later
+ // use in the per-language sections
+ if (heading.indexOf("Translingual") != -1) {
+ // Find end.
+ final int depth = startMatcher.group(1).length();
+ final Pattern endPattern = getEndPattern(depth);
+
+ final Matcher endMatcher = endPattern.matcher(text);
+ if (endMatcher.find(start)) {
+ int end = endMatcher.start();
+ translingual = text.substring(start, end);
+ start = end;
+ continue;
}
- if (selector.pattern.matcher(heading).find()) {
+ }
+ for (final Selector selector : currentSelectors) {
+ if (selector.pattern.matcher(heading).find()) {
// Find end.
final int depth = startMatcher.group(1).length();
final Pattern endPattern = getEndPattern(depth);
final Matcher endMatcher = endPattern.matcher(text);
final int end;
- if (endMatcher.find()) {
+ if (endMatcher.find(start)) {
end = endMatcher.start();
} else {
end = text.length();
}
- String sectionText = text.substring(0, end);
+ String sectionText = text.substring(start, end);
// Hack to remove empty dummy section from French
if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) {
int dummy_end = sectionText.indexOf("}}", 41) + 2;
throw new RuntimeException(e);
}
- text = text.substring(end);
+ start = end;
break;
}
}
/**
 * Parses the given file with a SAX parser, passing this object as the
 * handler argument of {@code SAXParser.parse}.
 *
 * A new parser is obtained on every call through the factory's
 * {@code newInstance().newSAXParser()} chain; no parser features are
 * configured here before parsing.
 *
 * @param file the file handed to the SAX parser
 * @throws ParserConfigurationException declared by this method; per the JAXP
 *         API it is raised by {@code newSAXParser()} when a parser cannot be
 *         created
 * @throws SAXException declared by this method; per the JAXP API it is
 *         raised on SAX errors while creating the parser or while parsing
 * @throws IOException declared by this method; per the JAXP API it is
 *         raised by {@code parse} on I/O errors
 */
public void parse(final File file) throws ParserConfigurationException,
SAXException, IOException {
- final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(file, this);
}