import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
// {{=uk=}}
// Spanish has no initial headings. We tried to also detect {{ES as a heading
// with "^(\\{\\{ES|(=+)[^=]).*$", but that broke English.
- static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+ static final Matcher headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE).matcher("");
+ static final Matcher startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}").matcher("");
- final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+ final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<>();
List<Selector> currentSelectors = null;
StringBuilder titleBuilder;
List<Selector> selectors;
for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
//if (!code.equals("fr")) {continue;}
- selectors = new ArrayList<WiktionarySplitter.Selector>();
+ selectors = new ArrayList<>();
pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
final String dir = String.format("data/inputs/wikiSplit/%s", code);
}
private void go() throws Exception {
- final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
// Configure things.
for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
parser.parse(new BufferedInputStream(in), this);
}
} catch (Exception e) {
- System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+ System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey());
throw e;
}
String lastPageTitle = null;
int pageCount = 0;
- Pattern endPatterns[] = new Pattern[100];
+ final Matcher[] endPatterns = new Matcher[100];
/**
 * Returns the cached regex object used to find the end of a section whose
 * heading starts with {@code depth} '=' characters, compiling it on first use.
 * The regex matches (in MULTILINE mode) a whole line that begins with between
 * 1 and {@code depth} '=' characters followed by a non-'=' character.
 * The result is stored in {@code endPatterns[depth]}; that cache is declared
 * with 100 slots, so {@code depth} must be less than 100.
 */
- private Pattern getEndPattern(int depth) {
+ private Matcher getEndPattern(int depth) {
// Compile lazily: each depth is built at most once and then reused.
if (endPatterns[depth] == null)
- endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+ endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher("");
return endPatterns[depth];
}
private void endPage() {
final String title = titleBuilder.toString();
lastPageTitle = title;
- if (++pageCount % 1000 == 0) {
+ if (++pageCount % 100000 == 0) {
System.out.println("endPage: " + title + ", count=" + pageCount);
}
- if (title.startsWith("Wiktionary:") ||
+ if (title.startsWith("Unsupported titles/")) return;
+ if (title.contains(":")) {
+ if (title.startsWith("Wiktionary:") ||
title.startsWith("Appendix:") ||
title.startsWith("Help:") ||
title.startsWith("Index:") ||
title.startsWith("Rhymes:") ||
title.startsWith("Category:") ||
title.startsWith("Wikisaurus:") ||
- title.startsWith("Unsupported titles/") ||
title.startsWith("Transwiki:") ||
title.startsWith("File:") ||
title.startsWith("Thread:") ||
title.startsWith("Template:") ||
title.startsWith("Summary:") ||
title.startsWith("Module:") ||
+ title.startsWith("Reconstruction:") ||
// DE
title.startsWith("Datei:") ||
title.startsWith("Verzeichnis:") ||
title.startsWith("Kategorie:") ||
title.startsWith("Hilfe:") ||
title.startsWith("Reim:") ||
+ title.startsWith("Modul:") ||
// FR:
title.startsWith("Annexe:") ||
title.startsWith("Catégori:") ||
title.startsWith("Aide:") ||
title.startsWith("Fichier:") ||
title.startsWith("Wiktionnaire:") ||
+ title.startsWith("Translations:Wiktionnaire:") ||
+ title.startsWith("Translations:Projet:") ||
title.startsWith("Catégorie:") ||
title.startsWith("Portail:") ||
title.startsWith("utiliusateur:") ||
title.startsWith("Kategorio:") ||
+ title.startsWith("Tutoriel:") ||
// IT
title.startsWith("Wikizionario:") ||
title.startsWith("Appendice:") ||
title.startsWith("Categoria:") ||
title.startsWith("Aiuto:") ||
title.startsWith("Portail:") ||
+ title.startsWith("Modulo:") ||
// ES
title.startsWith("Apéndice:") ||
title.startsWith("Archivo:") ||
title.startsWith("Plantilla:") ||
title.startsWith("Wikcionario:") ||
+ // PT
+ title.startsWith("Ajuda:") ||
+ title.startsWith("Apêndice:") ||
+ title.startsWith("Citações:") ||
+ title.startsWith("Portal:") ||
+ title.startsWith("Predefinição:") ||
+ title.startsWith("Vocabulário:") ||
+ title.startsWith("Wikcionário:") ||
+ title.startsWith("Módulo:") ||
+
// sentinel
false
- ) {
- return;
- }
- if (title.contains(":")) {
- if (!title.startsWith("Sign gloss:")) {
+ ) return;
+ // leave the Flexion: pages in for now and do not warn about them
+ if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
System.err.println("title with colon: " + title);
}
}
String text = textBuilder.toString();
+ // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
+ text = startSpanish.reset(text).replaceAll("== {{lengua|es}} ==");
String translingual = "";
int start = 0;
- final Matcher startMatcher = headingStart.matcher(text);
+ headingStart.reset(text);
while (start < text.length()) {
// Find start.
- if (!startMatcher.find(start)) {
+ if (!headingStart.find(start)) {
return;
}
- start = startMatcher.end();
+ start = headingStart.end();
- final String heading = startMatcher.group();
+ final String heading = headingStart.group();
// For Translingual entries just store the text for later
// use in the per-language sections
- if (heading.indexOf("Translingual") != -1) {
+ if (heading.contains("Translingual")) {
// Find end.
- final int depth = startMatcher.group(1).length();
- final Pattern endPattern = getEndPattern(depth);
+ final int depth = headingStart.group(1).length();
+ final Matcher endMatcher = getEndPattern(depth).reset(text);
- final Matcher endMatcher = endPattern.matcher(text);
if (endMatcher.find(start)) {
int end = endMatcher.start();
translingual = text.substring(start, end);
}
for (final Selector selector : currentSelectors) {
- if (selector.pattern.matcher(heading).find()) {
+ if (selector.pattern.reset(heading).find()) {
// Find end.
- final int depth = startMatcher.group(1).length();
- final Pattern endPattern = getEndPattern(depth);
+ final int depth = headingStart.group(1).length();
+ final Matcher endMatcher = getEndPattern(depth).reset(text);
- final Matcher endMatcher = endPattern.matcher(text);
final int end;
if (endMatcher.find(start)) {
end = endMatcher.start();
sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
sectionText = sectionText.substring(dummy_end);
}
- if (heading.indexOf("Japanese") == -1) sectionText += translingual;
+ if (!heading.contains("Japanese")) sectionText += translingual;
final Section section = new Section(title, heading, sectionText);
try {
selector.out.writeUTF(section.title);
selector.out.writeUTF(section.heading);
- final byte[] bytes = section.text.getBytes("UTF8");
+ final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8);
selector.out.writeInt(bytes.length);
selector.out.write(bytes);
} catch (IOException e) {
/**
 * Groups a case-insensitive heading regex with an output file name and the
 * stream that matching sections are written to.
 */
static class Selector {
// Output file name supplied to the constructor.
// NOTE(review): presumably the file that `out` is opened on; the opening code is not visible here.
final String outFilename;
// Heading regex for this selector, compiled with CASE_INSENSITIVE.
- final Pattern pattern;
+ final Matcher pattern;
// Not assigned by the constructor; it must be set elsewhere before sections are written.
DataOutputStream out;
/**
 * @param filename stored as {@code outFilename}
 * @param pattern regular expression source, compiled case-insensitively
 */
public Selector(final String filename, final String pattern) {
this.outFilename = filename;
- this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+ this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher("");
}
}
}
/**
 * Appends the reported character range to the currently active builder.
 *
 * @param ch buffer holding the characters
 * @param start offset of the first character in {@code ch}
 * @param length number of characters to append
 */
@Override
- public void characters(char[] ch, int start, int length) throws SAXException {
+ public void characters(char[] ch, int start, int length) {
// When no builder is active (currentBuilder is null) the text is ignored.
if (currentBuilder != null) {
currentBuilder.append(ch, start, length);
}
}
@Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
+ public void endElement(String uri, String localName, String qName) {
currentBuilder = null;
if ("page".equals(qName)) {
endPage();
/**
 * Parses {@code file} with a newly created SAX parser, passing this object
 * as the handler.
 *
 * @param file the XML file to parse
 * @throws ParserConfigurationException declared because creating the parser may fail
 * @throws SAXException declared for parser creation and parsing
 * @throws IOException declared for parsing the file
 */
public void parse(final File file) throws ParserConfigurationException,
SAXException, IOException {
// NOTE(review): the factory is used with its defaults; no parser features are set in this method.
// Confirm the default DTD / external-entity handling is acceptable for the input files.
- final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(file, this);
}