import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
-public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
+public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler implements Runnable {
// This matches the whole line, otherwise regexes don't work well on French:
// {{=uk=}}
// Spanish has no initial headings, tried to also detect {{ES as such
// with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
- static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+ static final Pattern headingStartPattern = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+ static final Pattern startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}");
- final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+ final Map.Entry<String, List<Selector>> pathToSelectorsEntry;
List<Selector> currentSelectors = null;
StringBuilder titleBuilder;
StringBuilder currentBuilder = null;
public static void main(final String[] args) throws Exception {
- final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
- wiktionarySplitter.go();
+ boolean parallel = args.length > 0 && args[0].equals("parallel");
+ final ExecutorService e = Executors.newCachedThreadPool();
+ final Map<String,List<Selector>> pathToSelectors = createSelectorsMap();
+ for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
+ final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(pathToSelectorsEntry);
+ if (parallel) {
+ e.submit(wiktionarySplitter);
+ } else wiktionarySplitter.go();
+ }
+ e.shutdown();
}
- private WiktionarySplitter() {
+ /**
+  * Creates a splitter for a single input file.
+  *
+  * @param pathToSelectorsEntry input file path (key) and the selectors that
+  *        receive the sections extracted from that file (value)
+  */
+ private WiktionarySplitter(final Map.Entry<String, List<Selector>> pathToSelectorsEntry) {
+ this.pathToSelectorsEntry = pathToSelectorsEntry;
+ }
+
+ /**
+  * Builds the map from input dump path to its selectors.
+  *
+  * For every wiki code in WiktionaryLangs.wikiCodeToIsoCodeToWikiName the key is
+  * "data/inputs/<code>wiktionary-pages-articles.xml". The value holds one
+  * Selector per entry of that code's inner map, writing to
+  * "data/inputs/wikiSplit/<code>/<entry key>.data" and matching the entry value
+  * as a regex.
+  *
+  * @return the map, in the iteration order of the wiki codes
+  */
+ private static Map<String,List<Selector>> createSelectorsMap() {
+ final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<>();
List<Selector> selectors;
for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
//if (!code.equals("fr")) {continue;}
- selectors = new ArrayList<WiktionarySplitter.Selector>();
+ selectors = new ArrayList<>();
pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
final String dir = String.format("data/inputs/wikiSplit/%s", code);
selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
}
}
+ return pathToSelectors;
+ }
+
/**
 * Runnable entry point: calls go() and rethrows any Exception it raises
 * wrapped in a RuntimeException, because run() cannot declare checked
 * exceptions.
 */
+ @Override
+ public void run() {
+ try {
+ go();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
}
/**
 * Runs the split for this instance's pathToSelectorsEntry.
 *
 * Creates a SAX parser, takes the entry's selector list as currentSelectors,
 * sets up a gzip-compressed DataOutputStream for each selector, parses the
 * input with this object as the SAX handler, and then closes every selector's
 * output. Parts of the method are not visible in this excerpt (for example
 * where the input stream "in" is opened).
 *
 * @throws Exception any failure during setup or parsing; parse failures are
 *         first reported on System.err together with lastPageTitle,
 *         titleBuilder and the entry's key
 */
private void go() throws Exception {
- final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
// Configure things.
- for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
currentSelectors = pathToSelectorsEntry.getValue();
// Output chain per selector: file (outFilename + ".gz") -> BufferedOutputStream
// -> GZIP compressor -> WriteBuffer -> DataOutputStream.
OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
tmp = new BufferedOutputStream(tmp);
tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
// The second WriteBuffer argument is presumably a buffer size in bytes — TODO confirm.
- tmp = new WriteBuffer(tmp, 20 * 1024 * 1024);
+ tmp = new WriteBuffer(tmp, 1024 * 1024);
selector.out = new DataOutputStream(tmp);
}
parser.parse(new BufferedInputStream(in), this);
}
} catch (Exception e) {
- System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+ System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey());
throw e;
}
// NOTE(review): this loop is not reached when the catch above rethrows, so the
// selector outputs stay open after a failed parse — consider try/finally.
for (final Selector selector : currentSelectors) {
selector.out.close();
}
-
- }
}
String lastPageTitle = null;
int pageCount = 0;
- Pattern endPatterns[] = new Pattern[100];
+ // Lazily created section-end matchers, indexed by heading depth.
+ final Matcher[] endPatterns = new Matcher[100];
+ /**
+  * Returns a matcher that finds the next heading line of at most the given
+  * depth, i.e. a line starting with 1..depth '=' characters followed by a
+  * non-'=' character.
+  *
+  * Matchers for depths below endPatterns.length are created once and reused,
+  * so callers must reset(...) the result onto their input before use. Deeper
+  * headings get a fresh, uncached matcher instead of overflowing the cache.
+  *
+  * @param depth number of '=' characters of the heading that opened the section; at least 1
+  * @return a matcher over the empty string, ready to be reset onto the page text
+  */
- private Pattern getEndPattern(int depth) {
+ private Matcher getEndPattern(int depth) {
+ // Malformed pages can contain very long runs of '='; do not index past the cache.
+ if (depth >= endPatterns.length)
+ return newEndMatcher(depth);
if (endPatterns[depth] == null)
- endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+ endPatterns[depth] = newEndMatcher(depth);
return endPatterns[depth];
}
+ // Compiles the end-of-section regex for the given depth and wraps it in a matcher over "".
+ private static Matcher newEndMatcher(int depth) {
+ return Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher("");
+ }
title.startsWith("Template:") ||
title.startsWith("Summary:") ||
title.startsWith("Module:") ||
+ title.startsWith("Reconstruction:") ||
// DE
title.startsWith("Datei:") ||
title.startsWith("Verzeichnis:") ||
title.startsWith("Kategorie:") ||
title.startsWith("Hilfe:") ||
title.startsWith("Reim:") ||
+ title.startsWith("Modul:") ||
// FR:
title.startsWith("Annexe:") ||
title.startsWith("Catégori:") ||
title.startsWith("Aide:") ||
title.startsWith("Fichier:") ||
title.startsWith("Wiktionnaire:") ||
+ title.startsWith("Translations:Wiktionnaire:") ||
+ title.startsWith("Translations:Projet:") ||
title.startsWith("Catégorie:") ||
title.startsWith("Portail:") ||
title.startsWith("utiliusateur:") ||
title.startsWith("Kategorio:") ||
+ title.startsWith("Tutoriel:") ||
// IT
title.startsWith("Wikizionario:") ||
title.startsWith("Appendice:") ||
title.startsWith("Categoria:") ||
title.startsWith("Aiuto:") ||
title.startsWith("Portail:") ||
+ title.startsWith("Modulo:") ||
// ES
title.startsWith("Apéndice:") ||
title.startsWith("Archivo:") ||
title.startsWith("Plantilla:") ||
title.startsWith("Wikcionario:") ||
+ // PT
+ title.startsWith("Ajuda:") ||
+ title.startsWith("Apêndice:") ||
+ title.startsWith("Citações:") ||
+ title.startsWith("Portal:") ||
+ title.startsWith("Predefinição:") ||
+ title.startsWith("Vocabulário:") ||
+ title.startsWith("Wikcionário:") ||
+ title.startsWith("Módulo:") ||
+
// sentinel
false
) return;
- if (!title.startsWith("Sign gloss:")) {
+ // leave the Flexion: pages in for now and do not warn about them
+ if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
System.err.println("title with colon: " + title);
}
}
String text = textBuilder.toString();
+ // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
+ text = startSpanish.matcher(text).replaceAll("== {{lengua|es}} ==");
String translingual = "";
int start = 0;
- final Matcher startMatcher = headingStart.matcher(text);
+ Matcher headingStart = headingStartPattern.matcher(text);
while (start < text.length()) {
// Find start.
- if (!startMatcher.find(start)) {
+ if (!headingStart.find(start)) {
return;
}
- start = startMatcher.end();
+ start = headingStart.end();
- final String heading = startMatcher.group();
+ final String heading = headingStart.group();
// For Translingual entries just store the text for later
// use in the per-language sections
- if (heading.indexOf("Translingual") != -1) {
+ if (heading.contains("Translingual")) {
// Find end.
- final int depth = startMatcher.group(1).length();
- final Pattern endPattern = getEndPattern(depth);
+ final int depth = headingStart.group(1).length();
+ final Matcher endMatcher = getEndPattern(depth).reset(text);
- final Matcher endMatcher = endPattern.matcher(text);
if (endMatcher.find(start)) {
int end = endMatcher.start();
translingual = text.substring(start, end);
}
for (final Selector selector : currentSelectors) {
- if (selector.pattern.matcher(heading).find()) {
+ if (selector.pattern.reset(heading).find()) {
// Find end.
- final int depth = startMatcher.group(1).length();
- final Pattern endPattern = getEndPattern(depth);
+ final int depth = headingStart.group(1).length();
+ final Matcher endMatcher = getEndPattern(depth).reset(text);
- final Matcher endMatcher = endPattern.matcher(text);
final int end;
if (endMatcher.find(start)) {
end = endMatcher.start();
sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
sectionText = sectionText.substring(dummy_end);
}
- if (heading.indexOf("Japanese") == -1) sectionText += translingual;
+ if (!heading.contains("Japanese")) sectionText += translingual;
final Section section = new Section(title, heading, sectionText);
try {
selector.out.writeUTF(section.title);
selector.out.writeUTF(section.heading);
- final byte[] bytes = section.text.getBytes("UTF8");
+ final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8);
selector.out.writeInt(bytes.length);
selector.out.write(bytes);
} catch (IOException e) {
/**
 * Pairs an output file with the regex that decides which section headings
 * are written to it.
 */
static class Selector {
// Base name of the output file; go() appends ".gz" when opening it.
final String outFilename;
// Case-insensitive heading regex. The new revision stores a reusable Matcher
// that callers reset(...) onto each heading; java.util.regex.Matcher is not
// safe for concurrent use, so a Selector must not be shared between threads.
- final Pattern pattern;
+ final Matcher pattern;
// Output stream for matching sections; assigned in go(), not in the constructor.
DataOutputStream out;
/**
 * @param filename output file name, stored as outFilename
 * @param pattern regex matched case-insensitively against section headings
 */
public Selector(final String filename, final String pattern) {
this.outFilename = filename;
- this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+ this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher("");
}
}
}
/**
 * SAX callback for character data: appends the given slice of ch to
 * currentBuilder, or ignores it when no builder is currently active.
 */
@Override
- public void characters(char[] ch, int start, int length) throws SAXException {
+ public void characters(char[] ch, int start, int length) {
if (currentBuilder != null) {
currentBuilder.append(ch, start, length);
}
}
@Override
- public void endElement(String uri, String localName, String qName)
- throws SAXException {
+ public void endElement(String uri, String localName, String qName) {
currentBuilder = null;
if ("page".equals(qName)) {
endPage();
/**
 * Parses the given XML file with a newly created SAXParser, using this
 * object as the SAX handler.
 *
 * NOTE(review): the parser factory is used with its default settings; if the
 * input can come from an untrusted source, consider disabling DTDs and
 * external entities.
 *
 * @param file XML file to parse
 * @throws ParserConfigurationException if a parser cannot be created
 * @throws SAXException if the parser or the handler reports an error
 * @throws IOException if reading the file fails
 */
public void parse(final File file) throws ParserConfigurationException,
SAXException, IOException {
- final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+ final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(file, this);
}