X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=b148ebb2d70d0c9b9b91c6272f030cf579cb3576;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=290a58fccc1e38a6c36acdd44f2cb08cf42abb40;hpb=ee1dbfb669462305a1c07e4d804a90af79f5d39f;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 290a58f..b148ebb 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -20,8 +20,10 @@ import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -31,8 +33,8 @@ import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; -import org.apache.xerces.jaxp.SAXParserFactoryImpl; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; @@ -45,9 +47,10 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { // {{=uk=}} // Spanish has no initial headings, tried to also detect {{ES as such // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English. - static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + static final Matcher headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE).matcher(""); + static final Matcher startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}").matcher(""); - final Map> pathToSelectors = new LinkedHashMap>(); + final Map> pathToSelectors = new LinkedHashMap<>(); List currentSelectors = null; StringBuilder titleBuilder; @@ -63,7 +66,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { List selectors; for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { //if (!code.equals("fr")) {continue;} - selectors = new ArrayList(); + selectors = new ArrayList<>(); pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { final String dir = String.format("data/inputs/wikiSplit/%s", code); @@ -74,7 +77,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } private void go() throws Exception { - final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); // Configure things. for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { @@ -82,7 +85,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { currentSelectors = pathToSelectorsEntry.getValue(); for (final Selector selector : currentSelectors) { - selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); + OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz"); + tmp = new BufferedOutputStream(tmp); + tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp); + tmp = new WriteBuffer(tmp, 20 * 1024 * 1024); + selector.out = new DataOutputStream(tmp); } // Do it. @@ -96,10 +103,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } else { InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + in = new ReadAheadBuffer(in, 20 * 1024 * 1024); parser.parse(new BufferedInputStream(in), this); } } catch (Exception e) { - System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey()); throw e; } @@ -113,13 +121,23 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { String lastPageTitle = null; int pageCount = 0; + final Matcher[] endPatterns = new Matcher[100]; + + private Matcher getEndPattern(int depth) { + if (endPatterns[depth] == null) + endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher(""); + return endPatterns[depth]; + } + private void endPage() { final String title = titleBuilder.toString(); lastPageTitle = title; - if (++pageCount % 1000 == 0) { + if (++pageCount % 100000 == 0) { System.out.println("endPage: " + title + ", count=" + pageCount); } - if (title.startsWith("Wiktionary:") || + if (title.startsWith("Unsupported titles/")) return; + if (title.contains(":")) { + if (title.startsWith("Wiktionary:") || title.startsWith("Appendix:") || title.startsWith("Help:") || title.startsWith("Index:") || @@ -130,13 +148,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Rhymes:") || title.startsWith("Category:") || title.startsWith("Wikisaurus:") || - title.startsWith("Unsupported titles/") || title.startsWith("Transwiki:") || title.startsWith("File:") || title.startsWith("Thread:") || title.startsWith("Template:") || title.startsWith("Summary:") || title.startsWith("Module:") || + title.startsWith("Reconstruction:") || // DE title.startsWith("Datei:") || title.startsWith("Verzeichnis:") || @@ -145,6 +163,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Kategorie:") || title.startsWith("Hilfe:") || title.startsWith("Reim:") || + title.startsWith("Modul:") || // FR: title.startsWith("Annexe:") || title.startsWith("Catégori:") || @@ -154,16 +173,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Aide:") || title.startsWith("Fichier:") || title.startsWith("Wiktionnaire:") || + title.startsWith("Translations:Wiktionnaire:") || + title.startsWith("Translations:Projet:") || title.startsWith("Catégorie:") || title.startsWith("Portail:") || title.startsWith("utiliusateur:") || title.startsWith("Kategorio:") || + title.startsWith("Tutoriel:") || // IT title.startsWith("Wikizionario:") || title.startsWith("Appendice:") || title.startsWith("Categoria:") || title.startsWith("Aiuto:") || title.startsWith("Portail:") || + title.startsWith("Modulo:") || // ES title.startsWith("Apéndice:") || title.startsWith("Archivo:") || @@ -172,58 +195,70 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Plantilla:") || title.startsWith("Wikcionario:") || + // PT + title.startsWith("Ajuda:") || + title.startsWith("Apêndice:") || + title.startsWith("Citações:") || + title.startsWith("Portal:") || + title.startsWith("Predefinição:") || + title.startsWith("Vocabulário:") || + title.startsWith("Wikcionário:") || + title.startsWith("Módulo:") || + // sentinel false - ) { - return; - } - if (title.contains(":")) { - if (!title.startsWith("Sign gloss:")) { + ) return; + // leave the Flexion: pages in for now and do not warn about them + if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) { System.err.println("title with colon: " + title); } } String text = textBuilder.toString(); + // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns + text = startSpanish.reset(text).replaceAll("== {{lengua|es}} =="); String translingual = ""; + int start = 0; + headingStart.reset(text); - while (text.length() > 0) { + while (start < text.length()) { // Find start. - final Matcher startMatcher = headingStart.matcher(text); - if (!startMatcher.find()) { + if (!headingStart.find(start)) { return; } - text = text.substring(startMatcher.end()); - - final String heading = startMatcher.group(); - for (final Selector selector : currentSelectors) { - if (heading.indexOf("Translingual") != -1) { - // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); - - final Matcher endMatcher = endPattern.matcher(text); - if (endMatcher.find()) { - int end = endMatcher.start(); - translingual = text.substring(0, endMatcher.start()); - text = text.substring(end); - break; - } + start = headingStart.end(); + + final String heading = headingStart.group(); + + // For Translingual entries just store the text for later + // use in the per-language sections + if (heading.contains("Translingual")) { + // Find end. + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); + + if (endMatcher.find(start)) { + int end = endMatcher.start(); + translingual = text.substring(start, end); + start = end; + continue; } - if (selector.pattern.matcher(heading).find()) { + } + for (final Selector selector : currentSelectors) { + if (selector.pattern.reset(heading).find()) { // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); - final Matcher endMatcher = endPattern.matcher(text); final int end; - if (endMatcher.find()) { + if (endMatcher.find(start)) { end = endMatcher.start(); } else { end = text.length(); } - String sectionText = text.substring(0, end); + String sectionText = text.substring(start, end); // Hack to remove empty dummy section from French if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) { int dummy_end = sectionText.indexOf("}}", 41) + 2; @@ -232,20 +267,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end; sectionText = sectionText.substring(dummy_end); } - if (heading.indexOf("Japanese") == -1) sectionText += translingual; + if (!heading.contains("Japanese")) sectionText += translingual; final Section section = new Section(title, heading, sectionText); try { selector.out.writeUTF(section.title); selector.out.writeUTF(section.heading); - final byte[] bytes = section.text.getBytes("UTF8"); + final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8); selector.out.writeInt(bytes.length); selector.out.write(bytes); } catch (IOException e) { throw new RuntimeException(e); } - text = text.substring(end); + start = end; break; } } @@ -273,13 +308,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { static class Selector { final String outFilename; - final Pattern pattern; + final Matcher pattern; DataOutputStream out; public Selector(final String filename, final String pattern) { this.outFilename = filename; - this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(""); } } @@ -302,15 +337,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } @Override - public void characters(char[] ch, int start, int length) throws SAXException { + public void characters(char[] ch, int start, int length) { if (currentBuilder != null) { currentBuilder.append(ch, start, length); } } @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { + public void endElement(String uri, String localName, String qName) { currentBuilder = null; if ("page".equals(qName)) { endPage(); @@ -319,7 +353,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { public void parse(final File file) throws ParserConfigurationException, SAXException, IOException { - final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser(); + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); parser.parse(file, this); }