X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=9d51b7841fd07fc851237035c4a0574d6f250396;hb=6bdac65cd6138f8660de4f248b32c35fcd748ae7;hp=cc9a3b1a0bc6ee56110dfded92cec93a50b83301;hpb=b7b04d01f8d0ed763f0817d0531ecebf9ff50260;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index cc9a3b1..9d51b78 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -23,10 +23,13 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -40,15 +43,16 @@ import org.xml.sax.SAXException; import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; -public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { +public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler implements Runnable { // The matches the whole line, otherwise regexes don't work well on French: // {{=uk=}} // Spanish has no initial headings, tried to also detect {{ES as such // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English. - static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + static final Pattern headingStartPattern = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE); + static final Pattern startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}"); - final Map> pathToSelectors = new LinkedHashMap>(); + final Map.Entry> pathToSelectorsEntry; List currentSelectors = null; StringBuilder titleBuilder; @@ -56,15 +60,28 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { StringBuilder currentBuilder = null; public static void main(final String[] args) throws Exception { - final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); - wiktionarySplitter.go(); + boolean parallel = args.length > 0 && args[0].equals("parallel"); + final ExecutorService e = Executors.newCachedThreadPool(); + final Map> pathToSelectors = createSelectorsMap(); + for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { + final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(pathToSelectorsEntry); + if (parallel) { + e.submit(wiktionarySplitter); + } else wiktionarySplitter.go(); + } + e.shutdown(); + } + + private WiktionarySplitter(final Map.Entry> pathToSelectorsEntry) { + this.pathToSelectorsEntry = pathToSelectorsEntry; } - private WiktionarySplitter() { + private static Map> createSelectorsMap() { + final Map> pathToSelectors = new LinkedHashMap<>(); List selectors; for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { //if (!code.equals("fr")) {continue;} - selectors = new ArrayList(); + selectors = new ArrayList<>(); pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { final String dir = String.format("data/inputs/wikiSplit/%s", code); @@ -72,13 +89,22 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); } } + return pathToSelectors; + } + + @Override + public void run() { + try { + go(); + } catch (Exception e) { + throw new RuntimeException(e); + } } private void go() throws Exception { final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); // Configure things. - for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { currentSelectors = pathToSelectorsEntry.getValue(); @@ -86,7 +112,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz"); tmp = new BufferedOutputStream(tmp); tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp); - tmp = new WriteBuffer(tmp, 20 * 1024 * 1024); + tmp = new WriteBuffer(tmp, 1024 * 1024); selector.out = new DataOutputStream(tmp); } @@ -105,7 +131,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { parser.parse(new BufferedInputStream(in), this); } } catch (Exception e) { - System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey()); + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey()); throw e; } @@ -113,17 +139,15 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { for (final Selector selector : currentSelectors) { selector.out.close(); } - - } } String lastPageTitle = null; int pageCount = 0; - Pattern endPatterns[] = new Pattern[100]; + final Matcher[] endPatterns = new Matcher[100]; - private Pattern getEndPattern(int depth) { + private Matcher getEndPattern(int depth) { if (endPatterns[depth] == null) - endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher(""); return endPatterns[depth]; } @@ -214,28 +238,27 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { String text = textBuilder.toString(); // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns - text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} =="); + text = startSpanish.matcher(text).replaceAll("== {{lengua|es}} =="); String translingual = ""; int start = 0; - final Matcher startMatcher = headingStart.matcher(text); + Matcher headingStart = headingStartPattern.matcher(text); while (start < text.length()) { // Find start. - if (!startMatcher.find(start)) { + if (!headingStart.find(start)) { return; } - start = startMatcher.end(); + start = headingStart.end(); - final String heading = startMatcher.group(); + final String heading = headingStart.group(); // For Translingual entries just store the text for later // use in the per-language sections - if (heading.indexOf("Translingual") != -1) { + if (heading.contains("Translingual")) { // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = getEndPattern(depth); + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); - final Matcher endMatcher = endPattern.matcher(text); if (endMatcher.find(start)) { int end = endMatcher.start(); translingual = text.substring(start, end); @@ -245,12 +268,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } for (final Selector selector : currentSelectors) { - if (selector.pattern.matcher(heading).find()) { + if (selector.pattern.reset(heading).find()) { // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = getEndPattern(depth); + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); - final Matcher endMatcher = endPattern.matcher(text); final int end; if (endMatcher.find(start)) { end = endMatcher.start(); @@ -267,13 +289,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end; sectionText = sectionText.substring(dummy_end); } - if (heading.indexOf("Japanese") == -1) sectionText += translingual; + if (!heading.contains("Japanese")) sectionText += translingual; final Section section = new Section(title, heading, sectionText); try { selector.out.writeUTF(section.title); selector.out.writeUTF(section.heading); - final byte[] bytes = section.text.getBytes("UTF8"); + final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8); selector.out.writeInt(bytes.length); selector.out.write(bytes); } catch (IOException e) { @@ -308,13 +330,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { static class Selector { final String outFilename; - final Pattern pattern; + final Matcher pattern; DataOutputStream out; public Selector(final String filename, final String pattern) { this.outFilename = filename; - this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(""); } } @@ -337,15 +359,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } @Override - public void characters(char[] ch, int start, int length) throws SAXException { + public void characters(char[] ch, int start, int length) { if (currentBuilder != null) { currentBuilder.append(ch, start, length); } } @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { + public void endElement(String uri, String localName, String qName) { currentBuilder = null; if ("page".equals(qName)) { endPage();