gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
WiktionarySplitter: implement parallel processing
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
index ed17fb5bf55c4f72f3bcc4dfe5d30f9a37cb98b4..9d51b7841fd07fc851237035c4a0574d6f250396 100644 (file)
@@ -23,10 +23,13 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -40,16 +43,16 @@ import org.xml.sax.SAXException;
 
 import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
 
-public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
+public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler implements Runnable {
 
     // The matches the whole line, otherwise regexes don't work well on French:
     // {{=uk=}}
     // Spanish has no initial headings, tried to also detect {{ES as such
     // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
-    static final Matcher headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE).matcher("");
-    static final Matcher startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}").matcher("");
+    static final Pattern headingStartPattern = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+    static final Pattern startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}");
 
-    final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+    final Map.Entry<String, List<Selector>> pathToSelectorsEntry;
     List<Selector> currentSelectors = null;
 
     StringBuilder titleBuilder;
@@ -57,15 +60,28 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     StringBuilder currentBuilder = null;
 
     public static void main(final String[] args) throws Exception {
-        final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter();
-        wiktionarySplitter.go();
+        boolean parallel = args.length > 0 && args[0].equals("parallel");
+        final ExecutorService e = Executors.newCachedThreadPool();
+        final Map<String,List<Selector>> pathToSelectors = createSelectorsMap();
+        for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
+            final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(pathToSelectorsEntry);
+            if (parallel) {
+                e.submit(wiktionarySplitter);
+            } else wiktionarySplitter.go();
+        }
+        e.shutdown();
+    }
+
+    private WiktionarySplitter(final Map.Entry<String, List<Selector>> pathToSelectorsEntry) {
+        this.pathToSelectorsEntry = pathToSelectorsEntry;
     }
 
-    private WiktionarySplitter() {
+    private static Map<String,List<Selector>> createSelectorsMap() {
+        final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<>();
         List<Selector> selectors;
         for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
             //if (!code.equals("fr")) {continue;}
-            selectors = new ArrayList<WiktionarySplitter.Selector>();
+            selectors = new ArrayList<>();
             pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
             for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
                 final String dir = String.format("data/inputs/wikiSplit/%s", code);
@@ -73,13 +89,22 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
             }
         }
+        return pathToSelectors;
+    }
+
+    @Override
+    public void run() {
+        try {
+            go();
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }
 
     private void go() throws Exception {
         final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
 
         // Configure things.
-        for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
 
             currentSelectors = pathToSelectorsEntry.getValue();
 
@@ -87,7 +112,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
                 tmp = new BufferedOutputStream(tmp);
                 tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
-                tmp = new WriteBuffer(tmp, 20 * 1024 * 1024);
+                tmp = new WriteBuffer(tmp, 1024 * 1024);
                 selector.out = new DataOutputStream(tmp);
             }
 
@@ -106,7 +131,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                     parser.parse(new BufferedInputStream(in), this);
                 }
             } catch (Exception e) {
-                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
+                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey());
                 throw e;
             }
 
@@ -114,13 +139,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             for (final Selector selector : currentSelectors) {
                 selector.out.close();
             }
-
-        }
     }
 
     String lastPageTitle = null;
     int pageCount = 0;
-    Matcher endPatterns[] = new Matcher[100];
+    final Matcher[] endPatterns = new Matcher[100];
 
     private Matcher getEndPattern(int depth) {
         if (endPatterns[depth] == null)
@@ -215,10 +238,10 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
         String text = textBuilder.toString();
         // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
-        text = startSpanish.reset(text).replaceAll("== {{lengua|es}} ==");
+        text = startSpanish.matcher(text).replaceAll("== {{lengua|es}} ==");
         String translingual = "";
         int start = 0;
-        headingStart.reset(text);
+        Matcher headingStart = headingStartPattern.matcher(text);
 
         while (start < text.length()) {
             // Find start.
@@ -231,7 +254,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
             // For Translingual entries just store the text for later
             // use in the per-language sections
-            if (heading.indexOf("Translingual") != -1) {
+            if (heading.contains("Translingual")) {
                 // Find end.
                 final int depth = headingStart.group(1).length();
                 final Matcher endMatcher = getEndPattern(depth).reset(text);
@@ -266,13 +289,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                                 sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
                         sectionText = sectionText.substring(dummy_end);
                     }
-                    if (heading.indexOf("Japanese") == -1) sectionText += translingual;
+                    if (!heading.contains("Japanese")) sectionText += translingual;
                     final Section section = new Section(title, heading, sectionText);
 
                     try {
                         selector.out.writeUTF(section.title);
                         selector.out.writeUTF(section.heading);
-                        final byte[] bytes = section.text.getBytes("UTF8");
+                        final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8);
                         selector.out.writeInt(bytes.length);
                         selector.out.write(bytes);
                     } catch (IOException e) {
@@ -336,15 +359,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     }
 
     @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
+    public void characters(char[] ch, int start, int length) {
         if (currentBuilder != null) {
             currentBuilder.append(ch, start, length);
         }
     }
 
     @Override
-    public void endElement(String uri, String localName, String qName)
-    throws SAXException {
+    public void endElement(String uri, String localName, String qName) {
         currentBuilder = null;
         if ("page".equals(qName)) {
             endPage();