gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
Improve wiktionary splitter for Spanish and Portuguese
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
index 850dedee2018c7fcf2ea4bb558edefdfb27551a7..435c3f212cd623cf9456054768d6feab355abfe4 100644 (file)
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.InputStream;
 import java.io.IOException;
+import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -82,7 +83,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             currentSelectors = pathToSelectorsEntry.getValue();
 
             for (final Selector selector : currentSelectors) {
-                selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename)));
+                OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
+                tmp = new BufferedOutputStream(tmp);
+                tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
+                tmp = new WriteBuffer(tmp, 20 * 1024 * 1024);
+                selector.out = new DataOutputStream(tmp);
             }
 
             // Do it.
@@ -100,7 +105,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                     parser.parse(new BufferedInputStream(in), this);
                 }
             } catch (Exception e) {
-                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
                 throw e;
             }
 
@@ -114,13 +119,23 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     String lastPageTitle = null;
     int pageCount = 0;
+    Pattern endPatterns[] = new Pattern[100];
+
+    private Pattern getEndPattern(int depth) {
+        if (endPatterns[depth] == null)
+            endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+        return endPatterns[depth];
+    }
+
     private void endPage() {
         final String title = titleBuilder.toString();
         lastPageTitle = title;
-        if (++pageCount % 1000 == 0) {
+        if (++pageCount % 100000 == 0) {
             System.out.println("endPage: " + title + ", count=" + pageCount);
         }
-        if (title.startsWith("Wiktionary:") ||
+        if (title.startsWith("Unsupported titles/")) return;
+        if (title.contains(":")) {
+            if (title.startsWith("Wiktionary:") ||
                 title.startsWith("Appendix:") ||
                 title.startsWith("Help:") ||
                 title.startsWith("Index:") ||
@@ -131,7 +146,6 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Rhymes:") ||
                 title.startsWith("Category:") ||
                 title.startsWith("Wikisaurus:") ||
-                title.startsWith("Unsupported titles/") ||
                 title.startsWith("Transwiki:") ||
                 title.startsWith("File:") ||
                 title.startsWith("Thread:") ||
@@ -173,58 +187,70 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Plantilla:") ||
                 title.startsWith("Wikcionario:") ||
 
+                // PT
+                title.startsWith("Ajuda:") ||
+                title.startsWith("Apêndice:") ||
+                title.startsWith("Citações:") ||
+                title.startsWith("Portal:") ||
+                title.startsWith("Predefinição:") ||
+                title.startsWith("Vocabulário:") ||
+                title.startsWith("Wikcionário:") ||
+
                 // sentinel
                 false
-           ) {
-            return;
-        }
-        if (title.contains(":")) {
+               ) return;
             if (!title.startsWith("Sign gloss:")) {
                 System.err.println("title with colon: " + title);
             }
         }
 
         String text = textBuilder.toString();
+        // Workaround for Spanish wiktionary {{ES}} pattern
+        text = text.replace("{{ES}}", "== {{lengua|es}} ==");
         String translingual = "";
+        int start = 0;
+        final Matcher startMatcher = headingStart.matcher(text);
 
-        while (text.length() > 0) {
+        while (start < text.length()) {
             // Find start.
-            final Matcher startMatcher = headingStart.matcher(text);
-            if (!startMatcher.find()) {
+            if (!startMatcher.find(start)) {
                 return;
             }
-            text = text.substring(startMatcher.end());
+            start = startMatcher.end();
 
             final String heading = startMatcher.group();
-            for (final Selector selector : currentSelectors) {
-                if (heading.indexOf("Translingual") != -1) {
-                    // Find end.
-                    final int depth = startMatcher.group(1).length();
-                    final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
 
-                    final Matcher endMatcher = endPattern.matcher(text);
-                    if (endMatcher.find()) {
-                        int end = endMatcher.start();
-                        translingual = text.substring(0, endMatcher.start());
-                        text = text.substring(end);
-                        break;
-                    }
+            // For Translingual entries just store the text for later
+            // use in the per-language sections
+            if (heading.indexOf("Translingual") != -1) {
+                // Find end.
+                final int depth = startMatcher.group(1).length();
+                final Pattern endPattern = getEndPattern(depth);
+
+                final Matcher endMatcher = endPattern.matcher(text);
+                if (endMatcher.find(start)) {
+                    int end = endMatcher.start();
+                    translingual = text.substring(start, end);
+                    start = end;
+                    continue;
                 }
-                if (selector.pattern.matcher(heading).find()) {
+            }
 
+            for (final Selector selector : currentSelectors) {
+                if (selector.pattern.matcher(heading).find()) {
                     // Find end.
                     final int depth = startMatcher.group(1).length();
-                    final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+                    final Pattern endPattern = getEndPattern(depth);
 
                     final Matcher endMatcher = endPattern.matcher(text);
                     final int end;
-                    if (endMatcher.find()) {
+                    if (endMatcher.find(start)) {
                         end = endMatcher.start();
                     } else {
                         end = text.length();
                     }
 
-                    String sectionText = text.substring(0, end);
+                    String sectionText = text.substring(start, end);
                     // Hack to remove empty dummy section from French
                     if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) {
                         int dummy_end = sectionText.indexOf("}}", 41) + 2;
@@ -246,7 +272,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                         throw new RuntimeException(e);
                     }
 
-                    text = text.substring(end);
+                    start = end;
                     break;
                 }
             }