gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
Minor automated code simplifications.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
index 5935df8f51646f28d6462f9d59bf679ec65cabc5..b148ebb2d70d0c9b9b91c6272f030cf579cb3576 100644 (file)
@@ -20,9 +20,10 @@ import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
-import java.io.InputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -32,8 +33,8 @@ import java.util.regex.Pattern;
 
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
 
-import org.apache.xerces.jaxp.SAXParserFactoryImpl;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -46,9 +47,10 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     // {{=uk=}}
     // Spanish has no initial headings, tried to also detect {{ES as such
     // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
-    static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
+    static final Matcher headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE).matcher("");
+    static final Matcher startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}").matcher("");
 
-    final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
+    final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<>();
     List<Selector> currentSelectors = null;
 
     StringBuilder titleBuilder;
@@ -64,7 +66,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
         List<Selector> selectors;
         for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
             //if (!code.equals("fr")) {continue;}
-            selectors = new ArrayList<WiktionarySplitter.Selector>();
+            selectors = new ArrayList<>();
             pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
             for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
                 final String dir = String.format("data/inputs/wikiSplit/%s", code);
@@ -75,7 +77,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     }
 
     private void go() throws Exception {
-        final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+        final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
 
         // Configure things.
         for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
@@ -105,7 +107,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                     parser.parse(new BufferedInputStream(in), this);
                 }
             } catch (Exception e) {
-                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
+                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey());
                 throw e;
             }
 
@@ -119,11 +121,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     String lastPageTitle = null;
     int pageCount = 0;
-    Pattern endPatterns[] = new Pattern[100];
+    final Matcher[] endPatterns = new Matcher[100];
 
-    private Pattern getEndPattern(int depth) {
+    private Matcher getEndPattern(int depth) {
         // Returns the cached end-of-section matcher for the given heading depth,
         // building it on first use. The regex (MULTILINE) matches a whole line
         // that starts with 1..depth '=' characters followed by a non-'=' character.
         // The cached object is created over an empty input, so callers must
         // reset(...) it onto the text they want to search before calling find().
         // depth indexes directly into endPatterns (declared with 100 slots);
         // no bounds check is performed here.
         if (endPatterns[depth] == null)
-            endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE);
+            endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher("");
         return endPatterns[depth];
     }
 
@@ -152,6 +154,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Template:") ||
                 title.startsWith("Summary:") ||
                 title.startsWith("Module:") ||
+                title.startsWith("Reconstruction:") ||
                 // DE
                 title.startsWith("Datei:") ||
                 title.startsWith("Verzeichnis:") ||
@@ -160,6 +163,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Kategorie:") ||
                 title.startsWith("Hilfe:") ||
                 title.startsWith("Reim:") ||
+                title.startsWith("Modul:") ||
                 // FR:
                 title.startsWith("Annexe:") ||
                 title.startsWith("Catégori:") ||
@@ -169,16 +173,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Aide:") ||
                 title.startsWith("Fichier:") ||
                 title.startsWith("Wiktionnaire:") ||
+                title.startsWith("Translations:Wiktionnaire:") ||
+                title.startsWith("Translations:Projet:") ||
                 title.startsWith("Catégorie:") ||
                 title.startsWith("Portail:") ||
                 title.startsWith("utiliusateur:") ||
                 title.startsWith("Kategorio:") ||
+                title.startsWith("Tutoriel:") ||
                 // IT
                 title.startsWith("Wikizionario:") ||
                 title.startsWith("Appendice:") ||
                 title.startsWith("Categoria:") ||
                 title.startsWith("Aiuto:") ||
                 title.startsWith("Portail:") ||
+                title.startsWith("Modulo:") ||
                 // ES
                 title.startsWith("Apéndice:") ||
                 title.startsWith("Archivo:") ||
@@ -195,39 +203,40 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Predefinição:") ||
                 title.startsWith("Vocabulário:") ||
                 title.startsWith("Wikcionário:") ||
+                title.startsWith("Módulo:") ||
 
                 // sentinel
                 false
                ) return;
-            if (!title.startsWith("Sign gloss:")) {
+            // leave the Flexion: pages in for now and do not warn about them
+            if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
                 System.err.println("title with colon: " + title);
             }
         }
 
         String text = textBuilder.toString();
         // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
-        text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} ==");
+        text = startSpanish.reset(text).replaceAll("== {{lengua|es}} ==");
         String translingual = "";
         int start = 0;
-        final Matcher startMatcher = headingStart.matcher(text);
+        headingStart.reset(text);
 
         while (start < text.length()) {
             // Find start.
-            if (!startMatcher.find(start)) {
+            if (!headingStart.find(start)) {
                 return;
             }
-            start = startMatcher.end();
+            start = headingStart.end();
 
-            final String heading = startMatcher.group();
+            final String heading = headingStart.group();
 
             // For Translingual entries just store the text for later
             // use in the per-language sections
-            if (heading.indexOf("Translingual") != -1) {
+            if (heading.contains("Translingual")) {
                 // Find end.
-                final int depth = startMatcher.group(1).length();
-                final Pattern endPattern = getEndPattern(depth);
+                final int depth = headingStart.group(1).length();
+                final Matcher endMatcher = getEndPattern(depth).reset(text);
 
-                final Matcher endMatcher = endPattern.matcher(text);
                 if (endMatcher.find(start)) {
                     int end = endMatcher.start();
                     translingual = text.substring(start, end);
@@ -237,12 +246,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             }
 
             for (final Selector selector : currentSelectors) {
-                if (selector.pattern.matcher(heading).find()) {
+                if (selector.pattern.reset(heading).find()) {
                     // Find end.
-                    final int depth = startMatcher.group(1).length();
-                    final Pattern endPattern = getEndPattern(depth);
+                    final int depth = headingStart.group(1).length();
+                    final Matcher endMatcher = getEndPattern(depth).reset(text);
 
-                    final Matcher endMatcher = endPattern.matcher(text);
                     final int end;
                     if (endMatcher.find(start)) {
                         end = endMatcher.start();
@@ -259,13 +267,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                                 sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
                         sectionText = sectionText.substring(dummy_end);
                     }
-                    if (heading.indexOf("Japanese") == -1) sectionText += translingual;
+                    if (!heading.contains("Japanese")) sectionText += translingual;
                     final Section section = new Section(title, heading, sectionText);
 
                     try {
                         selector.out.writeUTF(section.title);
                         selector.out.writeUTF(section.heading);
-                        final byte[] bytes = section.text.getBytes("UTF8");
+                        final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8);
                         selector.out.writeInt(bytes.length);
                         selector.out.write(bytes);
                     } catch (IOException e) {
@@ -300,13 +308,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     /**
      * Pairs an output file name with a case-insensitive heading regex.
      * Sections whose heading matches are written to {@link #out}.
      *
      * NOTE(review): the matcher is a single stateful object that is re-targeted
      * with reset(...); a Selector should therefore not be shared between
      * threads -- confirm the splitter is only used single-threaded.
      */
     static class Selector {
         // Name of the file this selector's sections are written to.
         final String outFilename;
-        final Pattern pattern;
         // Reusable matcher for the heading regex; created over an empty input
         // and reset onto each heading before use.
+        final Matcher pattern;
 
         // Output stream for matched sections; not assigned by the constructor.
         DataOutputStream out;
 
         /**
          * @param filename stored as {@link #outFilename}
          * @param pattern  regular expression, compiled with CASE_INSENSITIVE
          */
         public Selector(final String filename, final String pattern) {
             this.outFilename = filename;
-            this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
+            this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher("");
         }
     }
 
@@ -329,15 +337,14 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     }
 
     /**
      * SAX callback for character data: appends the reported slice of
      * {@code ch} to the builder that is currently collecting text.
      * Does nothing when no builder is active ({@code currentBuilder == null}).
      *
      * @param ch     buffer holding the characters
      * @param start  index of the first character to append
      * @param length number of characters to append
      */
     @Override
-    public void characters(char[] ch, int start, int length) throws SAXException {
+    public void characters(char[] ch, int start, int length) {
         if (currentBuilder != null) {
             currentBuilder.append(ch, start, length);
         }
     }
 
     @Override
-    public void endElement(String uri, String localName, String qName)
-    throws SAXException {
+    public void endElement(String uri, String localName, String qName) {
         currentBuilder = null;
         if ("page".equals(qName)) {
             endPage();
@@ -346,7 +353,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
     /**
      * Parses the given XML file with a freshly created SAX parser, using this
      * object as the event handler.
      *
      * @param file the file handed to the SAX parser
      * @throws ParserConfigurationException if the parser cannot be created
      * @throws SAXException if the parser cannot be created or parsing fails
      * @throws IOException if reading the file fails
      */
     public void parse(final File file) throws ParserConfigurationException,
         SAXException, IOException {
-        final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
+        final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
         // The handler passed to the parser is this instance.
         parser.parse(file, this);
     }