]> gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
Improvements to wikisplit code.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
index 34cf2d7436d36ffc99b315f232d81cb283ac1ff5..42f7a724d478eab03bf6fd3546b1b00f8642e347 100644 (file)
@@ -39,6 +39,8 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
 
   // This matches the whole line, otherwise regexes don't work well on French:
   // {{=uk=}}
+  // Spanish has no initial headings; we tried to also detect {{ES as a heading
+  // with "^(\\{\\{ES|(=+)[^=]).*$", but that broke English.
   static final Pattern headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
   
   final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<String, List<Selector>>();
@@ -103,6 +105,69 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
     if (++pageCount % 1000 == 0) {
       System.out.println("endPage: " + title + ", count=" + pageCount);
     }
+    if (title.startsWith("Wiktionary:") || 
+            title.startsWith("Appendix:") || 
+            title.startsWith("Help:") ||
+            title.startsWith("Index:") ||
+            title.startsWith("MediaWiki:") || 
+            title.startsWith("Citations:") || 
+            title.startsWith("Concordance:") || 
+            title.startsWith("Glossary:") || 
+            title.startsWith("Rhymes:") || 
+            title.startsWith("Category:") || 
+            title.startsWith("Wikisaurus:") || 
+            title.startsWith("Unsupported titles/") || 
+            title.startsWith("Transwiki:") || 
+            title.startsWith("File:") || 
+            title.startsWith("Thread:") || 
+            title.startsWith("Template:") ||
+            title.startsWith("Summary:") ||
+            title.startsWith("Module:") ||
+            // DE
+            title.startsWith("Datei:") ||
+            title.startsWith("Verzeichnis:") ||
+            title.startsWith("Vorlage:") ||
+            title.startsWith("Thesaurus:") ||
+            title.startsWith("Kategorie:") ||
+            title.startsWith("Hilfe:") ||
+            title.startsWith("Reim:") ||
+            // FR:
+            title.startsWith("Annexe:") ||
+            title.startsWith("Catégori:") ||
+            title.startsWith("Modèle:") ||
+            title.startsWith("Thésaurus:") ||
+            title.startsWith("Projet:") ||
+            title.startsWith("Aide:") ||
+            title.startsWith("Fichier:") ||
+            title.startsWith("Wiktionnaire:") ||
+            title.startsWith("Catégorie:") ||
+            title.startsWith("Portail:") ||
+            title.startsWith("utiliusateur:") ||
+            title.startsWith("Kategorio:") ||
+            // IT
+            title.startsWith("Wikizionario:") ||
+            title.startsWith("Appendice:") ||
+            title.startsWith("Categoria:") ||
+            title.startsWith("Aiuto:") ||
+            title.startsWith("Portail:") ||
+            // ES
+            title.startsWith("Apéndice:") ||
+            title.startsWith("Archivo:") ||
+            title.startsWith("Ayuda:") ||
+            title.startsWith("Categoría:") ||
+            title.startsWith("Plantilla:") ||
+            title.startsWith("Wikcionario:") ||
+
+            // sentinel
+            false
+            ) {
+        return;
+    }
+    if (title.contains(":")) {
+        if (!title.startsWith("Sign gloss:")) {
+            System.err.println("title with colon: " + title);
+        }
+    }
     
     String text = textBuilder.toString();
     
@@ -130,7 +195,16 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
             end = text.length();
           }
           
-          final String sectionText = text.substring(0, end);
+          String sectionText = text.substring(0, end);
+          // Hack to remove empty dummy section from French
+          if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym"))
+          {
+              int dummy_end = sectionText.indexOf("}}", 41) + 2;
+              while (dummy_end + 1 < sectionText.length() &&
+                     sectionText.charAt(dummy_end) == '\n' &&
+                     sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
+              sectionText = sectionText.substring(dummy_end);
+          }
           final Section section = new Section(title, heading, sectionText);
           
           try {
@@ -214,13 +288,10 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
       }
     }
     
-
     public void parse(final File file) throws ParserConfigurationException,
         SAXException, IOException {
       final SAXParser parser = SAXParserFactoryImpl.newInstance().newSAXParser();
       parser.parse(file, this);
     }
-
-    
     
 }