gitweb.fperrin.net Git - DictionaryPC.git/blobdiff - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
Exclude some more special titles not relevant for us.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
index 37344cae59362fa1e5da2342bb36e2e98f708724..d39e6c3626e9993ec4ded178c93b0487197f40ae 100644 (file)
@@ -105,7 +105,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                     parser.parse(new BufferedInputStream(in), this);
                 }
             } catch (Exception e) {
-                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString());
+                System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey());
                 throw e;
             }
 
@@ -152,6 +152,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Template:") ||
                 title.startsWith("Summary:") ||
                 title.startsWith("Module:") ||
+                title.startsWith("Reconstruction:") ||
                 // DE
                 title.startsWith("Datei:") ||
                 title.startsWith("Verzeichnis:") ||
@@ -160,6 +161,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Kategorie:") ||
                 title.startsWith("Hilfe:") ||
                 title.startsWith("Reim:") ||
+                title.startsWith("Modul:") ||
                 // FR:
                 title.startsWith("Annexe:") ||
                 title.startsWith("Catégori:") ||
@@ -169,16 +171,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Aide:") ||
                 title.startsWith("Fichier:") ||
                 title.startsWith("Wiktionnaire:") ||
+                title.startsWith("Translations:Wiktionnaire:") ||
+                title.startsWith("Translations:Projet:") ||
                 title.startsWith("Catégorie:") ||
                 title.startsWith("Portail:") ||
                 title.startsWith("utiliusateur:") ||
                 title.startsWith("Kategorio:") ||
+                title.startsWith("Tutoriel:") ||
                 // IT
                 title.startsWith("Wikizionario:") ||
                 title.startsWith("Appendice:") ||
                 title.startsWith("Categoria:") ||
                 title.startsWith("Aiuto:") ||
                 title.startsWith("Portail:") ||
+                title.startsWith("Modulo:") ||
                 // ES
                 title.startsWith("Apéndice:") ||
                 title.startsWith("Archivo:") ||
@@ -187,15 +193,28 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler {
                 title.startsWith("Plantilla:") ||
                 title.startsWith("Wikcionario:") ||
 
+                // PT
+                title.startsWith("Ajuda:") ||
+                title.startsWith("Apêndice:") ||
+                title.startsWith("Citações:") ||
+                title.startsWith("Portal:") ||
+                title.startsWith("Predefinição:") ||
+                title.startsWith("Vocabulário:") ||
+                title.startsWith("Wikcionário:") ||
+                title.startsWith("Módulo:") ||
+
                 // sentinel
                 false
                ) return;
-            if (!title.startsWith("Sign gloss:")) {
+            // leave the Flexion: pages in for now and do not warn about them
+            if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
                 System.err.println("title with colon: " + title);
             }
         }
 
         String text = textBuilder.toString();
+        // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
+        text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} ==");
         String translingual = "";
         int start = 0;
         final Matcher startMatcher = headingStart.matcher(text);