X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=d39e6c3626e9993ec4ded178c93b0487197f40ae;hb=2357e7d97f1efe9d3527d5a73d470fe9f518786e;hp=37344cae59362fa1e5da2342bb36e2e98f708724;hpb=eec1a89b6cdffe7048aefa3cb2b3497b1744be99;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 37344ca..d39e6c3 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -105,7 +105,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { parser.parse(new BufferedInputStream(in), this); } } catch (Exception e) { - System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey()); throw e; } @@ -152,6 +152,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Template:") || title.startsWith("Summary:") || title.startsWith("Module:") || + title.startsWith("Reconstruction:") || // DE title.startsWith("Datei:") || title.startsWith("Verzeichnis:") || @@ -160,6 +161,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Kategorie:") || title.startsWith("Hilfe:") || title.startsWith("Reim:") || + title.startsWith("Modul:") || // FR: title.startsWith("Annexe:") || title.startsWith("Catégori:") || @@ -169,16 +171,20 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Aide:") || title.startsWith("Fichier:") || title.startsWith("Wiktionnaire:") || + title.startsWith("Translations:Wiktionnaire:") || + title.startsWith("Translations:Projet:") || title.startsWith("Catégorie:") || title.startsWith("Portail:") || title.startsWith("utiliusateur:") || title.startsWith("Kategorio:") || + title.startsWith("Tutoriel:") || // IT title.startsWith("Wikizionario:") || title.startsWith("Appendice:") || title.startsWith("Categoria:") || title.startsWith("Aiuto:") || title.startsWith("Portail:") || + title.startsWith("Modulo:") || // ES title.startsWith("Apéndice:") || title.startsWith("Archivo:") || @@ -187,15 +193,28 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Plantilla:") || title.startsWith("Wikcionario:") || + // PT + title.startsWith("Ajuda:") || + title.startsWith("Apêndice:") || + title.startsWith("Citações:") || + title.startsWith("Portal:") || + title.startsWith("Predefinição:") || + title.startsWith("Vocabulário:") || + title.startsWith("Wikcionário:") || + title.startsWith("Módulo:") || + // sentinel false ) return; - if (!title.startsWith("Sign gloss:")) { + // leave the Flexion: pages in for now and do not warn about them + if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) { System.err.println("title with colon: " + title); } } String text = textBuilder.toString(); + // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns + text = text.replaceAll("\\{\\{ES(\\|[^{}=]*)?}}", "== {{lengua|es}} =="); String translingual = ""; int start = 0; final Matcher startMatcher = headingStart.matcher(text);