X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=435c3f212cd623cf9456054768d6feab355abfe4;hb=db867ac06ddeb858f5a70f682e71826346c31895;hp=28b11bc669f3cbdb146a4a41c4dc2afc802ae2a9;hpb=cde823acd318d713335c4ec5ea35e708063abea5;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 28b11bc..435c3f2 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -105,7 +105,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { parser.parse(new BufferedInputStream(in), this); } } catch (Exception e) { - System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString()); + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder.toString() + " of file " + pathToSelectorsEntry.getKey()); throw e; } @@ -130,10 +130,12 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { private void endPage() { final String title = titleBuilder.toString(); lastPageTitle = title; - if (++pageCount % 1000 == 0) { + if (++pageCount % 100000 == 0) { System.out.println("endPage: " + title + ", count=" + pageCount); } - if (title.startsWith("Wiktionary:") || + if (title.startsWith("Unsupported titles/")) return; + if (title.contains(":")) { + if (title.startsWith("Wiktionary:") || title.startsWith("Appendix:") || title.startsWith("Help:") || title.startsWith("Index:") || @@ -144,7 +146,6 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Rhymes:") || title.startsWith("Category:") || title.startsWith("Wikisaurus:") || - title.startsWith("Unsupported titles/") || title.startsWith("Transwiki:") || title.startsWith("File:") || title.startsWith("Thread:") || @@ -186,27 +187,36 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Plantilla:") || title.startsWith("Wikcionario:") || + // PT + title.startsWith("Ajuda:") || + title.startsWith("Apêndice:") || + title.startsWith("Citações:") || + title.startsWith("Portal:") || + title.startsWith("Predefinição:") || + title.startsWith("Vocabulário:") || + title.startsWith("Wikcionário:") || + // sentinel false - ) { - return; - } - if (title.contains(":")) { + ) return; if (!title.startsWith("Sign gloss:")) { System.err.println("title with colon: " + title); } } String text = textBuilder.toString(); + // Workaround for Spanish wiktionary {{ES}} pattern + text = text.replace("{{ES}}", "== {{lengua|es}} =="); String translingual = ""; + int start = 0; + final Matcher startMatcher = headingStart.matcher(text); - while (text.length() > 0) { + while (start < text.length()) { // Find start. - final Matcher startMatcher = headingStart.matcher(text); - if (!startMatcher.find()) { + if (!startMatcher.find(start)) { return; } - text = text.substring(startMatcher.end()); + start = startMatcher.end(); final String heading = startMatcher.group(); @@ -218,10 +228,10 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { final Pattern endPattern = getEndPattern(depth); final Matcher endMatcher = endPattern.matcher(text); - if (endMatcher.find()) { + if (endMatcher.find(start)) { int end = endMatcher.start(); - translingual = text.substring(0, endMatcher.start()); - text = text.substring(end); + translingual = text.substring(start, end); + start = end; continue; } } @@ -234,13 +244,13 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { final Matcher endMatcher = endPattern.matcher(text); final int end; - if (endMatcher.find()) { + if (endMatcher.find(start)) { end = endMatcher.start(); } else { end = text.length(); } - String sectionText = text.substring(0, end); + String sectionText = text.substring(start, end); // Hack to remove empty dummy section from French if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) { int dummy_end = sectionText.indexOf("}}", 41) + 2; @@ -262,7 +272,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { throw new RuntimeException(e); } - text = text.substring(end); + start = end; break; } }