X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=37344cae59362fa1e5da2342bb36e2e98f708724;hb=eec1a89b6cdffe7048aefa3cb2b3497b1744be99;hp=290a58fccc1e38a6c36acdd44f2cb08cf42abb40;hpb=ee1dbfb669462305a1c07e4d804a90af79f5d39f;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index 290a58f..37344ca 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -22,6 +22,7 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.IOException; +import java.io.OutputStream; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; @@ -82,7 +83,11 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { currentSelectors = pathToSelectorsEntry.getValue(); for (final Selector selector : currentSelectors) { - selector.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(selector.outFilename))); + OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz"); + tmp = new BufferedOutputStream(tmp); + tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp); + tmp = new WriteBuffer(tmp, 20 * 1024 * 1024); + selector.out = new DataOutputStream(tmp); } // Do it. @@ -96,6 +101,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { } else { InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + in = new ReadAheadBuffer(in, 20 * 1024 * 1024); parser.parse(new BufferedInputStream(in), this); } } catch (Exception e) { @@ -113,13 +119,23 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { String lastPageTitle = null; int pageCount = 0; + Pattern endPatterns[] = new Pattern[100]; + + private Pattern getEndPattern(int depth) { + if (endPatterns[depth] == null) + endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + return endPatterns[depth]; + } + private void endPage() { final String title = titleBuilder.toString(); lastPageTitle = title; - if (++pageCount % 1000 == 0) { + if (++pageCount % 100000 == 0) { System.out.println("endPage: " + title + ", count=" + pageCount); } - if (title.startsWith("Wiktionary:") || + if (title.startsWith("Unsupported titles/")) return; + if (title.contains(":")) { + if (title.startsWith("Wiktionary:") || title.startsWith("Appendix:") || title.startsWith("Help:") || title.startsWith("Index:") || @@ -130,7 +146,6 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { title.startsWith("Rhymes:") || title.startsWith("Category:") || title.startsWith("Wikisaurus:") || - title.startsWith("Unsupported titles/") || title.startsWith("Transwiki:") || title.startsWith("File:") || title.startsWith("Thread:") || @@ -174,10 +189,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { // sentinel false - ) { - return; - } - if (title.contains(":")) { + ) return; if (!title.startsWith("Sign gloss:")) { System.err.println("title with colon: " + title); } @@ -185,45 +197,49 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { String text = textBuilder.toString(); String translingual = ""; + int start = 0; + final Matcher startMatcher = headingStart.matcher(text); - while (text.length() > 0) { + while (start < text.length()) { // Find start. - final Matcher startMatcher = headingStart.matcher(text); - if (!startMatcher.find()) { + if (!startMatcher.find(start)) { return; } - text = text.substring(startMatcher.end()); + start = startMatcher.end(); final String heading = startMatcher.group(); - for (final Selector selector : currentSelectors) { - if (heading.indexOf("Translingual") != -1) { - // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); - final Matcher endMatcher = endPattern.matcher(text); - if (endMatcher.find()) { - int end = endMatcher.start(); - translingual = text.substring(0, endMatcher.start()); - text = text.substring(end); - break; - } + // For Translingual entries just store the text for later + // use in the per-language sections + if (heading.indexOf("Translingual") != -1) { + // Find end. + final int depth = startMatcher.group(1).length(); + final Pattern endPattern = getEndPattern(depth); + + final Matcher endMatcher = endPattern.matcher(text); + if (endMatcher.find(start)) { + int end = endMatcher.start(); + translingual = text.substring(start, end); + start = end; + continue; } - if (selector.pattern.matcher(heading).find()) { + } + for (final Selector selector : currentSelectors) { + if (selector.pattern.matcher(heading).find()) { // Find end. final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE); + final Pattern endPattern = getEndPattern(depth); final Matcher endMatcher = endPattern.matcher(text); final int end; - if (endMatcher.find()) { + if (endMatcher.find(start)) { end = endMatcher.start(); } else { end = text.length(); } - String sectionText = text.substring(0, end); + String sectionText = text.substring(start, end); // Hack to remove empty dummy section from French if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) { int dummy_end = sectionText.indexOf("}}", 41) + 2; @@ -245,7 +261,7 @@ public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { throw new RuntimeException(e); } - text = text.substring(end); + start = end; break; } }