X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=src%2Fcom%2Fhughes%2Fandroid%2Fdictionary%2Fengine%2FWiktionarySplitter.java;h=b148ebb2d70d0c9b9b91c6272f030cf579cb3576;hb=2fc669d88306d563fc9c899d8d91b25d591692ea;hp=f5c85cf0f4f1e8cb41d500892f2bf52f64781c07;hpb=7819736ae570bf597936f0dc640f60644da15fc8;p=DictionaryPC.git diff --git a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java index f5c85cf..b148ebb 100644 --- a/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java +++ b/src/com/hughes/android/dictionary/engine/WiktionarySplitter.java @@ -14,14 +14,20 @@ package com.hughes.android.dictionary.engine; +import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; -import java.util.Arrays; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -29,208 +35,326 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; +import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs; + public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler { - - static class Section implements java.io.Serializable { - private static final long serialVersionUID = -7676549898325856822L; - - final String title; - final String heading; - final String text; - - public Section(final String title, final String heading, final String text) { - this.title = title; - this.heading = heading; - this.text = text; - - //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); + + // The matches the whole line, otherwise regexes don't work well on French: + // {{=uk=}} + // Spanish has no initial headings, tried to also detect {{ES as such + // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English. + static final Matcher headingStart = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE).matcher(""); + static final Matcher startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}").matcher(""); + + final Map> pathToSelectors = new LinkedHashMap<>(); + List currentSelectors = null; + + StringBuilder titleBuilder; + StringBuilder textBuilder; + StringBuilder currentBuilder = null; + + public static void main(final String[] args) throws Exception { + final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); + wiktionarySplitter.go(); } - } - - static class Selector { - DataOutputStream out; - Pattern pattern; - - public Selector(final String filename, final String pattern) throws IOException { - this.out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(filename))); - this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); + + private WiktionarySplitter() { + List selectors; + for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) { + //if (!code.equals("fr")) {continue;} + selectors = new ArrayList<>(); + pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors); + for (final Map.Entry entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) { + final String dir = String.format("data/inputs/wikiSplit/%s", code); + new File(dir).mkdirs(); + selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue())); + } + } } - } - - final List selectors = new ArrayList(); - StringBuilder titleBuilder; - StringBuilder textBuilder; - StringBuilder currentBuilder = null; - - public static void main(final String[] args) throws SAXException, IOException, ParserConfigurationException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); - final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(); - - // Configure things. - final File file = new File(args[0]); - final List selectors = wiktionarySplitter.selectors; - for (int i = 1; i < args.length; i += 2) { - final Selector selector = new Selector(args[i], args[i+1]); - selectors.add(selector); + + private void go() throws Exception { + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + + // Configure things. + for (final Map.Entry> pathToSelectorsEntry : pathToSelectors.entrySet()) { + + currentSelectors = pathToSelectorsEntry.getValue(); + + for (final Selector selector : currentSelectors) { + OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz"); + tmp = new BufferedOutputStream(tmp); + tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp); + tmp = new WriteBuffer(tmp, 20 * 1024 * 1024); + selector.out = new DataOutputStream(tmp); + } + + // Do it. + try { + File input = new File(pathToSelectorsEntry.getKey() + ".bz2"); + if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz"); + if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz"); + if (!input.exists()) { + // Fallback to uncompressed file + parser.parse(new File(pathToSelectorsEntry.getKey()), this); + } else { + InputStream compressedIn = new BufferedInputStream(new FileInputStream(input)); + InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn); + in = new ReadAheadBuffer(in, 20 * 1024 * 1024); + parser.parse(new BufferedInputStream(in), this); + } + } catch (Exception e) { + System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey()); + throw e; + } + + // Shutdown. + for (final Selector selector : currentSelectors) { + selector.out.close(); + } + + } } - if (selectors.isEmpty()) { - selectors.addAll(Arrays.asList( -// new Selector("../DictionaryData/inputs/enWikiSplit/AF.data", ".*[Aa]frikaans.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/AR.data", ".*[Aa]rabic.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/HY.data", ".*[Aa]rmenian.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/HR.data", ".*[Cc]roatian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/CS.data", ".*[Cc]zech.*"), - new Selector("../DictionaryData/inputs/enWikiSplit/ZH.data", ".*[Cc]hinese.*|.*[Mm]andarin.*|.*Cantonese.*") -// new Selector("../DictionaryData/inputs/enWikiSplit/DA.data", ".*[Dd]anish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/NL.data", ".*[Dd]utch.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/EN.data", ".*[Ee]nglish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/FI.data", ".*[Ff]innish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/FR.data", ".*[Ff]rench.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/DE.data", ".*[Gg]erman.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/EL.data", ".*[Gg]reek.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/haw.data", ".*[Hh]awaiian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/HE.data", ".*[Hh]ebrew.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/HI.data", ".*[Hh]indi.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/IS.data", ".*[Ii]celandic.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/GA.data", ".*[Ii]rish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/IT.data", ".*[Ii]talian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/LT.data", ".*[Ll]ithuanian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/JA.data", ".*[Jj]apanese.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/KO.data", ".*[Kk]orean.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/KU.data", ".*[Kk]urdish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/MS.data", ".*[Mm]alay.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/MI.data", ".*[Mm]aori.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/MN.data", ".*[Mm]ongolian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/NO.data", ".*[Nn]orwegian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/FA.data", ".*[Pp]ersian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/PT.data", ".*[Pp]ortuguese.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/PL.data", ".*[Pp]olish.*") -// new Selector("../DictionaryData/inputs/enWikiSplit/RO.data", ".*[Rr]omanian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/RU.data", ".*[Rr]ussian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/SA.data", ".*[Ss]anskrit.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/SR.data", ".*[Ss]erbian.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/SO.data", ".*[Ss]omali.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/ES.data", ".*[Ss]panish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/SV.data", ".*[Ss]wedish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/TG.data", ".*[Tt]ajik.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/TH.data", ".*[Tt]hai.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/BO.data", ".*[Tt]ibetan.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/TR.data", ".*[Tt]urkish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/UK.data", ".*[Uu]krainian.*") -// new Selector("../DictionaryData/inputs/enWikiSplit/VI.data", ".*[Vv]ietnamese.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/CI.data", ".*[Ww]elsh.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/YI.data", ".*[Yy]iddish.*"), -// new Selector("../DictionaryData/inputs/enWikiSplit/ZU.data", ".*[Zz]ulu.*") - )); + String lastPageTitle = null; + int pageCount = 0; + final Matcher[] endPatterns = new Matcher[100]; + + private Matcher getEndPattern(int depth) { + if (endPatterns[depth] == null) + endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher(""); + return endPatterns[depth]; } - - // Do it. - parser.parse(file, wiktionarySplitter); - - // Shutdown. - for (final Selector selector : selectors) { - selector.out.close(); + + private void endPage() { + final String title = titleBuilder.toString(); + lastPageTitle = title; + if (++pageCount % 100000 == 0) { + System.out.println("endPage: " + title + ", count=" + pageCount); + } + if (title.startsWith("Unsupported titles/")) return; + if (title.contains(":")) { + if (title.startsWith("Wiktionary:") || + title.startsWith("Appendix:") || + title.startsWith("Help:") || + title.startsWith("Index:") || + title.startsWith("MediaWiki:") || + title.startsWith("Citations:") || + title.startsWith("Concordance:") || + title.startsWith("Glossary:") || + title.startsWith("Rhymes:") || + title.startsWith("Category:") || + title.startsWith("Wikisaurus:") || + title.startsWith("Transwiki:") || + title.startsWith("File:") || + title.startsWith("Thread:") || + title.startsWith("Template:") || + title.startsWith("Summary:") || + title.startsWith("Module:") || + title.startsWith("Reconstruction:") || + // DE + title.startsWith("Datei:") || + title.startsWith("Verzeichnis:") || + title.startsWith("Vorlage:") || + title.startsWith("Thesaurus:") || + title.startsWith("Kategorie:") || + title.startsWith("Hilfe:") || + title.startsWith("Reim:") || + title.startsWith("Modul:") || + // FR: + title.startsWith("Annexe:") || + title.startsWith("Catégori:") || + title.startsWith("Modèle:") || + title.startsWith("Thésaurus:") || + title.startsWith("Projet:") || + title.startsWith("Aide:") || + title.startsWith("Fichier:") || + title.startsWith("Wiktionnaire:") || + title.startsWith("Translations:Wiktionnaire:") || + title.startsWith("Translations:Projet:") || + title.startsWith("Catégorie:") || + title.startsWith("Portail:") || + title.startsWith("utiliusateur:") || + title.startsWith("Kategorio:") || + title.startsWith("Tutoriel:") || + // IT + title.startsWith("Wikizionario:") || + title.startsWith("Appendice:") || + title.startsWith("Categoria:") || + title.startsWith("Aiuto:") || + title.startsWith("Portail:") || + title.startsWith("Modulo:") || + // ES + title.startsWith("Apéndice:") || + title.startsWith("Archivo:") || + title.startsWith("Ayuda:") || + title.startsWith("Categoría:") || + title.startsWith("Plantilla:") || + title.startsWith("Wikcionario:") || + + // PT + title.startsWith("Ajuda:") || + title.startsWith("Apêndice:") || + title.startsWith("Citações:") || + title.startsWith("Portal:") || + title.startsWith("Predefinição:") || + title.startsWith("Vocabulário:") || + title.startsWith("Wikcionário:") || + title.startsWith("Módulo:") || + + // sentinel + false + ) return; + // leave the Flexion: pages in for now and do not warn about them + if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) { + System.err.println("title with colon: " + title); + } + } + + String text = textBuilder.toString(); + // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns + text = startSpanish.reset(text).replaceAll("== {{lengua|es}} =="); + String translingual = ""; + int start = 0; + headingStart.reset(text); + + while (start < text.length()) { + // Find start. + if (!headingStart.find(start)) { + return; + } + start = headingStart.end(); + + final String heading = headingStart.group(); + + // For Translingual entries just store the text for later + // use in the per-language sections + if (heading.contains("Translingual")) { + // Find end. + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); + + if (endMatcher.find(start)) { + int end = endMatcher.start(); + translingual = text.substring(start, end); + start = end; + continue; + } + } + + for (final Selector selector : currentSelectors) { + if (selector.pattern.reset(heading).find()) { + // Find end. + final int depth = headingStart.group(1).length(); + final Matcher endMatcher = getEndPattern(depth).reset(text); + + final int end; + if (endMatcher.find(start)) { + end = endMatcher.start(); + } else { + end = text.length(); + } + + String sectionText = text.substring(start, end); + // Hack to remove empty dummy section from French + if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) { + int dummy_end = sectionText.indexOf("}}", 41) + 2; + while (dummy_end + 1 < sectionText.length() && + sectionText.charAt(dummy_end) == '\n' && + sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end; + sectionText = sectionText.substring(dummy_end); + } + if (!heading.contains("Japanese")) sectionText += translingual; + final Section section = new Section(title, heading, sectionText); + + try { + selector.out.writeUTF(section.title); + selector.out.writeUTF(section.heading); + final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8); + selector.out.writeInt(bytes.length); + selector.out.write(bytes); + } catch (IOException e) { + throw new RuntimeException(e); + } + + start = end; + break; + } + } + } + } - } - - static final Pattern headingStart = Pattern.compile("^(=+)[^=]+=+", Pattern.MULTILINE); - - int pageCount = 0; - private void endPage() { - final String title = titleBuilder.toString(); - if (++pageCount % 1000 == 0) { - System.out.println("endPage: " + title + ", count=" + pageCount); + + // ----------------------------------------------------------------------- + + static class Section implements java.io.Serializable { + private static final long serialVersionUID = -7676549898325856822L; + + final String title; + final String heading; + final String text; + + public Section(final String title, final String heading, final String text) { + this.title = title; + this.heading = heading; + this.text = text; + + //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text); + } } - - String text = textBuilder.toString(); - - while (text.length() > 0) { - // Find start. - final Matcher startMatcher = headingStart.matcher(text); - if (!startMatcher.find()) { - return; - } - text = text.substring(startMatcher.end()); - - final String heading = startMatcher.group(); - for (final Selector selector : selectors) { - if (selector.pattern.matcher(heading).find()) { - - // Find end. - final int depth = startMatcher.group(1).length(); - final Pattern endPattern = Pattern.compile(String.format("^={1,%d}[^=]+=+", depth), Pattern.MULTILINE); - - final Matcher endMatcher = endPattern.matcher(text); - final int end; - if (endMatcher.find()) { - end = endMatcher.start(); - } else { - end = text.length(); - } - - final String sectionText = text.substring(0, end); - final Section section = new Section(title, heading, sectionText); - - try { - selector.out.writeUTF(section.title); - selector.out.writeUTF(section.heading); - final byte[] bytes = section.text.getBytes("UTF8"); - selector.out.writeInt(bytes.length); - selector.out.write(bytes); - } catch (IOException e) { - throw new RuntimeException(e); - } - - text = text.substring(end); + + static class Selector { + final String outFilename; + final Matcher pattern; + + DataOutputStream out; + + public Selector(final String filename, final String pattern) { + this.outFilename = filename; + this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher(""); } - } } - - } - // ----------------------------------------------------------------------- - + // ----------------------------------------------------------------------- + @Override public void startElement(String uri, String localName, String qName, - Attributes attributes) { - currentBuilder = null; - if ("page".equals(qName)) { - titleBuilder = new StringBuilder(); - - // Start with "\n" to better match certain strings. - textBuilder = new StringBuilder("\n"); - } else if ("title".equals(qName)) { - currentBuilder = titleBuilder; - } else if ("text".equals(qName)) { - currentBuilder = textBuilder; - } + Attributes attributes) { + currentBuilder = null; + if ("page".equals(qName)) { + titleBuilder = new StringBuilder(); + + // Start with "\n" to better match certain strings. + textBuilder = new StringBuilder("\n"); + } else if ("title".equals(qName)) { + currentBuilder = titleBuilder; + } else if ("text".equals(qName)) { + currentBuilder = textBuilder; + } } @Override - public void characters(char[] ch, int start, int length) throws SAXException { - if (currentBuilder != null) { - currentBuilder.append(ch, start, length); - } + public void characters(char[] ch, int start, int length) { + if (currentBuilder != null) { + currentBuilder.append(ch, start, length); + } } @Override - public void endElement(String uri, String localName, String qName) - throws SAXException { - currentBuilder = null; - if ("page".equals(qName)) { - endPage(); - } + public void endElement(String uri, String localName, String qName) { + currentBuilder = null; + if ("page".equals(qName)) { + endPage(); + } } - public void parse(final File file) throws ParserConfigurationException, SAXException, IOException { - final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); - parser.parse(file, this); + final SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); + parser.parse(file, this); } - + }