]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/WiktionarySplitter.java
WiktionarySplitter: implement parallel processing
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / WiktionarySplitter.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.BufferedInputStream;
18 import java.io.BufferedOutputStream;
19 import java.io.DataOutputStream;
20 import java.io.File;
21 import java.io.FileInputStream;
22 import java.io.FileOutputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.OutputStream;
26 import java.nio.charset.StandardCharsets;
27 import java.util.ArrayList;
28 import java.util.LinkedHashMap;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.concurrent.ExecutorService;
32 import java.util.concurrent.Executors;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35
36 import javax.xml.parsers.ParserConfigurationException;
37 import javax.xml.parsers.SAXParser;
38 import javax.xml.parsers.SAXParserFactory;
39
40 import org.apache.commons.compress.compressors.CompressorStreamFactory;
41 import org.xml.sax.Attributes;
42 import org.xml.sax.SAXException;
43
44 import com.hughes.android.dictionary.parser.wiktionary.WiktionaryLangs;
45
46 public class WiktionarySplitter extends org.xml.sax.helpers.DefaultHandler implements Runnable {
47
48     // The matches the whole line, otherwise regexes don't work well on French:
49     // {{=uk=}}
50     // Spanish has no initial headings, tried to also detect {{ES as such
51     // with "^(\\{\\{ES|(=+)[^=]).*$" but that broke English.
52     static final Pattern headingStartPattern = Pattern.compile("^(=+)[^=].*$", Pattern.MULTILINE);
53     static final Pattern startSpanish = Pattern.compile("\\{\\{ES(\\|[^{}=]*)?}}");
54
55     final Map.Entry<String, List<Selector>> pathToSelectorsEntry;
56     List<Selector> currentSelectors = null;
57
58     StringBuilder titleBuilder;
59     StringBuilder textBuilder;
60     StringBuilder currentBuilder = null;
61
62     public static void main(final String[] args) throws Exception {
63         boolean parallel = args.length > 0 && args[0].equals("parallel");
64         final ExecutorService e = Executors.newCachedThreadPool();
65         final Map<String,List<Selector>> pathToSelectors = createSelectorsMap();
66         for (final Map.Entry<String, List<Selector>> pathToSelectorsEntry : pathToSelectors.entrySet()) {
67             final WiktionarySplitter wiktionarySplitter = new WiktionarySplitter(pathToSelectorsEntry);
68             if (parallel) {
69                 e.submit(wiktionarySplitter);
70             } else wiktionarySplitter.go();
71         }
72         e.shutdown();
73     }
74
75     private WiktionarySplitter(final Map.Entry<String, List<Selector>> pathToSelectorsEntry) {
76         this.pathToSelectorsEntry = pathToSelectorsEntry;
77     }
78
79     private static Map<String,List<Selector>> createSelectorsMap() {
80         final Map<String,List<Selector>> pathToSelectors = new LinkedHashMap<>();
81         List<Selector> selectors;
82         for (final String code : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.keySet()) {
83             //if (!code.equals("fr")) {continue;}
84             selectors = new ArrayList<>();
85             pathToSelectors.put(String.format("data/inputs/%swiktionary-pages-articles.xml", code), selectors);
86             for (final Map.Entry<String, String> entry : WiktionaryLangs.wikiCodeToIsoCodeToWikiName.get(code).entrySet()) {
87                 final String dir = String.format("data/inputs/wikiSplit/%s", code);
88                 new File(dir).mkdirs();
89                 selectors.add(new Selector(String.format("%s/%s.data", dir, entry.getKey()), entry.getValue()));
90             }
91         }
92         return pathToSelectors;
93     }
94
95     @Override
96     public void run() {
97         try {
98             go();
99         } catch (Exception e) {
100             throw new RuntimeException(e);
101         }
102     }
103
104     private void go() throws Exception {
105         final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
106
107         // Configure things.
108
109             currentSelectors = pathToSelectorsEntry.getValue();
110
111             for (final Selector selector : currentSelectors) {
112                 OutputStream tmp = new FileOutputStream(selector.outFilename + ".gz");
113                 tmp = new BufferedOutputStream(tmp);
114                 tmp = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, tmp);
115                 tmp = new WriteBuffer(tmp, 1024 * 1024);
116                 selector.out = new DataOutputStream(tmp);
117             }
118
119             // Do it.
120             try {
121                 File input = new File(pathToSelectorsEntry.getKey() + ".bz2");
122                 if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".gz");
123                 if (!input.exists()) input = new File(pathToSelectorsEntry.getKey() + ".xz");
124                 if (!input.exists()) {
125                     // Fallback to uncompressed file
126                     parser.parse(new File(pathToSelectorsEntry.getKey()), this);
127                 } else {
128                     InputStream compressedIn = new BufferedInputStream(new FileInputStream(input));
129                     InputStream in = new CompressorStreamFactory().createCompressorInputStream(compressedIn);
130                     in = new ReadAheadBuffer(in, 20 * 1024 * 1024);
131                     parser.parse(new BufferedInputStream(in), this);
132                 }
133             } catch (Exception e) {
134                 System.err.println("Exception during parse, lastPageTitle=" + lastPageTitle + ", titleBuilder=" + titleBuilder + " of file " + pathToSelectorsEntry.getKey());
135                 throw e;
136             }
137
138             // Shutdown.
139             for (final Selector selector : currentSelectors) {
140                 selector.out.close();
141             }
142     }
143
144     String lastPageTitle = null;
145     int pageCount = 0;
146     final Matcher[] endPatterns = new Matcher[100];
147
148     private Matcher getEndPattern(int depth) {
149         if (endPatterns[depth] == null)
150             endPatterns[depth] = Pattern.compile(String.format("^={1,%d}[^=].*$", depth), Pattern.MULTILINE).matcher("");
151         return endPatterns[depth];
152     }
153
154     private void endPage() {
155         final String title = titleBuilder.toString();
156         lastPageTitle = title;
157         if (++pageCount % 100000 == 0) {
158             System.out.println("endPage: " + title + ", count=" + pageCount);
159         }
160         if (title.startsWith("Unsupported titles/")) return;
161         if (title.contains(":")) {
162             if (title.startsWith("Wiktionary:") ||
163                 title.startsWith("Appendix:") ||
164                 title.startsWith("Help:") ||
165                 title.startsWith("Index:") ||
166                 title.startsWith("MediaWiki:") ||
167                 title.startsWith("Citations:") ||
168                 title.startsWith("Concordance:") ||
169                 title.startsWith("Glossary:") ||
170                 title.startsWith("Rhymes:") ||
171                 title.startsWith("Category:") ||
172                 title.startsWith("Wikisaurus:") ||
173                 title.startsWith("Transwiki:") ||
174                 title.startsWith("File:") ||
175                 title.startsWith("Thread:") ||
176                 title.startsWith("Template:") ||
177                 title.startsWith("Summary:") ||
178                 title.startsWith("Module:") ||
179                 title.startsWith("Reconstruction:") ||
180                 // DE
181                 title.startsWith("Datei:") ||
182                 title.startsWith("Verzeichnis:") ||
183                 title.startsWith("Vorlage:") ||
184                 title.startsWith("Thesaurus:") ||
185                 title.startsWith("Kategorie:") ||
186                 title.startsWith("Hilfe:") ||
187                 title.startsWith("Reim:") ||
188                 title.startsWith("Modul:") ||
189                 // FR:
190                 title.startsWith("Annexe:") ||
191                 title.startsWith("Catégori:") ||
192                 title.startsWith("Modèle:") ||
193                 title.startsWith("Thésaurus:") ||
194                 title.startsWith("Projet:") ||
195                 title.startsWith("Aide:") ||
196                 title.startsWith("Fichier:") ||
197                 title.startsWith("Wiktionnaire:") ||
198                 title.startsWith("Translations:Wiktionnaire:") ||
199                 title.startsWith("Translations:Projet:") ||
200                 title.startsWith("Catégorie:") ||
201                 title.startsWith("Portail:") ||
202                 title.startsWith("utiliusateur:") ||
203                 title.startsWith("Kategorio:") ||
204                 title.startsWith("Tutoriel:") ||
205                 // IT
206                 title.startsWith("Wikizionario:") ||
207                 title.startsWith("Appendice:") ||
208                 title.startsWith("Categoria:") ||
209                 title.startsWith("Aiuto:") ||
210                 title.startsWith("Portail:") ||
211                 title.startsWith("Modulo:") ||
212                 // ES
213                 title.startsWith("Apéndice:") ||
214                 title.startsWith("Archivo:") ||
215                 title.startsWith("Ayuda:") ||
216                 title.startsWith("Categoría:") ||
217                 title.startsWith("Plantilla:") ||
218                 title.startsWith("Wikcionario:") ||
219
220                 // PT
221                 title.startsWith("Ajuda:") ||
222                 title.startsWith("Apêndice:") ||
223                 title.startsWith("Citações:") ||
224                 title.startsWith("Portal:") ||
225                 title.startsWith("Predefinição:") ||
226                 title.startsWith("Vocabulário:") ||
227                 title.startsWith("Wikcionário:") ||
228                 title.startsWith("Módulo:") ||
229
230                 // sentinel
231                 false
232                ) return;
233             // leave the Flexion: pages in for now and do not warn about them
234             if (!title.startsWith("Sign gloss:") && !title.startsWith("Flexion:")) {
235                 System.err.println("title with colon: " + title);
236             }
237         }
238
239         String text = textBuilder.toString();
240         // Workaround for Spanish wiktionary {{ES}} and {{ES|word}} patterns
241         text = startSpanish.matcher(text).replaceAll("== {{lengua|es}} ==");
242         String translingual = "";
243         int start = 0;
244         Matcher headingStart = headingStartPattern.matcher(text);
245
246         while (start < text.length()) {
247             // Find start.
248             if (!headingStart.find(start)) {
249                 return;
250             }
251             start = headingStart.end();
252
253             final String heading = headingStart.group();
254
255             // For Translingual entries just store the text for later
256             // use in the per-language sections
257             if (heading.contains("Translingual")) {
258                 // Find end.
259                 final int depth = headingStart.group(1).length();
260                 final Matcher endMatcher = getEndPattern(depth).reset(text);
261
262                 if (endMatcher.find(start)) {
263                     int end = endMatcher.start();
264                     translingual = text.substring(start, end);
265                     start = end;
266                     continue;
267                 }
268             }
269
270             for (final Selector selector : currentSelectors) {
271                 if (selector.pattern.reset(heading).find()) {
272                     // Find end.
273                     final int depth = headingStart.group(1).length();
274                     final Matcher endMatcher = getEndPattern(depth).reset(text);
275
276                     final int end;
277                     if (endMatcher.find(start)) {
278                         end = endMatcher.start();
279                     } else {
280                         end = text.length();
281                     }
282
283                     String sectionText = text.substring(start, end);
284                     // Hack to remove empty dummy section from French
285                     if (sectionText.startsWith("\n=== {{S|étymologie}} ===\n: {{ébauche-étym")) {
286                         int dummy_end = sectionText.indexOf("}}", 41) + 2;
287                         while (dummy_end + 1 < sectionText.length() &&
288                                 sectionText.charAt(dummy_end) == '\n' &&
289                                 sectionText.charAt(dummy_end + 1) == '\n') ++dummy_end;
290                         sectionText = sectionText.substring(dummy_end);
291                     }
292                     if (!heading.contains("Japanese")) sectionText += translingual;
293                     final Section section = new Section(title, heading, sectionText);
294
295                     try {
296                         selector.out.writeUTF(section.title);
297                         selector.out.writeUTF(section.heading);
298                         final byte[] bytes = section.text.getBytes(StandardCharsets.UTF_8);
299                         selector.out.writeInt(bytes.length);
300                         selector.out.write(bytes);
301                     } catch (IOException e) {
302                         throw new RuntimeException(e);
303                     }
304
305                     start = end;
306                     break;
307                 }
308             }
309         }
310
311     }
312
313     // -----------------------------------------------------------------------
314
315     static class Section implements java.io.Serializable {
316         private static final long serialVersionUID = -7676549898325856822L;
317
318         final String title;
319         final String heading;
320         final String text;
321
322         public Section(final String title, final String heading, final String text) {
323             this.title = title;
324             this.heading = heading;
325             this.text = text;
326
327             //System.out.printf("TITLE:%s\nHEADING:%s\nTEXT:%s\n\n\n\n\n\n", title, heading, text);
328         }
329     }
330
331     static class Selector {
332         final String outFilename;
333         final Matcher pattern;
334
335         DataOutputStream out;
336
337         public Selector(final String filename, final String pattern) {
338             this.outFilename = filename;
339             this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE).matcher("");
340         }
341     }
342
343     // -----------------------------------------------------------------------
344
345     @Override
346     public void startElement(String uri, String localName, String qName,
347                              Attributes attributes) {
348         currentBuilder = null;
349         if ("page".equals(qName)) {
350             titleBuilder = new StringBuilder();
351
352             // Start with "\n" to better match certain strings.
353             textBuilder = new StringBuilder("\n");
354         } else if ("title".equals(qName)) {
355             currentBuilder = titleBuilder;
356         } else if ("text".equals(qName)) {
357             currentBuilder = textBuilder;
358         }
359     }
360
361     @Override
362     public void characters(char[] ch, int start, int length) {
363         if (currentBuilder != null) {
364             currentBuilder.append(ch, start, length);
365         }
366     }
367
368     @Override
369     public void endElement(String uri, String localName, String qName) {
370         currentBuilder = null;
371         if ("page".equals(qName)) {
372             endPage();
373         }
374     }
375
376     public void parse(final File file) throws ParserConfigurationException,
377         SAXException, IOException {
378         final SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
379         parser.parse(file, this);
380     }
381
382 }