]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
72ad9eba138e81f957c4633ba661eacefeb00b5a
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderTest.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.File;
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.io.PrintStream;
21 import java.io.RandomAccessFile;
22 import java.util.Collections;
23
24 import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
25 import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
26 import com.hughes.util.FileUtil;
27
28 import junit.framework.TestCase;
29
30 public class DictionaryBuilderTest extends TestCase {
31
32     public static final String TEST_INPUTS = "testdata/inputs/";
33     public static final String WIKISPLIT = "data/inputs/wikiSplit/";
34     public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
35     public static final String STOPLISTS = "data/inputs/stoplists/";
36     public static final String GOLDENS = "testdata/goldens/";
37
38     public static final String TEST_OUTPUTS = "testdata/outputs/";
39
40     public void testItConj() throws Exception {
41         final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
42                                "{{it-conj-are|accus|avere}}\n" +
43                                "{{it-conj-care|pag|avere or essere}}\n" +
44                                "{{it-conj-iare|studi|avere}}\n" +
45                                "{{it-conj-iare-b|avvi|avere}}\n" +
46                                "{{it-conj-ciare|pronunc|avere}}\n" +
47                                "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
48                                "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
49                                "{{it-conj-ere|abbatt|avere}}\n" +
50                                "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
51                                "{{it-conj-ire-b|prefer|avere}}\n" +
52                                "{{it-conj-urre|prod|avere}}\n" +
53                                "{{it-conj-arsi|lav}}\n" +
54                                "{{it-conj-ersi|abbatt}}\n" +
55                                "{{it-conj-iarsi|annoi}}\n" +
56                                "{{it-conj-carsi|coniug}}\n" +
57                                "{{it-conj-ciarsi|affacc}}\n" +
58                                "{{it-conj-irsi|vest}}\n" +
59                                "{{it-conj-irsi-b|fer}}\n" +
60                                "{{it-conj-ursi|rid|essere}}\n" +
61                                "{{it-conj-cire|ricuc|avere}}\n" +
62                                "{{it-conj-iarsi-b|riavvi|essere}}" +
63                                "{{it-conj-fare|putre|avere}}\n" +
64                                "{{it-conj-cirsi|cuc|essere}}\n" +
65                                "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" +
66                                "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" +
67                                "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" +
68                                "{{term|verbo|verbō|for the word}}\n"
69                                ;
70         final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
71         WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
72         parser.title = "dummyTitle";
73         parser.entrySource = new EntrySource(0, "dummySource", 0);
74         parser.parseSection("dummyHeading", toParse);
75         db.build();
76
77         final String dictName = "testItConj.html";
78         final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
79         db.dictionary.print(out);
80         out.close();
81
82         assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
83     }
84
85     public void doTestCustomDict(final String name, final String lang1,
86                                  final String lang2, final String inputFile) throws Exception {
87         final File result = new File(TEST_OUTPUTS + name);
88         System.out.println("Writing to: " + result);
89         DictionaryBuilder.main(new String[] {
90                                    "--dictOut=" + result.getAbsolutePath(),
91                                    "--lang1=" + lang1,
92                                    "--lang2=" + lang2,
93                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
94                                    "--lang2Stoplist=" + STOPLISTS + "empty.txt",
95                                    "--dictInfo=bleh.",
96
97                                    "--input1=testdata/inputs/" + inputFile,
98                                    "--input1Name=my_input_" + name,
99                                    "--input1Charset=ISO-8859-1",
100                                    "--input1Format=tab_separated",
101
102                                    "--print=" + result.getPath() + ".text",
103                                });
104
105         checkGolden(name, result);
106     }
107
108     public void test_FR_NL() throws Exception {
109         doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
110     }
111
112     public void testWiktionary_en_de2fr() throws Exception {
113         wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
114     }
115
116     public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
117             final String lang2) throws Exception {
118         final File result = new File(TEST_OUTPUTS + name);
119         System.out.println("Writing to: " + result);
120         DictionaryBuilder.main(new String[] {
121                                    "--dictOut=" + result.getAbsolutePath(),
122                                    "--lang1=" + lang1,
123                                    "--lang2=" + lang2,
124                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
125                                    "--lang2Stoplist=" + STOPLISTS + "empty.txt",
126                                    "--dictInfo=SomeWikiDataTrans2Trans",
127
128                                    "--input4=" + WIKISPLIT_EN + "EN.data",
129                                    "--input4Name=" + name,
130                                    "--input4Format=" + EnTranslationToTranslationParser.NAME,
131                                    "--input4LangPattern1=" + lang1,
132                                    "--input4LangPattern2=" + lang2,
133                                    "--input4PageLimit=1000",
134
135                                    "--print=" + result.getPath() + ".text",
136                                });
137
138         checkGolden(name, result);
139     }
140
141     public void testWiktionary_WholeSection_DE() throws Exception {
142         enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100);
143     }
144
145     public void testWiktionary_WholeSection_EN() throws Exception {
146         enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100);
147     }
148
149     public void testWiktionary_WholeSection_IT() throws Exception {
150         // Have to run to 800 to get a few verb conjugations (including essere!)
151         enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800);
152     }
153
154     public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
155         final File result = new File(TEST_OUTPUTS + name);
156         System.out.println("Writing to: " + result);
157         DictionaryBuilder.main(new String[] {
158                                    "--dictOut=" + result.getAbsolutePath(),
159                                    "--lang1=" + langCode,
160                                    "--lang2=" + "EN",
161                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
162                                    "--lang2Stoplist=" + STOPLISTS + "empty.txt",
163                                    "--dictInfo=SomeWikiDataWholeSection",
164
165                                    "--input4=" + WIKISPLIT_EN + langCode + ".data",
166                                    "--input4Name=" + name,
167                                    "--input4Format=" + WholeSectionToHtmlParser.NAME,
168                                    "--input4WiktionaryLang=EN",
169                                    "--input4SkipLang=" + langCode,
170                                    "--input4TitleIndex=" + "1",
171                                    "--input4PageLimit=" + pageLimit,
172
173                                    "--print=" + result.getPath() + ".text",
174                                });
175         checkGolden(name, result);
176     }
177
178     //-----------------------------------------------------------------
179
180     public void testSingleLang_EN() throws Exception {
181         wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100);
182     }
183
184     public void testSingleLang_DE() throws Exception {
185         wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100);
186     }
187
188     public void testSingleLang_IT() throws Exception {
189         wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100);
190     }
191
192     public void testSingleLang_FR() throws Exception {
193         wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100);
194     }
195
196     public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception {
197         final File result = new File(TEST_OUTPUTS + name);
198         System.out.println("Writing to: " + result);
199         DictionaryBuilder.main(new String[] {
200                                    "--dictOut=" + result.getAbsolutePath(),
201                                    "--lang1=" + langCode,
202                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
203                                    "--dictInfo=SomeWikiDataWholeSection",
204                                    "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data",
205                                    "--input4Name=" + name,
206                                    "--input4Format=" + WholeSectionToHtmlParser.NAME,
207                                    "--input4WiktionaryLang=" + langCode,
208                                    "--input4SkipLang=" + langCode,
209                                    "--input4TitleIndex=" + "1",
210                                    "--input4PageLimit=" + pageLimit,
211                                    "--print=" + result.getPath() + ".text",
212                                });
213         checkGolden(name, result);
214     }
215
216     //-----------------------------------------------------------------
217
218     public void testWiktionary_IT_EN() throws Exception {
219         wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
220                                    "EN.data", "enwiktionary.english", "Italian", "it", 1000);
221     }
222
223     public void testWiktionary_cmn_EN() throws Exception {
224         wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt",
225                                    // These missing "e" prevents a complete match, forcing the name to be printed
226                                    "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000);
227     }
228
229     public void testWiktionary_DE_EN() throws Exception {
230         wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
231                                    "EN.data", "enwiktionary.english", "German", "de", 1000);
232     }
233
234     public void testWiktionary_IT_IT() throws Exception {
235         wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
236                                    "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
237     }
238
239     // French
240     public void testWiktionary_FR_FR() throws Exception {
241         wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
242                                    "FR.data", "enwiktionary.french", "French", "fr", 1000);
243     }
244
245
246     // Arabic
247     public void testWiktionary_AR_AR() throws Exception {
248         // Arabic is really big for some reason, use fewer pages.
249         wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
250                                    "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
251     }
252
253     // Chinese
254     public void testWiktionary_cmn_cmn() throws Exception {
255         wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt",
256                                    // These missing "e" prevents a complete match, forcing the name to be printed.
257                                    "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000);
258     }
259
260     // German
261     public void testWiktionary_DE_DE() throws Exception {
262         wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
263                                    "DE.data", "enwiktionary.german", "German", "de", 1000);
264     }
265
266     // Thai
267     public void testWiktionary_TH_TH() throws Exception {
268         wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
269                                    // These missing "e" prevents a complete match, forcing the name to be printed.
270                                    "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
271     }
272
273     public void wiktionaryTestWithLangToEn(final String name, final String lang1,
274                                            final String stoplist, final String data, final String dictName,
275                                            final String langPattern, final String langCode, int pageLimit) throws Exception {
276         final File result = new File(TEST_OUTPUTS + name);
277         System.out.println("Writing to: " + result);
278         final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
279         DictionaryBuilder.main(new String[] {
280                                    "--dictOut=" + result.getAbsolutePath(),
281                                    "--lang1=" + lang1,
282                                    "--lang2=EN",
283                                    "--lang1Stoplist=" + STOPLISTS + stoplist,
284                                    "--lang2Stoplist=" + STOPLISTS + "en.txt",
285                                    "--dictInfo=SomeWikiData",
286
287                                    "--input4=" + WIKISPLIT_EN + data,
288                                    "--input4Name=" + dictName,
289                                    "--input4Format=enwiktionary",
290                                    "--input4WiktionaryType=" + type,
291                                    "--input4LangPattern=" + langPattern,
292                                    "--input4LangCodePattern=" + langCode,
293                                    "--input4EnIndex=2",
294                                    "--input4PageLimit=" + pageLimit,
295
296                                    "--print=" + result.getPath() + ".text",
297                                });
298
299         checkGolden(name, result);
300     }
301
302     public void testGermanCombined() throws Exception {
303         final String name = "de-en.quickdic";
304         final File result = new File(TEST_OUTPUTS + name);
305         System.out.println("Writing to: " + result);
306         DictionaryBuilder.main(new String[] {
307                                    "--dictOut=" + result.getAbsolutePath(),
308                                    "--lang1=DE",
309                                    "--lang2=EN",
310                                    "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
311
312                                    "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
313                                    "--input1Name=chemnitz",
314                                    "--input1Charset=UTF8",
315                                    "--input1Format=chemnitz",
316
317                                    "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
318                                    "--input2Name=dictcc",
319                                    "--input2Charset=UTF8",
320                                    "--input2Format=tab_separated",
321
322                                    "--print=" + result.getPath() + ".text",
323                                });
324
325         checkGolden(name, result);
326     }
327
328     public void testItalianTurkish() throws Exception {
329         final String name = "it-tr_dictcc.quickdic";
330         final File result = new File(TEST_OUTPUTS + name);
331         System.out.println("Writing to: " + result);
332         DictionaryBuilder.main(new String[] {
333                                    "--dictOut=" + result.getAbsolutePath(),
334                                    "--lang1=IT",
335                                    "--lang2=TR",
336                                    "--dictInfo=it-tr_dictcc_simulated",
337
338                                    "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt",
339                                    "--input1Name=dictcc",
340                                    "--input1Charset=UTF8",
341                                    "--input1Format=tab_separated",
342
343                                    "--print=" + result.getPath() + ".text",
344                                });
345
346         checkGolden(name, result);
347     }
348
349     private void checkGolden(final String dictName, final File dictFile)
350     throws IOException, FileNotFoundException {
351         // Check it once:
352         assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
353
354         // Check it again.
355         final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r").getChannel());
356         final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
357         dict.print(out);
358         out.close();
359         assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
360     }
361
362
363     void assertFilesEqual(final String expected, final String actual) throws IOException {
364         final String expectedString = FileUtil.readToString(new File(expected));
365         final String actualString = FileUtil.readToString(new File(actual));
366         assertEquals(expectedString, actualString);
367     }
368
369
370 }