]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
Minor automated code simplifications.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderTest.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.File;
18 import java.io.IOException;
19 import java.io.PrintStream;
20 import java.io.RandomAccessFile;
21 import java.util.Collections;
22
23 import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
24 import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
25 import com.hughes.util.FileUtil;
26
27 import junit.framework.TestCase;
28
29 public class DictionaryBuilderTest extends TestCase {
30
31     public static final String TEST_INPUTS = "testdata/inputs/";
32     public static final String WIKISPLIT = "data/inputs/wikiSplit/";
33     public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
34     public static final String STOPLISTS = "data/inputs/stoplists/";
35     public static final String GOLDENS = "testdata/goldens/";
36
37     public static final String TEST_OUTPUTS = "testdata/outputs/";
38
39     public void testItConj() throws Exception {
40         final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
41                                "{{it-conj-are|accus|avere}}\n" +
42                                "{{it-conj-care|pag|avere or essere}}\n" +
43                                "{{it-conj-iare|studi|avere}}\n" +
44                                "{{it-conj-iare-b|avvi|avere}}\n" +
45                                "{{it-conj-ciare|pronunc|avere}}\n" +
46                                "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
47                                "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
48                                "{{it-conj-ere|abbatt|avere}}\n" +
49                                "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
50                                "{{it-conj-ire-b|prefer|avere}}\n" +
51                                "{{it-conj-urre|prod|avere}}\n" +
52                                "{{it-conj-arsi|lav}}\n" +
53                                "{{it-conj-ersi|abbatt}}\n" +
54                                "{{it-conj-iarsi|annoi}}\n" +
55                                "{{it-conj-carsi|coniug}}\n" +
56                                "{{it-conj-ciarsi|affacc}}\n" +
57                                "{{it-conj-irsi|vest}}\n" +
58                                "{{it-conj-irsi-b|fer}}\n" +
59                                "{{it-conj-ursi|rid|essere}}\n" +
60                                "{{it-conj-cire|ricuc|avere}}\n" +
61                                "{{it-conj-iarsi-b|riavvi|essere}}" +
62                                "{{it-conj-fare|putre|avere}}\n" +
63                                "{{it-conj-cirsi|cuc|essere}}\n" +
64                                "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" +
65                                "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" +
66                                "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" +
67                                "{{term|verbo|verbō|for the word}}\n"
68                                ;
69         final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
70         WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
71         parser.title = "dummyTitle";
72         parser.entrySource = new EntrySource(0, "dummySource", 0);
73         parser.parseSection("dummyHeading", toParse);
74         db.build();
75
76         final String dictName = "testItConj.html";
77         final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
78         db.dictionary.print(out);
79         out.close();
80
81         assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
82     }
83
84     public void doTestCustomDict(final String name, final String lang1,
85                                  final String lang2, final String inputFile) throws Exception {
86         final File result = new File(TEST_OUTPUTS + name);
87         System.out.println("Writing to: " + result);
88         DictionaryBuilder.main(new String[] {
89                                    "--dictOut=" + result.getAbsolutePath(),
90                                    "--lang1=" + lang1,
91                                    "--lang2=" + lang2,
92                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
93                                    "--lang2Stoplist=" + STOPLISTS + "empty.txt",
94                                    "--dictInfo=bleh.",
95
96                                    "--input1=testdata/inputs/" + inputFile,
97                                    "--input1Name=my_input_" + name,
98                                    "--input1Charset=ISO-8859-1",
99                                    "--input1Format=tab_separated",
100
101                                    "--print=" + result.getPath() + ".text",
102                                });
103
104         checkGolden(name, result);
105     }
106
107     public void test_FR_NL() throws Exception {
108         doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
109     }
110
111     public void testWiktionary_en_de2fr() throws Exception {
112         wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
113     }
114
115     public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
116             final String lang2) throws Exception {
117         final File result = new File(TEST_OUTPUTS + name);
118         System.out.println("Writing to: " + result);
119         DictionaryBuilder.main(new String[] {
120                                    "--dictOut=" + result.getAbsolutePath(),
121                                    "--lang1=" + lang1,
122                                    "--lang2=" + lang2,
123                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
124                                    "--lang2Stoplist=" + STOPLISTS + "empty.txt",
125                                    "--dictInfo=SomeWikiDataTrans2Trans",
126
127                                    "--input4=" + WIKISPLIT_EN + "EN.data",
128                                    "--input4Name=" + name,
129                                    "--input4Format=" + EnTranslationToTranslationParser.NAME,
130                                    "--input4LangPattern1=" + lang1,
131                                    "--input4LangPattern2=" + lang2,
132                                    "--input4PageLimit=1000",
133
134                                    "--print=" + result.getPath() + ".text",
135                                });
136
137         checkGolden(name, result);
138     }
139
140     public void testWiktionary_WholeSection_DE() throws Exception {
141         enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100);
142     }
143
144     public void testWiktionary_WholeSection_EN() throws Exception {
145         enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100);
146     }
147
148     public void testWiktionary_WholeSection_IT() throws Exception {
149         // Have to run to 800 to get a few verb conjugations (including essere!)
150         enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800);
151     }
152
153     public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
154         final File result = new File(TEST_OUTPUTS + name);
155         System.out.println("Writing to: " + result);
156         DictionaryBuilder.main(new String[] {
157                                    "--dictOut=" + result.getAbsolutePath(),
158                                    "--lang1=" + langCode,
159                                    "--lang2=" + "EN",
160                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
161                                    "--lang2Stoplist=" + STOPLISTS + "empty.txt",
162                                    "--dictInfo=SomeWikiDataWholeSection",
163
164                                    "--input4=" + WIKISPLIT_EN + langCode + ".data",
165                                    "--input4Name=" + name,
166                                    "--input4Format=" + WholeSectionToHtmlParser.NAME,
167                                    "--input4WiktionaryLang=EN",
168                                    "--input4SkipLang=" + langCode,
169                                    "--input4TitleIndex=" + "1",
170                                    "--input4PageLimit=" + pageLimit,
171
172                                    "--print=" + result.getPath() + ".text",
173                                });
174         checkGolden(name, result);
175     }
176
177     //-----------------------------------------------------------------
178
179     public void testSingleLang_EN() throws Exception {
180         wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100);
181     }
182
183     public void testSingleLang_DE() throws Exception {
184         wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100);
185     }
186
187     public void testSingleLang_IT() throws Exception {
188         wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100);
189     }
190
191     public void testSingleLang_FR() throws Exception {
192         wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100);
193     }
194
195     public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception {
196         final File result = new File(TEST_OUTPUTS + name);
197         System.out.println("Writing to: " + result);
198         DictionaryBuilder.main(new String[] {
199                                    "--dictOut=" + result.getAbsolutePath(),
200                                    "--lang1=" + langCode,
201                                    "--lang1Stoplist=" + STOPLISTS + "empty.txt",
202                                    "--dictInfo=SomeWikiDataWholeSection",
203                                    "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data",
204                                    "--input4Name=" + name,
205                                    "--input4Format=" + WholeSectionToHtmlParser.NAME,
206                                    "--input4WiktionaryLang=" + langCode,
207                                    "--input4SkipLang=" + langCode,
208                                    "--input4TitleIndex=" + "1",
209                                    "--input4PageLimit=" + pageLimit,
210                                    "--print=" + result.getPath() + ".text",
211                                });
212         checkGolden(name, result);
213     }
214
215     //-----------------------------------------------------------------
216
217     public void testWiktionary_IT_EN() throws Exception {
218         wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
219                                    "EN.data", "enwiktionary.english", "Italian", "it", 1000);
220     }
221
222     public void testWiktionary_cmn_EN() throws Exception {
223         wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt",
224                                    // These missing "e" prevents a complete match, forcing the name to be printed
225                                    "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000);
226     }
227
228     public void testWiktionary_DE_EN() throws Exception {
229         wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
230                                    "EN.data", "enwiktionary.english", "German", "de", 1000);
231     }
232
233     public void testWiktionary_IT_IT() throws Exception {
234         wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
235                                    "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
236     }
237
238     // French
239     public void testWiktionary_FR_FR() throws Exception {
240         wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
241                                    "FR.data", "enwiktionary.french", "French", "fr", 1000);
242     }
243
244
245     // Arabic
246     public void testWiktionary_AR_AR() throws Exception {
247         // Arabic is really big for some reason, use fewer pages.
248         wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
249                                    "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
250     }
251
252     // Chinese
253     public void testWiktionary_cmn_cmn() throws Exception {
254         wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt",
255                                    // These missing "e" prevents a complete match, forcing the name to be printed.
256                                    "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000);
257     }
258
259     // German
260     public void testWiktionary_DE_DE() throws Exception {
261         wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
262                                    "DE.data", "enwiktionary.german", "German", "de", 1000);
263     }
264
265     // Thai
266     public void testWiktionary_TH_TH() throws Exception {
267         wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
268                                    // These missing "e" prevents a complete match, forcing the name to be printed.
269                                    "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
270     }
271
272     public void wiktionaryTestWithLangToEn(final String name, final String lang1,
273                                            final String stoplist, final String data, final String dictName,
274                                            final String langPattern, final String langCode, int pageLimit) throws Exception {
275         final File result = new File(TEST_OUTPUTS + name);
276         System.out.println("Writing to: " + result);
277         final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
278         DictionaryBuilder.main(new String[] {
279                                    "--dictOut=" + result.getAbsolutePath(),
280                                    "--lang1=" + lang1,
281                                    "--lang2=EN",
282                                    "--lang1Stoplist=" + STOPLISTS + stoplist,
283                                    "--lang2Stoplist=" + STOPLISTS + "en.txt",
284                                    "--dictInfo=SomeWikiData",
285
286                                    "--input4=" + WIKISPLIT_EN + data,
287                                    "--input4Name=" + dictName,
288                                    "--input4Format=enwiktionary",
289                                    "--input4WiktionaryType=" + type,
290                                    "--input4LangPattern=" + langPattern,
291                                    "--input4LangCodePattern=" + langCode,
292                                    "--input4EnIndex=2",
293                                    "--input4PageLimit=" + pageLimit,
294
295                                    "--print=" + result.getPath() + ".text",
296                                });
297
298         checkGolden(name, result);
299     }
300
301     public void testGermanCombined() throws Exception {
302         final String name = "de-en.quickdic";
303         final File result = new File(TEST_OUTPUTS + name);
304         System.out.println("Writing to: " + result);
305         DictionaryBuilder.main(new String[] {
306                                    "--dictOut=" + result.getAbsolutePath(),
307                                    "--lang1=DE",
308                                    "--lang2=EN",
309                                    "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
310
311                                    "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
312                                    "--input1Name=chemnitz",
313                                    "--input1Charset=UTF8",
314                                    "--input1Format=chemnitz",
315
316                                    "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
317                                    "--input2Name=dictcc",
318                                    "--input2Charset=UTF8",
319                                    "--input2Format=tab_separated",
320
321                                    "--print=" + result.getPath() + ".text",
322                                });
323
324         checkGolden(name, result);
325     }
326
327     public void testItalianTurkish() throws Exception {
328         final String name = "it-tr_dictcc.quickdic";
329         final File result = new File(TEST_OUTPUTS + name);
330         System.out.println("Writing to: " + result);
331         DictionaryBuilder.main(new String[] {
332                                    "--dictOut=" + result.getAbsolutePath(),
333                                    "--lang1=IT",
334                                    "--lang2=TR",
335                                    "--dictInfo=it-tr_dictcc_simulated",
336
337                                    "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt",
338                                    "--input1Name=dictcc",
339                                    "--input1Charset=UTF8",
340                                    "--input1Format=tab_separated",
341
342                                    "--print=" + result.getPath() + ".text",
343                                });
344
345         checkGolden(name, result);
346     }
347
348     private void checkGolden(final String dictName, final File dictFile)
349     throws IOException {
350         // Check it once:
351         assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
352
353         // Check it again.
354         final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r").getChannel());
355         final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
356         dict.print(out);
357         out.close();
358         assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
359     }
360
361
362     void assertFilesEqual(final String expected, final String actual) throws IOException {
363         final String expectedString = FileUtil.readToString(new File(expected));
364         final String actualString = FileUtil.readToString(new File(actual));
365         assertEquals(expectedString, actualString);
366     }
367
368
369 }