gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
417df82d420112076e0423ca1a61abf70f675780
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderTest.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.io.RandomAccessFile;
import java.util.Collections;
import java.util.Locale;

import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
import com.hughes.util.FileUtil;

import junit.framework.TestCase;
29
30 public class DictionaryBuilderTest extends TestCase {
31   
32   public static final String TEST_INPUTS = "testdata/inputs/";
33   public static final String WIKISPLIT = "data/inputs/wikiSplit/";
34   public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
35   public static final String STOPLISTS = "data/inputs/stoplists/";
36   public static final String GOLDENS = "testdata/goldens/";
37
38   public static final String TEST_OUTPUTS = "testdata/outputs/";
39
40   public void testItConj() throws Exception {
41       final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
42               "{{it-conj-are|accus|avere}}\n" +
43               "{{it-conj-care|pag|avere or essere}}\n" +
44               "{{it-conj-iare|studi|avere}}\n" +
45               "{{it-conj-iare-b|avvi|avere}}\n" +
46               "{{it-conj-ciare|pronunc|avere}}\n" +
47               "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
48               "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
49               "{{it-conj-ere|abbatt|avere}}\n" +
50               "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
51               "{{it-conj-ire-b|prefer|avere}}\n" +
52               "{{it-conj-urre|prod|avere}}\n" +
53               "{{it-conj-arsi|lav}}\n" +
54               "{{it-conj-ersi|abbatt}}\n" +
55               "{{it-conj-iarsi|annoi}}\n" +
56               "{{it-conj-carsi|coniug}}\n" +
57               "{{it-conj-ciarsi|affacc}}\n" +
58               "{{it-conj-irsi|vest}}\n" +
59               "{{it-conj-irsi-b|fer}}\n" +
60               "{{it-conj-ursi|rid|essere}}\n" +
61               "{{it-conj-cire|ricuc|avere}}\n" +
62               "{{it-conj-iarsi-b|riavvi|essere}}" +
63               "{{it-conj-fare|putre|avere}}\n" + 
64               "{{it-conj-cirsi|cuc|essere}}\n" +
65               "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" +
66               "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" +
67               "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" +
68               "{{term|verbo|verbō|for the word}}\n"
69               ;
70       final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
71       WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
72       parser.title = "dummyTitle";
73       parser.entrySource = new EntrySource(0, "dummySource", 0);
74       parser.parseSection("dummyHeading", toParse);
75       db.build();
76       
77       final String dictName = "testItConj.html";
78       final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
79       db.dictionary.print(out);
80       out.close();
81       
82       assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
83   }
84   
85   public void doTestCustomDict(final String name, final String lang1,
86       final String lang2, final String inputFile) throws Exception {
87     final File result = new File(TEST_OUTPUTS + name);
88     System.out.println("Writing to: " + result);
89     DictionaryBuilder.main(new String[] {
90         "--dictOut=" + result.getAbsolutePath(),
91         "--lang1=" + lang1,
92         "--lang2=" + lang2,
93         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
94         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
95         "--dictInfo=bleh.",
96         
97         "--input1=testdata/inputs/" + inputFile,
98         "--input1Name=my_input_" + name,
99         "--input1Charset=ISO-8859-1",
100         "--input1Format=tab_separated",
101
102         "--print=" + result.getPath() + ".text",
103     });
104     
105     checkGolden(name, result); 
106   }
107   
108   public void test_FR_NL() throws Exception {
109     doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
110   }
111   
112   public void testWiktionary_en_de2fr() throws Exception {
113     wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
114   }
115
116   public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
117       final String lang2) throws Exception {
118     final File result = new File(TEST_OUTPUTS + name);
119     System.out.println("Writing to: " + result);
120     DictionaryBuilder.main(new String[] {
121         "--dictOut=" + result.getAbsolutePath(),
122         "--lang1=" + lang1,
123         "--lang2=" + lang2,
124         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
125         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
126         "--dictInfo=SomeWikiDataTrans2Trans",
127
128         "--input4=" + WIKISPLIT_EN + "EN.data",
129         "--input4Name=" + name,
130         "--input4Format=" + EnTranslationToTranslationParser.NAME,
131         "--input4LangPattern1=" + lang1,
132         "--input4LangPattern2=" + lang2,
133         "--input4PageLimit=1000",
134
135         "--print=" + result.getPath() + ".text",
136     });
137     
138     checkGolden(name, result); 
139   }
140
141   public void testWiktionary_WholeSection_DE() throws Exception {
142     enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.DE.quickdic", "DE", 100);
143   }
144
145   public void testWiktionary_WholeSection_EN() throws Exception {
146     enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.EN.quickdic", "EN", 100);
147   }
148
149   public void testWiktionary_WholeSection_IT() throws Exception {
150     // Have to run to 800 to get a few verb conjugations (including essere!)
151     enWiktionaryTestWithWholeSectionToHtml("enwiktionary.WholeSection.IT.quickdic", "IT", 800);
152   }
153
154   public void enWiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
155     final File result = new File(TEST_OUTPUTS + name);
156     System.out.println("Writing to: " + result);
157     DictionaryBuilder.main(new String[] {
158         "--dictOut=" + result.getAbsolutePath(),
159         "--lang1=" + langCode,
160         "--lang2=" + "EN",
161         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
162         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
163         "--dictInfo=SomeWikiDataWholeSection",
164
165         "--input4=" + WIKISPLIT_EN + langCode + ".data",
166         "--input4Name=" + name,
167         "--input4Format=" + WholeSectionToHtmlParser.NAME,
168         "--input4WiktionaryLang=EN",
169         "--input4SkipLang=" + langCode,
170         "--input4TitleIndex=" + "1",
171         "--input4PageLimit=" + pageLimit,
172
173         "--print=" + result.getPath() + ".text",
174     });
175     checkGolden(name, result); 
176   }
177   
178   //-----------------------------------------------------------------
179
180   public void testSingleLang_EN() throws Exception {
181       wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100);
182   }
183
184   public void testSingleLang_DE() throws Exception {
185       wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100);
186   }
187
188   public void testSingleLang_IT() throws Exception {
189       wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100);
190   }
191
192   public void testSingleLang_FR() throws Exception {
193       wiktionaryTestSingleLang("SingleLang_FR.quickdic", "FR", 100);
194   }
195
196   public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception {
197       final File result = new File(TEST_OUTPUTS + name);
198       System.out.println("Writing to: " + result);
199       DictionaryBuilder.main(new String[] {
200           "--dictOut=" + result.getAbsolutePath(),
201           "--lang1=" + langCode,
202           "--lang1Stoplist=" + STOPLISTS + "empty.txt",
203           "--dictInfo=SomeWikiDataWholeSection",
204           "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data",
205           "--input4Name=" + name,
206           "--input4Format=" + WholeSectionToHtmlParser.NAME,
207           "--input4WiktionaryLang=" + langCode,
208           "--input4SkipLang=" + langCode,
209           "--input4TitleIndex=" + "1",
210           "--input4PageLimit=" + pageLimit,
211           "--print=" + result.getPath() + ".text",
212       });
213       checkGolden(name, result); 
214     }
215
216   //-----------------------------------------------------------------
217
218   public void testWiktionary_IT_EN() throws Exception {
219     wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
220         "EN.data", "enwiktionary.english", "Italian", "it", 1000);
221   }
222
223   public void testWiktionary_cmn_EN() throws Exception {
224     wiktionaryTestWithLangToEn("wiktionary.cmn_en.quickdic", "cmn", "empty.txt",
225         // These missing "e" prevents a complete match, forcing the name to be printed
226         "EN.data", "enwiktionary.english", "Chinese|Mandarin", "cmn", 1000);
227   }
228
229   public void testWiktionary_DE_EN() throws Exception {
230     wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
231         "EN.data", "enwiktionary.english", "German", "de", 1000);
232   }
233
234   public void testWiktionary_IT_IT() throws Exception {
235     wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
236         "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
237   }
238
239   // French
240   public void testWiktionary_FR_FR() throws Exception {
241     wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
242         "FR.data", "enwiktionary.french", "French", "fr", 1000);
243   }
244
245   
246   // Arabic
247   public void testWiktionary_AR_AR() throws Exception {
248       // Arabic is really big for some reason, use fewer pages.
249     wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
250         "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
251   }
252
253   // Chinese
254   public void testWiktionary_cmn_cmn() throws Exception {
255     wiktionaryTestWithLangToEn("wiktionary.cmn_cmn.quickdic", "cmn", "empty.txt",
256         // These missing "e" prevents a complete match, forcing the name to be printed.
257         "cmn.data", "enwiktionary.chinese", "Chinese|Mandarin", "cmn", 1000);
258   }
259
260   // German
261   public void testWiktionary_DE_DE() throws Exception {
262     wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
263         "DE.data", "enwiktionary.german", "German", "de", 1000);
264   }
265
266   // Thai
267   public void testWiktionary_TH_TH() throws Exception {
268     wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
269         // These missing "e" prevents a complete match, forcing the name to be printed.
270         "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
271   }
272
273   public void wiktionaryTestWithLangToEn(final String name, final String lang1,
274       final String stoplist, final String data, final String dictName,
275       final String langPattern, final String langCode, int pageLimit) throws Exception {
276     final File result = new File(TEST_OUTPUTS + name);
277     System.out.println("Writing to: " + result);
278     final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
279     DictionaryBuilder.main(new String[] {
280         "--dictOut=" + result.getAbsolutePath(),
281         "--lang1=" + lang1,
282         "--lang2=EN",
283         "--lang1Stoplist=" + STOPLISTS + stoplist,
284         "--lang2Stoplist=" + STOPLISTS + "en.txt",
285         "--dictInfo=SomeWikiData",
286
287         "--input4=" + WIKISPLIT_EN + data,
288         "--input4Name=" + dictName,
289         "--input4Format=enwiktionary",
290         "--input4WiktionaryType=" + type,
291         "--input4LangPattern=" + langPattern,
292         "--input4LangCodePattern=" + langCode,
293         "--input4EnIndex=2",
294         "--input4PageLimit=" + pageLimit,
295
296         "--print=" + result.getPath() + ".text",
297     });
298     
299     checkGolden(name, result); 
300   }
301
302   public void testGermanCombined() throws Exception {
303     final String name = "de-en.quickdic";
304     final File result = new File(TEST_OUTPUTS + name);
305     System.out.println("Writing to: " + result);
306     DictionaryBuilder.main(new String[] {
307         "--dictOut=" + result.getAbsolutePath(),
308         "--lang1=DE",
309         "--lang2=EN",
310         "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
311
312         "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
313         "--input1Name=chemnitz",
314         "--input1Charset=UTF8",
315         "--input1Format=chemnitz",
316
317         "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
318         "--input2Name=dictcc",
319         "--input2Charset=UTF8",
320         "--input2Format=tab_separated",
321
322         "--print=" + result.getPath() + ".text",
323     });
324     
325     checkGolden(name, result); 
326   }
327
328   public void testItalianTurkish() throws Exception {
329       final String name = "it-tr_dictcc.quickdic";
330       final File result = new File(TEST_OUTPUTS + name);
331       System.out.println("Writing to: " + result);
332       DictionaryBuilder.main(new String[] {
333           "--dictOut=" + result.getAbsolutePath(),
334           "--lang1=IT",
335           "--lang2=TR",
336           "--dictInfo=it-tr_dictcc_simulated",
337
338           "--input1=" + TEST_INPUTS + "it-tr_dictcc_simulated.txt",
339           "--input1Name=dictcc",
340           "--input1Charset=UTF8",
341           "--input1Format=tab_separated",
342
343           "--print=" + result.getPath() + ".text",
344       });
345       
346       checkGolden(name, result); 
347     }
348
349   private void checkGolden(final String dictName, final File dictFile)
350       throws IOException, FileNotFoundException {
351     // Check it once:
352     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
353
354     // Check it again.
355     final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
356     final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
357     dict.print(out);
358     out.close();
359     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
360   }
361
362
363   void assertFilesEqual(final String expected, final String actual) throws IOException {
364     final String expectedString = FileUtil.readToString(new File(expected));
365     final String actualString = FileUtil.readToString(new File(actual));
366     assertEquals(expectedString, actualString);
367   }
368
369   
370 }