]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
Fixed trailing ,s in italian verb tenses.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderTest.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.File;
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.io.PrintStream;
21 import java.io.RandomAccessFile;
22 import java.util.Collections;
23
24 import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
25 import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
26 import com.hughes.util.FileUtil;
27
28 import junit.framework.TestCase;
29
30 public class DictionaryBuilderTest extends TestCase {
31   
32   public static final String TEST_INPUTS = "testdata/inputs/";
33   public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
34   public static final String STOPLISTS = "data/inputs/stoplists/";
35   public static final String GOLDENS = "testdata/goldens/";
36
37   public static final String TEST_OUTPUTS = "testdata/outputs/";
38
39   public void testItConj() throws Exception {
40       final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
41               "{{it-conj-are|accus|avere}}\n" +
42               "{{it-conj-care|pag|avere or essere}}\n" +
43               "{{it-conj-iare|studi|avere}}\n" +
44               "{{it-conj-iare-b|avvi|avere}}\n" +
45               "{{it-conj-ciare|pronunc|avere}}\n" +
46               "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
47               "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
48               "{{it-conj-ere|abbatt|avere}}\n" +
49               "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
50               "{{it-conj-ire-b|prefer|avere}}\n" +
51               "{{it-conj-urre|prod|avere}}\n" +
52               "{{it-conj-arsi|lav}}\n" +
53               "{{it-conj-ersi|abbatt}}\n" +
54               "{{it-conj-iarsi|annoi}}\n" +
55               "{{it-conj-carsi|coniug}}\n" +
56               "{{it-conj-ciarsi|affacc}}\n" +
57               "{{it-conj-irsi|vest}}\n" +
58               "{{it-conj-irsi-b|fer}}\n" +
59               "{{it-conj-ursi|rid|essere}}\n" +
60               "{{it-conj-cire|ricuc|avere}}\n" +
61               "{{it-conj-iarsi-b|riavvi|essere}}" +
62               "{{it-conj-fare|putre|avere}}\n" + 
63               "{{it-conj-cirsi|cuc|essere}}\n" +
64               "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n"
65               ;
66       final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
67       WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
68       parser.title = "dummyTitle";
69       parser.entrySource = new EntrySource(0, "dummySource", 0);
70       parser.parseSection("dummyHeading", toParse);
71       db.build();
72       
73       final String dictName = "testItConj.html";
74       final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
75       db.dictionary.print(out);
76       out.close();
77       
78       assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
79   }
80   
81   public void doTestCustomDict(final String name, final String lang1,
82       final String lang2, final String inputFile) throws Exception {
83     final File result = new File(TEST_OUTPUTS + name);
84     System.out.println("Writing to: " + result);
85     DictionaryBuilder.main(new String[] {
86         "--dictOut=" + result.getAbsolutePath(),
87         "--lang1=" + lang1,
88         "--lang2=" + lang2,
89         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
90         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
91         "--dictInfo=bleh.",
92         
93         "--input1=testdata/inputs/" + inputFile,
94         "--input1Name=my_input_" + name,
95         "--input1Charset=ISO-8859-1",
96         "--input1Format=tab_separated",
97
98         "--print=" + result.getPath() + ".text",
99     });
100     
101     checkGolden(name, result); 
102   }
103   
104   public void test_FR_NL() throws Exception {
105     doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
106   }
107   
108   public void testWiktionary_en_de2fr() throws Exception {
109     wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
110   }
111
112   public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
113       final String lang2) throws Exception {
114     final File result = new File(TEST_OUTPUTS + name);
115     System.out.println("Writing to: " + result);
116     DictionaryBuilder.main(new String[] {
117         "--dictOut=" + result.getAbsolutePath(),
118         "--lang1=" + lang1,
119         "--lang2=" + lang2,
120         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
121         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
122         "--dictInfo=SomeWikiDataTrans2Trans",
123
124         "--input4=" + WIKISPLIT_EN + "EN.data",
125         "--input4Name=" + name,
126         "--input4Format=" + EnTranslationToTranslationParser.NAME,
127         "--input4LangPattern1=" + lang1,
128         "--input4LangPattern2=" + lang2,
129         "--input4PageLimit=1000",
130
131         "--print=" + result.getPath() + ".text",
132     });
133     
134     checkGolden(name, result); 
135   }
136
137   public void testWiktionary_WholeSection_DE() throws Exception {
138     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.DE.quickdic", "DE", 100);
139   }
140
141   public void testWiktionary_WholeSection_EN() throws Exception {
142     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.EN.quickdic", "EN", 100);
143   }
144
145   public void testWiktionary_WholeSection_IT() throws Exception {
146     // Have to run to 800 to get a few verb conjugations (including essere!)
147     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.IT.quickdic", "IT", 800);
148   }
149
150   public void wiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
151     final File result = new File(TEST_OUTPUTS + name);
152     System.out.println("Writing to: " + result);
153     DictionaryBuilder.main(new String[] {
154         "--dictOut=" + result.getAbsolutePath(),
155         "--lang1=" + langCode,
156         "--lang2=" + "EN",
157         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
158         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
159         "--dictInfo=SomeWikiDataWholeSection",
160
161         "--input4=" + WIKISPLIT_EN + langCode + ".data",
162         "--input4Name=" + name,
163         "--input4Format=" + WholeSectionToHtmlParser.NAME,
164         "--input4WiktionaryLang=EN",
165         "--input4SkipLang=" + langCode,
166         "--input4TitleIndex=" + "1",
167         "--input4PageLimit=" + pageLimit,
168
169         "--print=" + result.getPath() + ".text",
170     });
171     checkGolden(name, result); 
172   }
173
174   
175   public void testWiktionary_IT_EN() throws Exception {
176     wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
177         "EN.data", "enwiktionary.english", "Italian", "it", 1000);
178   }
179
180   public void testWiktionary_ZH_EN() throws Exception {
181     wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt",
182         // These missing "e" prevents a complete match, forcing the name to be printed
183         "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantones", "zh", 1000);
184   }
185
186   public void testWiktionary_DE_EN() throws Exception {
187     wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
188         "EN.data", "enwiktionary.english", "German", "de", 1000);
189   }
190
191   public void testWiktionary_IT_IT() throws Exception {
192     wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
193         "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
194   }
195
196   // French
197   public void testWiktionary_FR_FR() throws Exception {
198     wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
199         "FR.data", "enwiktionary.french", "French", "fr", 1000);
200   }
201
202   
203   // Arabic
204   public void testWiktionary_AR_AR() throws Exception {
205       // Arabic is really big for some reason, use fewer pages.
206     wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
207         "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
208   }
209
210   // Chinese
211   public void testWiktionary_ZH_ZH() throws Exception {
212     wiktionaryTestWithLangToEn("wiktionary.zh_zh.quickdic", "ZH", "empty.txt",
213         // These missing "e" prevents a complete match, forcing the name to be printed.
214         "ZH.data", "enwiktionary.chinese", "Chinese|Mandarin|Cantones", "zh", 1000);
215   }
216
217   // German
218   public void testWiktionary_DE_DE() throws Exception {
219     wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
220         "DE.data", "enwiktionary.german", "German", "de", 1000);
221   }
222
223   // Thai
224   public void testWiktionary_TH_TH() throws Exception {
225     wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
226         // These missing "e" prevents a complete match, forcing the name to be printed.
227         "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
228   }
229
230   public void wiktionaryTestWithLangToEn(final String name, final String lang1,
231       final String stoplist, final String data, final String dictName,
232       final String langPattern, final String langCode, int pageLimit) throws Exception {
233     final File result = new File(TEST_OUTPUTS + name);
234     System.out.println("Writing to: " + result);
235     final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
236     DictionaryBuilder.main(new String[] {
237         "--dictOut=" + result.getAbsolutePath(),
238         "--lang1=" + lang1,
239         "--lang2=EN",
240         "--lang1Stoplist=" + STOPLISTS + stoplist,
241         "--lang2Stoplist=" + STOPLISTS + "en.txt",
242         "--dictInfo=SomeWikiData",
243
244         "--input4=" + WIKISPLIT_EN + data,
245         "--input4Name=" + dictName,
246         "--input4Format=enwiktionary",
247         "--input4WiktionaryType=" + type,
248         "--input4LangPattern=" + langPattern,
249         "--input4LangCodePattern=" + langCode,
250         "--input4EnIndex=2",
251         "--input4PageLimit=" + pageLimit,
252
253         "--print=" + result.getPath() + ".text",
254     });
255     
256     checkGolden(name, result); 
257   }
258
259   public void testGermanCombined() throws Exception {
260     final String name = "de-en.quickdic";
261     final File result = new File(TEST_OUTPUTS + name);
262     System.out.println("Writing to: " + result);
263     DictionaryBuilder.main(new String[] {
264         "--dictOut=" + result.getAbsolutePath(),
265         "--lang1=DE",
266         "--lang2=EN",
267         "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
268
269         "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
270         "--input1Name=chemnitz",
271         "--input1Charset=UTF8",
272         "--input1Format=chemnitz",
273
274         "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
275         "--input2Name=dictcc",
276         "--input2Charset=UTF8",
277         "--input2Format=tab_separated",
278
279         "--print=" + result.getPath() + ".text",
280     });
281     
282     checkGolden(name, result); 
283   }
284
285   private void checkGolden(final String dictName, final File dictFile)
286       throws IOException, FileNotFoundException {
287     // Check it once:
288     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
289
290     // Check it again.
291     final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
292     final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
293     dict.print(out);
294     out.close();
295     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
296   }
297
298
299   void assertFilesEqual(final String expected, final String actual) throws IOException {
300     final String expectedString = FileUtil.readToString(new File(expected));
301     final String actualString = FileUtil.readToString(new File(actual));
302     assertEquals(expectedString, actualString);
303   }
304
305   
306 }