]> gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
Added simple parsing logic for DE and IT wiktionaries.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderTest.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.File;
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.io.PrintStream;
21 import java.io.RandomAccessFile;
22 import java.util.Collections;
23
24 import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
25 import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
26 import com.hughes.util.FileUtil;
27
28 import junit.framework.TestCase;
29
30 public class DictionaryBuilderTest extends TestCase {
31   
  /** Directory containing test input files (relative to the working directory). */
  public static final String TEST_INPUTS = "testdata/inputs/";
  /** Root directory of the split wiktionary inputs; per-wiktionary subdirectories such as "en/" live below it. */
  public static final String WIKISPLIT = "data/inputs/wikiSplit/";
  /** The "en/" subdirectory of {@link #WIKISPLIT}; tests read "&lt;LANG&gt;.data" files from here. */
  public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
  /** Directory containing the stoplist files (e.g. "empty.txt", "en.txt") passed to the builder. */
  public static final String STOPLISTS = "data/inputs/stoplists/";
  /** Directory containing the golden (expected) files that test outputs are compared against. */
  public static final String GOLDENS = "testdata/goldens/";

  /** Directory into which the tests write the dictionaries and their printed text form. */
  public static final String TEST_OUTPUTS = "testdata/outputs/";
39
40   public void testItConj() throws Exception {
41       final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
42               "{{it-conj-are|accus|avere}}\n" +
43               "{{it-conj-care|pag|avere or essere}}\n" +
44               "{{it-conj-iare|studi|avere}}\n" +
45               "{{it-conj-iare-b|avvi|avere}}\n" +
46               "{{it-conj-ciare|pronunc|avere}}\n" +
47               "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
48               "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
49               "{{it-conj-ere|abbatt|avere}}\n" +
50               "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
51               "{{it-conj-ire-b|prefer|avere}}\n" +
52               "{{it-conj-urre|prod|avere}}\n" +
53               "{{it-conj-arsi|lav}}\n" +
54               "{{it-conj-ersi|abbatt}}\n" +
55               "{{it-conj-iarsi|annoi}}\n" +
56               "{{it-conj-carsi|coniug}}\n" +
57               "{{it-conj-ciarsi|affacc}}\n" +
58               "{{it-conj-irsi|vest}}\n" +
59               "{{it-conj-irsi-b|fer}}\n" +
60               "{{it-conj-ursi|rid|essere}}\n" +
61               "{{it-conj-cire|ricuc|avere}}\n" +
62               "{{it-conj-iarsi-b|riavvi|essere}}" +
63               "{{it-conj-fare|putre|avere}}\n" + 
64               "{{it-conj-cirsi|cuc|essere}}\n" +
65               "{{it-conj-ere|smett|avere|pastp=smesso|prem1s=smisi|prem3s=smise|prem3s2=''|prem3p=smisero|prem3p2=''}}\n" +
66               "{{term||[[cor#Latin|Cor]] [[Carolus#Latin|Carolī]]|Charles' heart}}\n" +
67               "{{term|sc=Grek|λόγος|tr=lógos||word}}\n" +
68               "{{term|verbo|verbō|for the word}}\n"
69               ;
70       final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
71       WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), null, "EN", "IT", "http://en.wiktionary.org/wiki/%s");
72       parser.title = "dummyTitle";
73       parser.entrySource = new EntrySource(0, "dummySource", 0);
74       parser.parseSection("dummyHeading", toParse);
75       db.build();
76       
77       final String dictName = "testItConj.html";
78       final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
79       db.dictionary.print(out);
80       out.close();
81       
82       assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
83   }
84   
85   public void doTestCustomDict(final String name, final String lang1,
86       final String lang2, final String inputFile) throws Exception {
87     final File result = new File(TEST_OUTPUTS + name);
88     System.out.println("Writing to: " + result);
89     DictionaryBuilder.main(new String[] {
90         "--dictOut=" + result.getAbsolutePath(),
91         "--lang1=" + lang1,
92         "--lang2=" + lang2,
93         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
94         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
95         "--dictInfo=bleh.",
96         
97         "--input1=testdata/inputs/" + inputFile,
98         "--input1Name=my_input_" + name,
99         "--input1Charset=ISO-8859-1",
100         "--input1Format=tab_separated",
101
102         "--print=" + result.getPath() + ".text",
103     });
104     
105     checkGolden(name, result); 
106   }
107   
108   public void test_FR_NL() throws Exception {
109     doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
110   }
111   
112   public void testWiktionary_en_de2fr() throws Exception {
113     wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
114   }
115
116   public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
117       final String lang2) throws Exception {
118     final File result = new File(TEST_OUTPUTS + name);
119     System.out.println("Writing to: " + result);
120     DictionaryBuilder.main(new String[] {
121         "--dictOut=" + result.getAbsolutePath(),
122         "--lang1=" + lang1,
123         "--lang2=" + lang2,
124         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
125         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
126         "--dictInfo=SomeWikiDataTrans2Trans",
127
128         "--input4=" + WIKISPLIT_EN + "EN.data",
129         "--input4Name=" + name,
130         "--input4Format=" + EnTranslationToTranslationParser.NAME,
131         "--input4LangPattern1=" + lang1,
132         "--input4LangPattern2=" + lang2,
133         "--input4PageLimit=1000",
134
135         "--print=" + result.getPath() + ".text",
136     });
137     
138     checkGolden(name, result); 
139   }
140
141   public void testWiktionary_WholeSection_DE() throws Exception {
142     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.DE.quickdic", "DE", 100);
143   }
144
145   public void testWiktionary_WholeSection_EN() throws Exception {
146     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.EN.quickdic", "EN", 100);
147   }
148
149   public void testWiktionary_WholeSection_IT() throws Exception {
150     // Have to run to 800 to get a few verb conjugations (including essere!)
151     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.IT.quickdic", "IT", 800);
152   }
153
154   public void wiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
155     final File result = new File(TEST_OUTPUTS + name);
156     System.out.println("Writing to: " + result);
157     DictionaryBuilder.main(new String[] {
158         "--dictOut=" + result.getAbsolutePath(),
159         "--lang1=" + langCode,
160         "--lang2=" + "EN",
161         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
162         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
163         "--dictInfo=SomeWikiDataWholeSection",
164
165         "--input4=" + WIKISPLIT_EN + langCode + ".data",
166         "--input4Name=" + name,
167         "--input4Format=" + WholeSectionToHtmlParser.NAME,
168         "--input4WiktionaryLang=EN",
169         "--input4SkipLang=" + langCode,
170         "--input4TitleIndex=" + "1",
171         "--input4PageLimit=" + pageLimit,
172
173         "--print=" + result.getPath() + ".text",
174     });
175     checkGolden(name, result); 
176   }
177   
178   //-----------------------------------------------------------------
179
180   public void testSingleLang_EN() throws Exception {
181       wiktionaryTestSingleLang("SingleLang_EN.quickdic", "EN", 100);
182   }
183
184   public void testSingleLang_DE() throws Exception {
185       wiktionaryTestSingleLang("SingleLang_DE.quickdic", "DE", 100);
186   }
187
188   public void testSingleLang_IT() throws Exception {
189       wiktionaryTestSingleLang("SingleLang_IT.quickdic", "IT", 100);
190   }
191
192   public void wiktionaryTestSingleLang(final String name, final String langCode, final int pageLimit) throws Exception {
193       final File result = new File(TEST_OUTPUTS + name);
194       System.out.println("Writing to: " + result);
195       DictionaryBuilder.main(new String[] {
196           "--dictOut=" + result.getAbsolutePath(),
197           "--lang1=" + langCode,
198           "--lang1Stoplist=" + STOPLISTS + "empty.txt",
199           "--dictInfo=SomeWikiDataWholeSection",
200           "--input4=" + WIKISPLIT + langCode.toLowerCase() + "/" + langCode + ".data",
201           "--input4Name=" + name,
202           "--input4Format=" + WholeSectionToHtmlParser.NAME,
203           "--input4WiktionaryLang=" + langCode,
204           "--input4SkipLang=" + langCode,
205           "--input4TitleIndex=" + "1",
206           "--input4PageLimit=" + pageLimit,
207           "--print=" + result.getPath() + ".text",
208       });
209       checkGolden(name, result); 
210     }
211
212   //-----------------------------------------------------------------
213
214   public void testWiktionary_IT_EN() throws Exception {
215     wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
216         "EN.data", "enwiktionary.english", "Italian", "it", 1000);
217   }
218
219   public void testWiktionary_ZH_EN() throws Exception {
220     wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt",
221         // These missing "e" prevents a complete match, forcing the name to be printed
222         "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantones", "zh", 1000);
223   }
224
225   public void testWiktionary_DE_EN() throws Exception {
226     wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
227         "EN.data", "enwiktionary.english", "German", "de", 1000);
228   }
229
230   public void testWiktionary_IT_IT() throws Exception {
231     wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
232         "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
233   }
234
235   // French
236   public void testWiktionary_FR_FR() throws Exception {
237     wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
238         "FR.data", "enwiktionary.french", "French", "fr", 1000);
239   }
240
241   
242   // Arabic
243   public void testWiktionary_AR_AR() throws Exception {
244       // Arabic is really big for some reason, use fewer pages.
245     wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
246         "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
247   }
248
249   // Chinese
250   public void testWiktionary_ZH_ZH() throws Exception {
251     wiktionaryTestWithLangToEn("wiktionary.zh_zh.quickdic", "ZH", "empty.txt",
252         // These missing "e" prevents a complete match, forcing the name to be printed.
253         "ZH.data", "enwiktionary.chinese", "Chinese|Mandarin|Cantones", "zh", 1000);
254   }
255
256   // German
257   public void testWiktionary_DE_DE() throws Exception {
258     wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
259         "DE.data", "enwiktionary.german", "German", "de", 1000);
260   }
261
262   // Thai
263   public void testWiktionary_TH_TH() throws Exception {
264     wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
265         // These missing "e" prevents a complete match, forcing the name to be printed.
266         "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
267   }
268
269   public void wiktionaryTestWithLangToEn(final String name, final String lang1,
270       final String stoplist, final String data, final String dictName,
271       final String langPattern, final String langCode, int pageLimit) throws Exception {
272     final File result = new File(TEST_OUTPUTS + name);
273     System.out.println("Writing to: " + result);
274     final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
275     DictionaryBuilder.main(new String[] {
276         "--dictOut=" + result.getAbsolutePath(),
277         "--lang1=" + lang1,
278         "--lang2=EN",
279         "--lang1Stoplist=" + STOPLISTS + stoplist,
280         "--lang2Stoplist=" + STOPLISTS + "en.txt",
281         "--dictInfo=SomeWikiData",
282
283         "--input4=" + WIKISPLIT_EN + data,
284         "--input4Name=" + dictName,
285         "--input4Format=enwiktionary",
286         "--input4WiktionaryType=" + type,
287         "--input4LangPattern=" + langPattern,
288         "--input4LangCodePattern=" + langCode,
289         "--input4EnIndex=2",
290         "--input4PageLimit=" + pageLimit,
291
292         "--print=" + result.getPath() + ".text",
293     });
294     
295     checkGolden(name, result); 
296   }
297
298   public void testGermanCombined() throws Exception {
299     final String name = "de-en.quickdic";
300     final File result = new File(TEST_OUTPUTS + name);
301     System.out.println("Writing to: " + result);
302     DictionaryBuilder.main(new String[] {
303         "--dictOut=" + result.getAbsolutePath(),
304         "--lang1=DE",
305         "--lang2=EN",
306         "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
307
308         "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
309         "--input1Name=chemnitz",
310         "--input1Charset=UTF8",
311         "--input1Format=chemnitz",
312
313         "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
314         "--input2Name=dictcc",
315         "--input2Charset=UTF8",
316         "--input2Format=tab_separated",
317
318         "--print=" + result.getPath() + ".text",
319     });
320     
321     checkGolden(name, result); 
322   }
323
324   private void checkGolden(final String dictName, final File dictFile)
325       throws IOException, FileNotFoundException {
326     // Check it once:
327     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
328
329     // Check it again.
330     final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
331     final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
332     dict.print(out);
333     out.close();
334     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
335   }
336
337
338   void assertFilesEqual(final String expected, final String actual) throws IOException {
339     final String expectedString = FileUtil.readToString(new File(expected));
340     final String actualString = FileUtil.readToString(new File(actual));
341     assertEquals(expectedString, actualString);
342   }
343
344   
345 }