gitweb.fperrin.net Git - DictionaryPC.git/blob - src/com/hughes/android/dictionary/engine/DictionaryBuilderTest.java
Link forms, page limit arabic, change HTML.
[DictionaryPC.git] / src / com / hughes / android / dictionary / engine / DictionaryBuilderTest.java
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 package com.hughes.android.dictionary.engine;
16
17 import java.io.File;
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.io.PrintStream;
21 import java.io.RandomAccessFile;
22 import java.util.Collections;
23
24 import com.hughes.android.dictionary.parser.wiktionary.EnTranslationToTranslationParser;
25 import com.hughes.android.dictionary.parser.wiktionary.WholeSectionToHtmlParser;
26 import com.hughes.util.FileUtil;
27
28 import junit.framework.TestCase;
29
30 public class DictionaryBuilderTest extends TestCase {
31   
32   public static final String TEST_INPUTS = "testdata/inputs/";
33   public static final String WIKISPLIT_EN = "data/inputs/wikiSplit/en/";
34   public static final String STOPLISTS = "data/inputs/stoplists/";
35   public static final String GOLDENS = "testdata/goldens/";
36
37   public static final String TEST_OUTPUTS = "testdata/outputs/";
38
39   public void testItConj() throws Exception {
40       final String toParse = "{{it-conj-are|d|avere|pres2s=dai|pres3s=dà|pres3p=danno|prem1s=diedi|prem1s2=detti|prem2s=desti|prem3s=diede|prem3s2=dette|prem1p=demmo|prem2p=deste|prem3p=diedero|prem3p2=dettero|fut1s=darò|fut2s=darai|fut3s=darà|fut1p=daremo|fut2p=darete|fut3p=daranno|cond1s=darei|cond2s=daresti|cond3s=darebbe|cond1p=daremmo|cond2p=dareste|cond3p=darebbero|sub123s=dia|sub3p=diano|impsub12s=dessi|impsub3s=desse|impsub1p=dessimo|impsub2p=deste|impsub3p=dessero|imp2s=dà|imp2s2=dai|imp2s3=da'|imp3s=dia|imp3p=diano}}\n" +
41               "{{it-conj-are|accus|avere}}\n" +
42               "{{it-conj-care|pag|avere or essere}}\n" +
43               "{{it-conj-iare|studi|avere}}\n" +
44               "{{it-conj-iare-b|avvi|avere}}\n" +
45               "{{it-conj-ciare|pronunc|avere}}\n" +
46               "{{it-conj-ere|sed|essere|pres1s=siedo|pres1s2=seggo|pres2s=siedi|pres3s=siede|pres3p=siedono|pres3p2=seggono|fut1s2=siederò|fut2s2=siederai|fut3s2=siederà|fut1p2=siederemo|fut2p2=siederete|fut3p2=siederanno|cond1s2=siederei|cond2s2=siederesti|cond3s2=siederebbe|cond1p2=siederemmo|cond2p2=siedereste|cond3p2=siederebbero|sub123s=sieda|sub3p=siedano|imp2s=siedi|imp3s=sieda|imp3s2=segga|imp3p=siedano|imp3p2=seggano}}\n" +
47               "{{it-conj-ere|persuad|avere|pastp=persuaso|prem1s=persuasi|prem3s=persuase|prem3s2=''|prem3p=persuasero|prem3p2=''}}\n" +
48               "{{it-conj-ere|abbatt|avere}}\n" +
49               "{{it-conj-ire|copr|avere|pastp=coperto|prem1s2=copersi|prem3s2=coperse|prem3p2=copersero}}\n" +
50               "{{it-conj-ire-b|prefer|avere}}\n" +
51               "{{it-conj-urre|prod|avere}}\n" +
52               "{{it-conj-arsi|lav}}\n" +
53               "{{it-conj-ersi|abbatt}}\n" +
54               "{{it-conj-iarsi|annoi}}\n" +
55               "{{it-conj-carsi|coniug}}\n" +
56               "{{it-conj-ciarsi|affacc}}\n" +
57               "{{it-conj-irsi|vest}}\n" +
58               "{{it-conj-irsi-b|fer}}\n" +
59               "{{it-conj-ursi|rid|essere}}\n" +
60               "{{it-conj-cire|ricuc|avere}}\n" +
61               "{{it-conj-iarsi-b|riavvi|essere}}" +
62               "{{it-conj-fare|putre|avere}}\n" + 
63               "{{it-conj-cirsi|cuc|essere}}\n"
64               ;
65       final DictionaryBuilder db = new DictionaryBuilder("", Language.en, Language.it,  "", "", Collections.singleton("X"), Collections.singleton("X"));
66       WholeSectionToHtmlParser parser = new WholeSectionToHtmlParser(db.indexBuilders.get(0), "EN", "IT");
67       parser.title = "dummyTitle";
68       parser.entrySource = new EntrySource(0, "dummySource", 0);
69       parser.parseSection("dummyHeading", toParse);
70       db.build();
71       
72       final String dictName = "testItConj.html";
73       final PrintStream out = new PrintStream(new File(TEST_OUTPUTS, dictName));
74       db.dictionary.print(out);
75       out.close();
76       
77       assertFilesEqual(GOLDENS + dictName, TEST_OUTPUTS + dictName);
78   }
79   
80   public void doTestCustomDict(final String name, final String lang1,
81       final String lang2, final String inputFile) throws Exception {
82     final File result = new File(TEST_OUTPUTS + name);
83     System.out.println("Writing to: " + result);
84     DictionaryBuilder.main(new String[] {
85         "--dictOut=" + result.getAbsolutePath(),
86         "--lang1=" + lang1,
87         "--lang2=" + lang2,
88         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
89         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
90         "--dictInfo=bleh.",
91         
92         "--input1=testdata/inputs/" + inputFile,
93         "--input1Name=my_input_" + name,
94         "--input1Charset=ISO-8859-1",
95         "--input1Format=tab_separated",
96
97         "--print=" + result.getPath() + ".text",
98     });
99     
100     checkGolden(name, result); 
101   }
102   
103   public void test_FR_NL() throws Exception {
104     doTestCustomDict("QuickDic-FR-NL.quickdic", "FR", "NL", "QuickDic-FR-NL.txt");
105   }
106   
107   public void testWiktionary_en_de2fr() throws Exception {
108     wiktionaryTestWithEnTrans2Trans("wiktionary.de_fr.quickdic", "DE", "FR");
109   }
110
111   public void wiktionaryTestWithEnTrans2Trans(final String name, final String lang1,
112       final String lang2) throws Exception {
113     final File result = new File(TEST_OUTPUTS + name);
114     System.out.println("Writing to: " + result);
115     DictionaryBuilder.main(new String[] {
116         "--dictOut=" + result.getAbsolutePath(),
117         "--lang1=" + lang1,
118         "--lang2=" + lang2,
119         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
120         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
121         "--dictInfo=SomeWikiDataTrans2Trans",
122
123         "--input4=" + WIKISPLIT_EN + "EN.data",
124         "--input4Name=" + name,
125         "--input4Format=" + EnTranslationToTranslationParser.NAME,
126         "--input4LangPattern1=" + lang1,
127         "--input4LangPattern2=" + lang2,
128         "--input4PageLimit=1000",
129
130         "--print=" + result.getPath() + ".text",
131     });
132     
133     checkGolden(name, result); 
134   }
135
136   public void testWiktionary_WholeSection_DE() throws Exception {
137     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.DE.quickdic", "DE", 100);
138   }
139
140   public void testWiktionary_WholeSection_EN() throws Exception {
141     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.EN.quickdic", "EN", 100);
142   }
143
144   public void testWiktionary_WholeSection_IT() throws Exception {
145     // Have to run to 800 to get a few verb conjugations (including essere!)
146     wiktionaryTestWithWholeSectionToHtml("wiktionary.WholeSection.IT.quickdic", "IT", 800);
147   }
148
149   public void wiktionaryTestWithWholeSectionToHtml(final String name, final String langCode, final int pageLimit) throws Exception {
150     final File result = new File(TEST_OUTPUTS + name);
151     System.out.println("Writing to: " + result);
152     DictionaryBuilder.main(new String[] {
153         "--dictOut=" + result.getAbsolutePath(),
154         "--lang1=" + langCode,
155         "--lang2=" + "EN",
156         "--lang1Stoplist=" + STOPLISTS + "empty.txt",
157         "--lang2Stoplist=" + STOPLISTS + "empty.txt",
158         "--dictInfo=SomeWikiDataWholeSection",
159
160         "--input4=" + WIKISPLIT_EN + langCode + ".data",
161         "--input4Name=" + name,
162         "--input4Format=" + WholeSectionToHtmlParser.NAME,
163         "--input4WiktionaryLang=EN",
164         "--input4SkipLang=" + langCode,
165         "--input4TitleIndex=" + "1",
166         "--input4PageLimit=" + pageLimit,
167
168         "--print=" + result.getPath() + ".text",
169     });
170     checkGolden(name, result); 
171   }
172
173   
174   public void testWiktionary_IT_EN() throws Exception {
175     wiktionaryTestWithLangToEn("wiktionary.it_en.quickdic", "IT", "it.txt",
176         "EN.data", "enwiktionary.english", "Italian", "it", 1000);
177   }
178
179   public void testWiktionary_ZH_EN() throws Exception {
180     wiktionaryTestWithLangToEn("wiktionary.zh_en.quickdic", "ZH", "empty.txt",
181         // These missing "e" prevents a complete match, forcing the name to be printed
182         "EN.data", "enwiktionary.english", "Chinese|Mandarin|Cantones", "zh", 1000);
183   }
184
185   public void testWiktionary_DE_EN() throws Exception {
186     wiktionaryTestWithLangToEn("wiktionary.de_en.quickdic", "DE", "de.txt",
187         "EN.data", "enwiktionary.english", "German", "de", 1000);
188   }
189
190   public void testWiktionary_IT_IT() throws Exception {
191     wiktionaryTestWithLangToEn("wiktionary.it_it.quickdic", "IT", "it.txt",
192         "IT.data", "enwiktionary.italian", "Italian", "it", 1000);
193   }
194
195   // French
196   public void testWiktionary_FR_FR() throws Exception {
197     wiktionaryTestWithLangToEn("wiktionary.fr_fr.quickdic", "FR", "fr.txt",
198         "FR.data", "enwiktionary.french", "French", "fr", 1000);
199   }
200
201   
202   // Arabic
203   public void testWiktionary_AR_AR() throws Exception {
204       // Arabic is really big for some reason, use fewer pages.
205     wiktionaryTestWithLangToEn("wiktionary.ar_ar.quickdic", "AR", "empty.txt",
206         "AR.data", "enwiktionary.arabic", "Arabic", "ar", 200);
207   }
208
209   // Chinese
210   public void testWiktionary_ZH_ZH() throws Exception {
211     wiktionaryTestWithLangToEn("wiktionary.zh_zh.quickdic", "ZH", "empty.txt",
212         // These missing "e" prevents a complete match, forcing the name to be printed.
213         "ZH.data", "enwiktionary.chinese", "Chinese|Mandarin|Cantones", "zh", 1000);
214   }
215
216   // German
217   public void testWiktionary_DE_DE() throws Exception {
218     wiktionaryTestWithLangToEn("wiktionary.de_de.quickdic", "DE", "de.txt",
219         "DE.data", "enwiktionary.german", "German", "de", 1000);
220   }
221
222   // Thai
223   public void testWiktionary_TH_TH() throws Exception {
224     wiktionaryTestWithLangToEn("wiktionary.th_th.quickdic", "TH", "empty.txt",
225         // These missing "e" prevents a complete match, forcing the name to be printed.
226         "TH.data", "enwiktionary.thai", "Thai", "th", 1000);
227   }
228
229   public void wiktionaryTestWithLangToEn(final String name, final String lang1,
230       final String stoplist, final String data, final String dictName,
231       final String langPattern, final String langCode, int pageLimit) throws Exception {
232     final File result = new File(TEST_OUTPUTS + name);
233     System.out.println("Writing to: " + result);
234     final String type = data.equals("EN.data") ? "EnToTranslation" : "EnForeign";
235     DictionaryBuilder.main(new String[] {
236         "--dictOut=" + result.getAbsolutePath(),
237         "--lang1=" + lang1,
238         "--lang2=EN",
239         "--lang1Stoplist=" + STOPLISTS + stoplist,
240         "--lang2Stoplist=" + STOPLISTS + "en.txt",
241         "--dictInfo=SomeWikiData",
242
243         "--input4=" + WIKISPLIT_EN + data,
244         "--input4Name=" + dictName,
245         "--input4Format=enwiktionary",
246         "--input4WiktionaryType=" + type,
247         "--input4LangPattern=" + langPattern,
248         "--input4LangCodePattern=" + langCode,
249         "--input4EnIndex=2",
250         "--input4PageLimit=" + pageLimit,
251
252         "--print=" + result.getPath() + ".text",
253     });
254     
255     checkGolden(name, result); 
256   }
257
258   public void testGermanCombined() throws Exception {
259     final String name = "de-en.quickdic";
260     final File result = new File(TEST_OUTPUTS + name);
261     System.out.println("Writing to: " + result);
262     DictionaryBuilder.main(new String[] {
263         "--dictOut=" + result.getAbsolutePath(),
264         "--lang1=DE",
265         "--lang2=EN",
266         "--dictInfo=@" + TEST_INPUTS + "de-en_dictInfo.txt",
267
268         "--input1=" + TEST_INPUTS + "de-en_chemnitz_100",
269         "--input1Name=chemnitz",
270         "--input1Charset=UTF8",
271         "--input1Format=chemnitz",
272
273         "--input2=" + TEST_INPUTS + "de-en_dictcc_simulated",
274         "--input2Name=dictcc",
275         "--input2Charset=UTF8",
276         "--input2Format=tab_separated",
277
278         "--print=" + result.getPath() + ".text",
279     });
280     
281     checkGolden(name, result); 
282   }
283
284   private void checkGolden(final String dictName, final File dictFile)
285       throws IOException, FileNotFoundException {
286     // Check it once:
287     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
288
289     // Check it again.
290     final Dictionary dict = new Dictionary(new RandomAccessFile(dictFile.getAbsolutePath(), "r"));
291     final PrintStream out = new PrintStream(new File(dictFile.getPath() + ".text"));
292     dict.print(out);
293     out.close();
294     assertFilesEqual(GOLDENS + dictName + ".text", dictFile.getPath() + ".text");
295   }
296
297
298   void assertFilesEqual(final String expected, final String actual) throws IOException {
299     final String expectedString = FileUtil.readToString(new File(expected));
300     final String actualString = FileUtil.readToString(new File(actual));
301     assertEquals(expectedString, actualString);
302   }
303
304   
305 }