2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and
\r
4 * others. All Rights Reserved.
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.normalizer;
\r
9 import java.util.Collection;
\r
10 import java.util.Iterator;
\r
11 import java.util.Set;
\r
12 import java.util.SortedSet;
\r
13 import java.util.TreeSet;
\r
15 import com.ibm.icu.dev.test.TestFmwk;
\r
16 import com.ibm.icu.impl.Utility;
\r
17 import com.ibm.icu.lang.UCharacter;
\r
18 import com.ibm.icu.text.CanonicalIterator;
\r
19 import com.ibm.icu.text.Normalizer;
\r
20 import com.ibm.icu.text.UTF16;
\r
23 // TODO: fit into test framework
\r
25 public class TestCanonicalIterator extends TestFmwk {
\r
// Display switch read by getReadable(): when true, an extra hex(s) + "; " segment is placed before the hex form.
27 static final boolean SHOW_NAMES = false;
\r
// Command-line entry point: creates a TestCanonicalIterator and hands the arguments to its run(args) method.
// NOTE(review): run(String[]) is not defined in the visible lines; presumably inherited from TestFmwk - confirm.
// NOTE(review): the closing line of this method is not visible in this listing.
29 public static void main(String[] args) throws Exception {
\r
30 new TestCanonicalIterator().run(args);
\r
// Sample data for TestBasic(). Each row holds two strings: row[0] is passed to the
// CanonicalIterator constructor, and row[1] is the expected output, i.e. the iterator's
// results sorted through a TreeSet and joined with ", " by collectionToString().
// NOTE(review): the end of this initializer is not visible in this listing; there may be further rows - confirm.
33 static final String testArray[][] = {
\r
34 {"\u00C5d\u0307\u0327", "A\u030Ad\u0307\u0327, A\u030Ad\u0327\u0307, A\u030A\u1E0B\u0327, "
\r
35 + "A\u030A\u1E11\u0307, \u00C5d\u0307\u0327, \u00C5d\u0327\u0307, "
\r
36 + "\u00C5\u1E0B\u0327, \u00C5\u1E11\u0307, \u212Bd\u0307\u0327, "
\r
37 + "\u212Bd\u0327\u0307, \u212B\u1E0B\u0327, \u212B\u1E11\u0307"},
\r
38 {"\u010d\u017E", "c\u030Cz\u030C, c\u030C\u017E, \u010Dz\u030C, \u010D\u017E"},
\r
39 {"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
\r
// Runs characterTest() over code points 0 .. 0x10FFFE, once for the character alone and once
// with U+0345 appended. Code points whose UCharacter.getType() is UNASSIGNED, PRIVATE_USE or
// SURROGATE are skipped. A "Testing <hex>" progress line is logged every 5000 tested code points.
// NOTE(review): the loop condition is `i < 0x10FFFF`, so U+10FFFF itself is never visited;
// confirm whether `<=` was intended.
// NOTE(review): `counter` is used below but its declaration is not visible in this listing,
// `slowIt` is configured but not used in the visible lines, and closing braces are missing -
// confirm against the complete file before editing.
42 public void TestExhaustive() {
\r
44 CanonicalIterator it = new CanonicalIterator("");
\r
46 CanonicalIterator slowIt = new CanonicalIterator("");
\r
47 slowIt.SKIP_ZEROS = false;
\r
49 //Transliterator name = Transliterator.getInstance("[^\\u0020-\\u007F] name");
\r
50 //Set itSet = new TreeSet();
\r
51 //Set slowItSet = new TreeSet();
\r
54 for (int i = 0; i < 0x10FFFF; ++i) {
\r
56 // skip characters we know don't have decomps
\r
57 int type = UCharacter.getType(i);
\r
58 if (type == Character.UNASSIGNED || type == Character.PRIVATE_USE
\r
59 || type == Character.SURROGATE) continue;
\r
61 if ((++counter % 5000) == 0) logln("Testing " + Utility.hex(i,0));
\r
63 String s = UTF16.valueOf(i);
\r
64 characterTest(s, i, it);
\r
66 characterTest(s + "\u0345", i, it);
\r
// Rough timing of CanonicalIterator over the two-character string U+AC01 U+0345.
// Returns 0 immediately unless isVerbose() is true; otherwise each timed section runs
// `iterations` passes and logs (end - start) / iterations through logln().
// `x` accumulates item lengths only so that the loop has an observable result.
// NOTE(review): in this listing `slowDelta` is declared twice, `start`/`end` have no visible
// declaration, and the inner loops, closing braces and the final return are missing.
// Presumably part of the "slow" section is disabled in the complete file - confirm before editing.
// NOTE(review): if start/end are integral types, (end-start) / iterations is an integer
// division and sub-millisecond averages would truncate to 0 - confirm their declared type.
70 public int TestSpeed() {
\r
71 // skip unless verbose
\r
72 if (!isVerbose()) return 0;
\r
74 String s = "\uAC01\u0345";
\r
76 CanonicalIterator it = new CanonicalIterator(s);
\r
78 int x = 0; // just to keep code from optimizing away.
\r
79 int iterations = 10000;
\r
80 double slowDelta = 0;
\r
83 CanonicalIterator slowIt = new CanonicalIterator(s);
\r
84 slowIt.SKIP_ZEROS = false;
\r
86 start = System.currentTimeMillis();
\r
87 for (int i = 0; i < iterations; ++i) {
\r
88 slowIt.setSource(s);
\r
90 String item = slowIt.next();
\r
91 if (item == null) break;
\r
95 end = System.currentTimeMillis();
\r
96 double slowDelta = (end-start) / iterations;
\r
97 logln("Slow iteration: " + slowDelta);
\r
100 start = System.currentTimeMillis();
\r
101 for (int i = 0; i < iterations; ++i) {
\r
104 String item = it.next();
\r
105 if (item == null) break;
\r
106 x += item.length();
\r
109 end = System.currentTimeMillis();
\r
110 double fastDelta = (end-start) / iterations;
\r
111 logln("Fast iteration: " + fastDelta + (slowDelta != 0 ? ", " + (fastDelta/slowDelta) : ""));
\r
// Checks CanonicalIterator.permute() on "ABC" and then every testArray row: the iterator's
// results are collected in a TreeSet (which sorts them), joined with collectionToString(),
// and compared with the row's expected string via expectEqual(). Afterwards it.next() is
// compared with `first`, and getSource() is compared with the NFD form of the row's input.
// NOTE(review): in this listing `first` is never assigned, `set` is never cleared between
// rows, the inner loop header is absent and no reset() call is visible before the
// "reset() failed" check; these lines are presumably missing from the listing - confirm.
117 public void TestBasic() {
\r
118 // This is not interesting anymore as the data is already built
\r
122 // UnicodeSet ss = CanonicalIterator.getSafeStart();
\r
123 // logln("Safe Start: " + ss.toPattern(true));
\r
124 // ss = CanonicalIterator.getStarts('a');
\r
125 // expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
\r
126 // new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
\r
127 // + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
\r
131 // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
\r
133 Set results = new TreeSet();
\r
134 CanonicalIterator.permute("ABC", false, results);
\r
135 expectEqual("Simple permutation ", "", collectionToString(results), "ABC, ACB, BAC, BCA, CAB, CBA");
\r
138 SortedSet set = new TreeSet();
\r
139 for (int i = 0; i < testArray.length; ++i) {
\r
140 //logln("Results for: " + name.transliterate(testArray[i]));
\r
141 CanonicalIterator it = new CanonicalIterator(testArray[i][0]);
\r
142 // int counter = 0;
\r
144 String first = null;
\r
146 String result = it.next();
\r
150 if (result == null) break;
\r
151 set.add(result); // sort them
\r
152 //logln(++counter + ": " + hex.transliterate(result));
\r
153 //logln(" = " + name.transliterate(result));
\r
155 expectEqual(i + ": ", testArray[i][0], collectionToString(set), testArray[i][1]);
\r
157 if(!it.next().equals(first)){
\r
158 errln("CanonicalIterator.reset() failed");
\r
160 if(!it.getSource().equals(Normalizer.normalize(testArray[i][0],Normalizer.NFD))){
\r
161 errln("CanonicalIterator.getSource() does not return NFD of input source");
\r
// Compares a with b using a.equals(b), so a must not be null.
// On mismatch, reports "FAIL: " + message + the readable form of item, followed by the
// readable forms of a and b, through errln(). The logln() calls emit the same three values
// prefixed with "Checked: ".
// NOTE(review): the line separating the errln() and logln() groups is missing from this
// listing, so it is unclear here whether the logln() group is an else-branch or runs
// unconditionally - confirm against the complete file.
166 public void expectEqual(String message, String item, Object a, Object b) {
\r
167 if (!a.equals(b)) {
\r
168 errln("FAIL: " + message + getReadable(item));
\r
169 errln("\t" + getReadable(a));
\r
170 errln("\t" + getReadable(b));
\r
172 logln("Checked: " + message + getReadable(item));
\r
173 logln("\t" + getReadable(a));
\r
174 logln("\t" + getReadable(b));
\r
178 //Transliterator name = null;
\r
179 //Transliterator hex = null;
\r
181 public String getReadable(Object obj) {
\r
182 if (obj == null) return "null";
\r
183 String s = obj.toString();
\r
184 if (s.length() == 0) return "";
\r
185 // set up for readable display
\r
186 //if (name == null) name = Transliterator.getInstance("[^\\ -\\u007F] name");
\r
187 //if (hex == null) hex = Transliterator.getInstance("[^\\ -\\u007F] hex");
\r
188 return "[" + (SHOW_NAMES ? hex(s) + "; " : "") + hex(s) + "]";
\r
// Walks the results of `it` and records whether they include the source string s, its
// decomposition (Normalizer.decompose(s, false)) and its composition
// (Normalizer.compose(s, false)); reports "FAIL CanonicalIterator: ..." through errln()
// if any of the three was not seen. Returns early when s equals both its decomposition
// and its composition. In the visible lines, ch is used only in the condition that gates
// the per-item logln() output.
// NOTE(review): that condition uses 0xAD00 as the lower bound but 0xAC00 + 11172 as the
// upper bound; presumably it is meant to exclude Hangul syllables from logging - confirm
// both bounds.
// NOTE(review): in this listing the loop headers, the call that points `it` at s, any
// update of mixedCounter, and the declarations of itSet, slowItSet and slowIt are not
// visible; the "zeros optimization" check may be disabled in the complete file - confirm.
191 public void characterTest(String s, int ch, CanonicalIterator it)
\r
193 int mixedCounter = 0;
\r
194 int lastMixedCounter = -1;
\r
195 boolean gotDecomp = false;
\r
196 boolean gotComp = false;
\r
197 boolean gotSource = false;
\r
198 String decomp = Normalizer.decompose(s, false);
\r
199 String comp = Normalizer.compose(s, false);
\r
201 // skip characters that don't have either decomp.
\r
202 // need quick test for this!
\r
203 if (s.equals(decomp) && s.equals(comp)) return;
\r
208 String item = it.next();
\r
209 if (item == null) break;
\r
210 if (item.equals(s)) gotSource = true;
\r
211 if (item.equals(decomp)) gotDecomp = true;
\r
212 if (item.equals(comp)) gotComp = true;
\r
213 if ((mixedCounter & 0x7F) == 0 && (ch < 0xAD00 || ch > 0xAC00 + 11172)) {
\r
214 if (lastMixedCounter != mixedCounter) {
\r
216 lastMixedCounter = mixedCounter;
\r
218 logln("\t" + mixedCounter + "\t" + hex(item)
\r
219 + (item.equals(s) ? "\t(*original*)" : "")
\r
220 + (item.equals(decomp) ? "\t(*decomp*)" : "")
\r
221 + (item.equals(comp) ? "\t(*comp*)" : "")
\r
227 // check that zeros optimization doesn't mess up.
\r
233 String item = it.next();
\r
234 if (item == null) break;
\r
237 slowIt.setSource(s);
\r
240 String item = slowIt.next();
\r
241 if (item == null) break;
\r
242 slowItSet.add(item);
\r
244 if (!itSet.equals(slowItSet)) {
\r
245 errln("Zero optimization failure with " + getReadable(s));
\r
251 if (!gotSource || !gotDecomp || !gotComp) {
\r
252 errln("FAIL CanonicalIterator: " + s + " decomp: " +decomp+" comp: "+comp);
\r
254 for(String item=it.next();item!=null;item=it.next()){
\r
261 static String collectionToString(Collection col) {
\r
262 StringBuffer result = new StringBuffer();
\r
263 Iterator it = col.iterator();
\r
264 while (it.hasNext()) {
\r
265 if (result.length() != 0) result.append(", ");
\r
266 result.append(it.next().toString());
\r
268 return result.toString();
\r