2 *******************************************************************************
\r
3 * Copyright (C) 2002-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
9 * Port From: ICU4C v2.1 : collate/CollationRegressionTest
\r
10 * Source File: $ICU4CRoot/source/test/intltest/regcoll.cpp
\r
13 package com.ibm.icu.dev.test.collator;
\r
15 import java.io.BufferedReader;
\r
16 import java.io.IOException;
\r
17 import java.util.Arrays;
\r
18 import java.util.Comparator;
\r
19 import java.util.Locale;
\r
21 import com.ibm.icu.dev.test.TestFmwk;
\r
22 import com.ibm.icu.dev.test.TestUtil;
\r
23 import com.ibm.icu.text.CollationElementIterator;
\r
24 import com.ibm.icu.text.CollationKey;
\r
25 import com.ibm.icu.text.Collator;
\r
26 import com.ibm.icu.text.RuleBasedCollator;
\r
28 public class CollationThaiTest extends TestFmwk {
\r
30 final int MAX_FAILURES_TO_SHOW = -1;
\r
32 public static void main(String[] args) throws Exception {
\r
33 new CollationThaiTest().run(args);
\r
37 * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
\r
38 * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
\r
40 public void TestCornerCases() {
\r
42 // Shorter words precede longer
\r
43 "\u0e01", "<", "\u0e01\u0e01",
\r
45 // Tone marks are considered after letters (i.e. are primary ignorable)
\r
46 "\u0e01\u0e32", "<", "\u0e01\u0e49\u0e32",
\r
48 // ditto for other over-marks
\r
49 "\u0e01\u0e32", "<", "\u0e01\u0e32\u0e4c",
\r
51 // commonly used mark-in-context order.
\r
52 // In effect, marks are sorted after each syllable.
\r
53 "\u0e01\u0e32\u0e01\u0e49\u0e32", "<", "\u0e01\u0e48\u0e32\u0e01\u0e49\u0e32",
\r
55 // Hyphens and other punctuation follow whitespace but come before letters
\r
56 "\u0e01\u0e32", "<", "\u0e01\u0e32-",
\r
57 "\u0e01\u0e32-", "<", "\u0e01\u0e32\u0e01\u0e32",
\r
59 // Doubler follows an identical word without the doubler
\r
60 "\u0e01\u0e32", "<", "\u0e01\u0e32\u0e46",
\r
61 "\u0e01\u0e32\u0e46", "<", "\u0e01\u0e32\u0e01\u0e32",
\r
63 // \u0e45 after either \u0e24 or \u0e26 is treated as a single
\r
64 // combining character, similar to "c < ch" in traditional Spanish.
\r
65 // TODO: beef up this case
\r
66 "\u0e24\u0e29\u0e35", "<", "\u0e24\u0e45\u0e29\u0e35",
\r
67 "\u0e26\u0e29\u0e35", "<", "\u0e26\u0e45\u0e29\u0e35",
\r
69 // Vowels reorder, should compare \u0e2d and \u0e34
\r
70 "\u0e40\u0e01\u0e2d", "<", "\u0e40\u0e01\u0e34",
\r
72 // Tones are compared after the rest of the word (i.e. are primary ignorable)
\r
73 "\u0e01\u0e32\u0e01\u0e48\u0e32", "<", "\u0e01\u0e49\u0e32\u0e01\u0e32",
\r
75 // Periods are ignored entirely
\r
76 "\u0e01.\u0e01.", "<", "\u0e01\u0e32",
\r
79 RuleBasedCollator coll = null;
\r
81 coll = getThaiCollator();
\r
82 } catch (Exception e) {
\r
83 warnln("could not construct Thai collator");
\r
86 compareArray(coll, TESTS);
\r
89 void compareArray(RuleBasedCollator c, String[] tests) {
\r
90 for (int i = 0; i < tests.length; i += 3) {
\r
92 if (tests[i+1].equals("<")) {
\r
94 } else if (tests[i+1].equals(">")) {
\r
96 } else if (tests[i+1].equals("=")) {
\r
99 // expect = Integer.decode(tests[i+1]).intValue();
\r
100 errln("Error: unknown operator " + tests[i+1]);
\r
103 String s1 = tests[i];
\r
104 String s2 = tests[i+2];
\r
105 CollationTest.doTest(this, c, s1, s2, expect);
\r
110 if (i < 0) return -1;
\r
111 if (i > 0) return 1;
\r
116 * Read the external dictionary file, which is already in proper
\r
117 * sorted order, and confirm that the collator compares each line as
\r
118 * preceding the following line.
\r
120 public void TestDictionary() {
\r
121 RuleBasedCollator coll = null;
\r
123 coll = getThaiCollator();
\r
124 } catch (Exception e) {
\r
125 warnln("could not construct Thai collator");
\r
129 // Read in a dictionary of Thai words
\r
130 BufferedReader in = null;
\r
131 String fileName = "riwords.txt";
\r
133 in = TestUtil.getDataReader(fileName, "UTF-8");
\r
134 } catch (SecurityException e) {
\r
135 warnln("Security exception encountered reading test data file.");
\r
137 } catch (Exception e) {
\r
142 } catch (IOException ioe) {}
\r
143 errln("Error: could not open test file: " + fileName
\r
144 + ". Aborting test.");
\r
149 // Loop through each word in the dictionary and compare it to the previous
\r
150 // word. They should be in sorted order.
\r
152 String lastWord = "";
\r
157 String word = in.readLine();
\r
158 while (word != null) {
\r
161 // Skip comments and blank lines
\r
162 if (word.length() == 0 || word.charAt(0) == 0x23) {
\r
163 word = in.readLine();
\r
167 // Show the first 8 words being compared, so we can see what's happening
\r
169 if (wordCount <= 8) {
\r
170 logln("Word " + wordCount + ": " + word);
\r
173 if (lastWord.length() > 0) {
\r
174 CollationTest.doTest(this, coll, lastWord, word, -1);
\r
175 int result = coll.compare(lastWord, word);
\r
179 if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
\r
180 String msg = "--------------------------------------------\n"
\r
182 + " compare(" + lastWord
\r
183 + ", " + word + ") returned " + result
\r
184 + ", expected -1\n";
\r
185 CollationKey k1, k2;
\r
187 k1 = coll.getCollationKey(lastWord);
\r
188 k2 = coll.getCollationKey(word);
\r
189 } catch (Exception e) {
\r
190 errln("Fail: getCollationKey returned ");
\r
193 msg += "key1: " + prettify(k1) + "\n"
\r
194 + "key2: " + prettify(k2);
\r
200 word = in.readLine();
\r
202 } catch (IOException e) {
\r
203 errln("IOException " + e.getMessage());
\r
207 if (failed > MAX_FAILURES_TO_SHOW) {
\r
208 errln("Too many failures; only the first " +
\r
209 MAX_FAILURES_TO_SHOW + " failures were shown");
\r
211 errln("Summary: " + failed + " of " + (line - 1) +
\r
212 " comparisons failed");
\r
215 logln("Words checked: " + wordCount);
\r
218 public void TestInvalidThai()
\r
220 String tests[] = { "\u0E44\u0E01\u0E44\u0E01",
\r
221 "\u0E44\u0E01\u0E01\u0E44",
\r
222 "\u0E01\u0E44\u0E01\u0E44",
\r
223 "\u0E01\u0E01\u0E44\u0E44",
\r
224 "\u0E44\u0E44\u0E01\u0E01",
\r
225 "\u0E01\u0E44\u0E44\u0E01",
\r
228 RuleBasedCollator collator;
\r
231 collator = getThaiCollator();
\r
232 comparator = new StrCmp();
\r
233 } catch (Exception e) {
\r
234 warnln("could not construct Thai collator");
\r
238 Arrays.sort(tests, comparator);
\r
240 for (int i = 0; i < tests.length; i ++)
\r
242 for (int j = i + 1; j < tests.length; j ++) {
\r
243 if (collator.compare(tests[i], tests[j]) > 0) {
\r
244 // inconsistent ordering found!
\r
245 errln("Inconsistent ordering between strings " + i
\r
249 CollationElementIterator iterator
\r
250 = collator.getCollationElementIterator(tests[i]);
\r
251 CollationTest.backAndForth(this, iterator);
\r
255 public void TestReordering()
\r
258 "\u0E41c\u0301", "=", "\u0E41\u0107", // composition
\r
259 "\u0E41\uD835\uDFCE", "<", "\u0E41\uD835\uDFCF", // supplementaries
\r
260 "\u0E41\uD834\uDD5F", "=", "\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary
\r
261 "\u0E41\uD87E\uDC02", "=", "\u0E41\u4E41", // supplementary composition decomps to BMP
\r
262 "\u0E41\u0301", "=", "\u0E41\u0301", // unsafe (just checking backwards iteration)
\r
263 "\u0E41\u0301\u0316", "=", "\u0E41\u0316\u0301",
\r
265 "abc\u0E41c\u0301", "=", "abc\u0E41\u0107", // composition
\r
266 "abc\u0E41\uD834\uDC00", "<", "abc\u0E41\uD834\uDC01", // supplementaries
\r
267 "abc\u0E41\uD834\uDD5F", "=", "abc\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary
\r
268 "abc\u0E41\uD87E\uDC02", "=", "abc\u0E41\u4E41", // supplementary composition decomps to BMP
\r
269 "abc\u0E41\u0301", "=", "abc\u0E41\u0301", // unsafe (just checking backwards iteration)
\r
270 "abc\u0E41\u0301\u0316", "=", "abc\u0E41\u0316\u0301",
\r
272 "\u0E41c\u0301abc", "=", "\u0E41\u0107abc", // composition
\r
273 "\u0E41\uD834\uDC00abc", "<", "\u0E41\uD834\uDC01abc", // supplementaries
\r
274 "\u0E41\uD834\uDD5Fabc", "=", "\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary
\r
275 "\u0E41\uD87E\uDC02abc", "=", "\u0E41\u4E41abc", // supplementary composition decomps to BMP
\r
276 "\u0E41\u0301abc", "=", "\u0E41\u0301abc", // unsafe (just checking backwards iteration)
\r
277 "\u0E41\u0301\u0316abc", "=", "\u0E41\u0316\u0301abc",
\r
279 "abc\u0E41c\u0301abc", "=", "abc\u0E41\u0107abc", // composition
\r
280 "abc\u0E41\uD834\uDC00abc", "<", "abc\u0E41\uD834\uDC01abc", // supplementaries
\r
281 "abc\u0E41\uD834\uDD5Fabc", "=", "abc\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary
\r
282 "abc\u0E41\uD87E\uDC02abc", "=", "abc\u0E41\u4E41abc", // supplementary composition decomps to BMP
\r
283 "abc\u0E41\u0301abc", "=", "abc\u0E41\u0301abc", // unsafe (just checking backwards iteration)
\r
284 "abc\u0E41\u0301\u0316abc", "=", "abc\u0E41\u0316\u0301abc",
\r
287 RuleBasedCollator collator;
\r
289 collator = (RuleBasedCollator)getThaiCollator();
\r
290 } catch (Exception e) {
\r
291 warnln("could not construct Thai collator");
\r
294 compareArray(collator, tests);
\r
296 String rule = "& c < ab";
\r
297 String testcontraction[] = { "\u0E41ab", ">", "\u0E41c"};
\r
299 collator = new RuleBasedCollator(rule);
\r
300 } catch (Exception e) {
\r
301 errln("Error: could not construct collator with rule " + rule);
\r
304 compareArray(collator, testcontraction);
\r
317 String prettify(CollationKey sourceKey) {
\r
319 byte[] bytes= sourceKey.toByteArray();
\r
320 String target = "[";
\r
322 for (i = 0; i < bytes.length; i++) {
\r
323 target += Integer.toHexString(bytes[i]);
\r
330 // private inner class -------------------------------------------------
\r
332 private static final class StrCmp implements Comparator<String>
\r
334 public int compare(String string1, String string2)
\r
336 return collator.compare(string1, string2);
\r
339 StrCmp() throws Exception
\r
341 collator = getThaiCollator();
\r
347 // private data members ------------------------------------------------
\r
349 private static RuleBasedCollator m_collator_;
\r
351 // private methods -----------------------------------------------------
\r
353 private static RuleBasedCollator getThaiCollator() throws Exception
\r
355 if (m_collator_ == null) {
\r
356 m_collator_ = (RuleBasedCollator)Collator.getInstance(
\r
357 new Locale("th", "TH", ""));
\r
359 return m_collator_;
\r