2 *******************************************************************************
\r
3 * Copyright (C) 2002-2005, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
9 * Port From: ICU4C v2.1 : collate/CollationRegressionTest
\r
10 * Source File: $ICU4CRoot/source/test/intltest/regcoll.cpp
\r
13 package com.ibm.icu.dev.test.collator;
\r
15 import com.ibm.icu.dev.test.*;
\r
16 import com.ibm.icu.text.*;
\r
17 import java.util.Locale;
\r
18 import java.util.Comparator;
\r
19 import java.util.Arrays;
\r
22 public class CollationThaiTest extends TestFmwk {
\r
// Cap on how many failures TestDictionary describes in detail. A negative value means
// "no cap": the reporting guard in TestDictionary tests MAX_FAILURES_TO_SHOW < 0 first.
24 final int MAX_FAILURES_TO_SHOW = -1;
\r
// Command-line entry point: creates a CollationThaiTest and passes the arguments to run().
26 public static void main(String[] args) throws Exception {
\r
27 new CollationThaiTest().run(args);
\r
31 * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort",
\r
32 * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip
\r
// Checks a table of Thai corner cases with the Thai collator. The table is a flat list of
// (left string, operator, right string) triples; it is handed to compareArray as TESTS.
// NOTE(review): this listing elides some lines of the method (the embedded line numbers
// jump), including the line that declares the table. Presumably the entries below are
// the contents of TESTS; confirm against the complete file.
34 public void TestCornerCases() {
\r
36 // Shorter words precede longer
\r
37 "\u0e01", "<", "\u0e01\u0e01",
\r
39 // Tone marks are considered after letters (i.e. are primary ignorable)
\r
40 "\u0e01\u0e32", "<", "\u0e01\u0e49\u0e32",
\r
42 // ditto for other over-marks
\r
43 "\u0e01\u0e32", "<", "\u0e01\u0e32\u0e4c",
\r
45 // commonly used mark-in-context order.
\r
46 // In effect, marks are sorted after each syllable.
\r
47 "\u0e01\u0e32\u0e01\u0e49\u0e32", "<", "\u0e01\u0e48\u0e32\u0e01\u0e49\u0e32",
\r
49 // Hyphens and other punctuation follow whitespace but come before letters
\r
50 "\u0e01\u0e32", "<", "\u0e01\u0e32-",
\r
51 "\u0e01\u0e32-", "<", "\u0e01\u0e32\u0e01\u0e32",
\r
53 // Doubler follows an identical word without the doubler
\r
54 "\u0e01\u0e32", "<", "\u0e01\u0e32\u0e46",
\r
55 "\u0e01\u0e32\u0e46", "<", "\u0e01\u0e32\u0e01\u0e32",
\r
57 // \u0e45 after either \u0e24 or \u0e26 is treated as a single
\r
58 // combining character, similar to "c < ch" in traditional Spanish.
\r
59 // TODO: beef up this case
\r
60 "\u0e24\u0e29\u0e35", "<", "\u0e24\u0e45\u0e29\u0e35",
\r
61 "\u0e26\u0e29\u0e35", "<", "\u0e26\u0e45\u0e29\u0e35",
\r
63 // Vowels reorder, should compare \u0e2d and \u0e34
\r
64 "\u0e40\u0e01\u0e2d", "<", "\u0e40\u0e01\u0e34",
\r
66 // Tones are compared after the rest of the word (e.g. primary ignorable)
\r
67 "\u0e01\u0e32\u0e01\u0e48\u0e32", "<", "\u0e01\u0e49\u0e32\u0e01\u0e32",
\r
69 // Periods are ignored entirely
\r
70 "\u0e01.\u0e01.", "<", "\u0e01\u0e32",
\r
73 RuleBasedCollator coll = null;
\r
// Obtain the Thai collator from getThaiCollator().
75 coll = getThaiCollator();
\r
76 } catch (Exception e) {
\r
// A collator that cannot be constructed is reported as a warning rather than an error.
// NOTE(review): what follows the warning (for example an early return) is elided here.
77 warnln("could not construct Thai collator");
\r
// Run every triple in the table through the shared comparison helper.
80 compareArray(coll, TESTS);
\r
// Runs a flat table of comparison cases against collator c.
// The table is read three entries at a time: tests[i] is the left string, tests[i+1] is
// the operator ("<", ">" or "=") and tests[i+2] is the right string. Any other operator
// is reported through errln. Each case is passed to CollationTest.doTest together with
// the value of `expect`.
// NOTE(review): the declaration of `expect` and the assignments inside each branch are
// elided in this listing, so the value chosen per operator is not visible here.
83 void compareArray(RuleBasedCollator c, String[] tests) {
\r
// Step by 3 because each test case occupies three consecutive array slots.
84 for (int i = 0; i < tests.length; i += 3) {
\r
86 if (tests[i+1].equals("<")) {
\r
88 } else if (tests[i+1].equals(">")) {
\r
90 } else if (tests[i+1].equals("=")) {
\r
93 // expect = Integer.decode(tests[i+1]).intValue();
\r
94 errln("Error: unknown operator " + tests[i+1]);
\r
97 String s1 = tests[i];
\r
98 String s2 = tests[i+2];
\r
99 CollationTest.doTest(this, c, s1, s2, expect);
\r
104 if (i < 0) return -1;
\r
105 if (i > 0) return 1;
\r
110 * Read the external dictionary file, which is already in proper
\r
111 * sorted order, and confirm that the collator compares each line as
\r
112 * preceding the following line.
\r
// Reads the word list "riwords.txt" line by line and checks, for every word after the
// first, that the Thai collator orders the previous word before it (expected result -1).
// Empty lines and lines starting with '#' are skipped.
// NOTE(review): this listing elides several lines of the method (the embedded line
// numbers jump). The declarations and updates of `line`, `wordCount` and `failed`, the
// `try` openers and the code that emits `msg` are not visible here.
114 public void TestDictionary() {
\r
115 RuleBasedCollator coll = null;
\r
117 coll = getThaiCollator();
\r
118 } catch (Exception e) {
\r
// A collator that cannot be constructed is reported as a warning rather than an error.
119 warnln("could not construct Thai collator");
\r
123 // Read in a dictionary of Thai words
\r
124 BufferedReader in = null;
\r
125 String fileName = "riwords.txt";
\r
// The data file is opened through TestUtil with an explicit UTF-8 charset name.
127 in = TestUtil.getDataReader(fileName, "UTF-8");
\r
128 } catch (SecurityException e) {
\r
129 warnln("Security exception encountered reading test data file.");
\r
131 } catch (Exception e) {
\r
// NOTE(review): this catch block is empty, so the IOException is silently ignored. The
// statement it guards is elided in this listing.
136 } catch (IOException ioe) {}
\r
137 errln("Error: could not open test file: " + fileName
\r
138 + ". Aborting test.");
\r
143 // Loop through each word in the dictionary and compare it to the previous
\r
144 // word. They should be in sorted order.
\r
146 String lastWord = "";
\r
151 String word = in.readLine();
\r
// readLine() returns null at end of input, which ends the loop.
152 while (word != null) {
\r
155 // Skip comments and blank lines
\r
// 0x23 is the character '#'.
156 if (word.length() == 0 || word.charAt(0) == 0x23) {
\r
157 word = in.readLine();
\r
161 // Show the first 8 words being compared, so we can see what's happening
\r
163 if (wordCount <= 8) {
\r
164 logln("Word " + wordCount + ": " + word);
\r
// The very first word has no predecessor (lastWord starts as ""), so nothing is compared.
167 if (lastWord.length() > 0) {
\r
168 CollationTest.doTest(this, coll, lastWord, word, -1);
\r
// The comparison is repeated here so its raw result can be quoted in the message below.
169 int result = coll.compare(lastWord, word);
\r
// A negative MAX_FAILURES_TO_SHOW disables the cap on detailed messages.
173 if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) {
\r
174 String msg = "--------------------------------------------\n"
\r
176 + " compare(" + lastWord
\r
177 + ", " + word + ") returned " + result
\r
178 + ", expected -1\n";
\r
179 CollationKey k1, k2;
\r
181 k1 = coll.getCollationKey(lastWord);
\r
182 k2 = coll.getCollationKey(word);
\r
183 } catch (Exception e) {
\r
184 errln("Fail: getCollationKey returned ");
\r
// Append both collation keys, rendered by prettify(), to the failure message.
187 msg += "key1: " + prettify(k1) + "\n"
\r
188 + "key2: " + prettify(k2);
\r
194 word = in.readLine();
\r
196 } catch (IOException e) {
\r
197 errln("IOException " + e.getMessage());
\r
// NOTE(review): unlike the guard above, this check does not treat a negative
// MAX_FAILURES_TO_SHOW as "no cap". With the current value of -1 the condition holds for
// every non-negative `failed`, and the message would read "only the first -1 failures
// were shown". Whether an enclosing check on `failed` exists is not visible here.
201 if (failed > MAX_FAILURES_TO_SHOW) {
\r
202 errln("Too many failures; only the first " +
\r
203 MAX_FAILURES_TO_SHOW + " failures were shown");
\r
205 errln("Summary: " + failed + " of " + (line - 1) +
\r
206 " comparisons failed");
\r
209 logln("Words checked: " + wordCount);
\r
// Sorts six strings built from \u0E44 and \u0E01 with a StrCmp comparator, then checks
// that the Thai collator never ranks an earlier sorted element above a later one.
// The collation element iterator for tests[i] is also passed to
// CollationTest.backAndForth.
// NOTE(review): this listing elides some lines of the method (the embedded line numbers
// jump), including the declaration of `comparator`, the end of the array initializer and
// the remainder of the errln message.
212 public void TestInvalidThai()
\r
214 String tests[] = { "\u0E44\u0E01\u0E44\u0E01",
\r
215 "\u0E44\u0E01\u0E01\u0E44",
\r
216 "\u0E01\u0E44\u0E01\u0E44",
\r
217 "\u0E01\u0E01\u0E44\u0E44",
\r
218 "\u0E44\u0E44\u0E01\u0E01",
\r
219 "\u0E01\u0E44\u0E44\u0E01",
\r
222 RuleBasedCollator collator;
\r
225 collator = getThaiCollator();
\r
226 comparator = new StrCmp();
\r
227 } catch (Exception e) {
\r
// A collator that cannot be constructed is reported as a warning rather than an error.
228 warnln("could not construct Thai collator");
\r
// Sort in place using the collator-backed comparator.
232 Arrays.sort(tests, comparator);
\r
234 for (int i = 0; i < tests.length; i ++)
\r
// Compare each element only with the elements that come after it in sorted order.
236 for (int j = i + 1; j < tests.length; j ++) {
\r
237 if (collator.compare(tests[i], tests[j]) > 0) {
\r
238 // inconsistent ordering found!
\r
239 errln("Inconsistent ordering between strings " + i
\r
243 CollationElementIterator iterator
\r
244 = collator.getCollationElementIterator(tests[i]);
\r
245 CollationTest.backAndForth(this, iterator);
\r
// Runs two sets of comparison triples through compareArray.
// The first set is checked with the Thai collator. Each case places \u0E41 in front of
// other text, and the same six kinds of case are repeated in four layouts: bare, with an
// "abc" prefix, with an "abc" suffix, and with both.
// The second set is a single contraction case checked with a collator built from the
// rule "& c < ab".
// NOTE(review): this listing elides some lines of the method (the embedded line numbers
// jump), including the declaration that opens the first table and the `try` openers.
249 public void TestReordering()
\r
252 "\u0E41c\u0301", "=", "\u0E41\u0107", // composition
\r
253 "\u0E41\uD835\uDFCE", "<", "\u0E41\uD835\uDFCF", // supplementaries
\r
254 "\u0E41\uD834\uDD5F", "=", "\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary
\r
255 "\u0E41\uD87E\uDC02", "=", "\u0E41\u4E41", // supplementary composition decomps to BMP
\r
256 "\u0E41\u0301", "=", "\u0E41\u0301", // unsafe (just checking backwards iteration)
\r
257 "\u0E41\u0301\u0316", "=", "\u0E41\u0316\u0301",
\r
259 "abc\u0E41c\u0301", "=", "abc\u0E41\u0107", // composition
\r
260 "abc\u0E41\uD834\uDC00", "<", "abc\u0E41\uD834\uDC01", // supplementaries
\r
261 "abc\u0E41\uD834\uDD5F", "=", "abc\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary
\r
262 "abc\u0E41\uD87E\uDC02", "=", "abc\u0E41\u4E41", // supplementary composition decomps to BMP
\r
263 "abc\u0E41\u0301", "=", "abc\u0E41\u0301", // unsafe (just checking backwards iteration)
\r
264 "abc\u0E41\u0301\u0316", "=", "abc\u0E41\u0316\u0301",
\r
266 "\u0E41c\u0301abc", "=", "\u0E41\u0107abc", // composition
\r
267 "\u0E41\uD834\uDC00abc", "<", "\u0E41\uD834\uDC01abc", // supplementaries
\r
268 "\u0E41\uD834\uDD5Fabc", "=", "\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary
\r
269 "\u0E41\uD87E\uDC02abc", "=", "\u0E41\u4E41abc", // supplementary composition decomps to BMP
\r
270 "\u0E41\u0301abc", "=", "\u0E41\u0301abc", // unsafe (just checking backwards iteration)
\r
271 "\u0E41\u0301\u0316abc", "=", "\u0E41\u0316\u0301abc",
\r
273 "abc\u0E41c\u0301abc", "=", "abc\u0E41\u0107abc", // composition
\r
274 "abc\u0E41\uD834\uDC00abc", "<", "abc\u0E41\uD834\uDC01abc", // supplementaries
\r
275 "abc\u0E41\uD834\uDD5Fabc", "=", "abc\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary
\r
276 "abc\u0E41\uD87E\uDC02abc", "=", "abc\u0E41\u4E41abc", // supplementary composition decomps to BMP
\r
277 "abc\u0E41\u0301abc", "=", "abc\u0E41\u0301abc", // unsafe (just checking backwards iteration)
\r
278 "abc\u0E41\u0301\u0316abc", "=", "abc\u0E41\u0316\u0301abc",
\r
281 RuleBasedCollator collator;
\r
283 collator = (RuleBasedCollator)getThaiCollator();
\r
284 } catch (Exception e) {
\r
// A Thai collator that cannot be constructed is reported as a warning rather than an error.
285 warnln("could not construct Thai collator");
\r
288 compareArray(collator, tests);
\r
// Second part: a collator built from a one-line rule, checked with one contraction case.
290 String rule = "& c < ab";
\r
291 String testcontraction[] = { "\u0E41ab", ">", "\u0E41c"};
\r
293 collator = new RuleBasedCollator(rule);
\r
294 } catch (Exception e) {
\r
// Unlike the Thai collator case above, failure to build this collator is an error.
295 errln("Error: could not construct collator with rule " + rule);
\r
298 compareArray(collator, testcontraction);
\r
// Builds a printable form of a collation key: starts with "[" and appends the hex form
// of each byte from sourceKey.toByteArray().
// NOTE(review): the declaration of `i`, the rest of the loop body and the return
// statement are elided in this listing.
311 String prettify(CollationKey sourceKey) {
\r
313 byte[] bytes= sourceKey.toByteArray();
\r
314 String target = "[";
\r
316 for (i = 0; i < bytes.length; i++) {
\r
// NOTE(review): Integer.toHexString takes an int, so a negative byte is sign-extended and
// printed as eight hex digits (for example ffffff80). Masking with 0xff would give two.
317 target += Integer.toHexString(bytes[i]);
\r
324 // private inner class -------------------------------------------------
\r
// Comparator that orders its arguments with the Thai collator: the constructor obtains
// the collator from getThaiCollator() and compare() delegates to collator.compare().
// NOTE(review): the declaration of the `collator` field and the braces are elided in
// this listing.
326 private static final class StrCmp implements Comparator
\r
328 public int compare(Object string1, Object string2)
\r
330 return collator.compare(string1, string2);
\r
// Propagates any exception thrown while obtaining the Thai collator.
333 StrCmp() throws Exception
\r
335 collator = getThaiCollator();
\r
341 // private data members ------------------------------------------------
\r
// Cached Thai collator; it is assigned by getThaiCollator() the first time it is found null.
343 private static RuleBasedCollator m_collator_;
\r
345 // private methods -----------------------------------------------------
\r
// Returns the shared Thai collator. On the first call (while m_collator_ is still null)
// it obtains a collator for Locale("th", "TH", "") from Collator.getInstance, casts it
// to RuleBasedCollator and stores it in m_collator_; later calls return the stored one.
347 private static RuleBasedCollator getThaiCollator() throws Exception
\r
349 if (m_collator_ == null) {
\r
350 m_collator_ = (RuleBasedCollator)Collator.getInstance(
\r
351 new Locale("th", "TH", ""));
\r
353 return m_collator_;
\r