2 *******************************************************************************
3 * Copyright (C) 2009-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.dev.test.text;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
27 public class SpoofCheckerTest extends TestFmwk {
29 public static void main(String[] args) throws Exception {
30 new SpoofCheckerTest().run(args);
33 void TEST_ASSERT(boolean expr) {
34 if ((expr) == false) {
35 errln("Assertion Failure.\n");
39 void TEST_ASSERT_EQ(int a, int b) {
41 errln(String.format("Test Failure: %d != %d\n", a, b));
45 void TEST_ASSERT_NE(Object a, Object b) {
47 errln(String.format("Test Failure: (%s) == (%s) \n", a.toString(), b.toString()));
53 * Identifiers for verifying that spoof checking is minimally alive and working.
55 char[] goodLatinChars = { (char) 0x75, (char) 0x7a };
56 String goodLatin = new String(goodLatinChars); /* "uz", all ASCII */
57 /* (not confusable) */
58 char[] scMixedChars = { (char) 0x73, (char) 0x0441 };
59 String scMixed = new String(scMixedChars); /* "sc", with Cyrillic 'c' */
60 /* (mixed script, confusable */
62 String scLatin = "sc"; /* "sc", plain ascii. */
63 String goodCyrl = "\u0438\u043B"; // "Cyrillic small letter i and el" Plain lower case Cyrillic letters, no latin confusables
64 String goodGreek = "\u03c0\u03c6"; // "Greek small letter pi and phi" Plain lower case Greek letters
66 // Various 1 l I look-alikes
67 String lll_Latin_a = "lI1"; // small letter l, cap I, digit 1, all ASCII
68 // "\uFF29\u217C\u0196" Full-width I, Small Roman Numeral fifty, Latin Cap Letter IOTA
69 String lll_Latin_b = "\uff29\u217c\u0196";
70 String lll_Cyrl = "\u0406\u04C0\u0031"; // "\u0406\u04C01"
71 /* The skeleton transform for all of the 'lll' lookalikes is ascii lower case letter l. */
72 String lll_Skel = "lll";
74 String han_Hiragana = "\u3086\u308A \u77F3\u7530"; // Hiragana, space, Han
78 * Test basic constructor.
80 public void TestUSpoof() {
81 SpoofChecker sc = new SpoofChecker.Builder().build();
83 errln("FAIL: null SpoofChecker");
88 * Test build from source rules.
// Builds a SpoofChecker from the two confusables data files, each opened as UTF-8 through
// TestUtil.getDataReader and passed to Builder.setData. A skip message is logged when running
// on IBM Java 5.
// NOTE(review): this method is incomplete in this view. The declarations of fileName and
// confusables, the statement guarded by the null-checker failure message, the bodies of both
// catch clauses and the closing braces are not present here - recover them from version
// control before changing this method.
// NOTE(review): no close() call on either reader is visible - confirm whether they are released.
90 public void TestOpenFromSourceRules() {
91 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) {
92 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents
93 logln("Skip this test case because of the IBM Java 5 bug");
98 Reader confusablesWholeScript;
101 fileName = "unicode/confusables.txt";
102 confusables = TestUtil.getDataReader(fileName, "UTF-8");
103 fileName = "unicode/confusablesWholeScript.txt";
104 confusablesWholeScript = TestUtil.getDataReader(fileName, "UTF-8");
106 SpoofChecker rsc = new SpoofChecker.Builder().setData(confusables, confusablesWholeScript).build();
108 errln("FAIL: null SpoofChecker");
111 // Check that newly built-from-rules SpoofChecker is able to function.
113 } catch (java.io.IOException e) {
115 } catch (ParseException e) {
121 * Set & Get Check Flags
123 public void TestGetSetChecks1() {
124 SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS).build();
127 TEST_ASSERT_EQ(t, SpoofChecker.ALL_CHECKS);
129 sc = new SpoofChecker.Builder().setChecks(0).build();
131 TEST_ASSERT_EQ(0, t);
133 int checks = SpoofChecker.WHOLE_SCRIPT_CONFUSABLE | SpoofChecker.MIXED_SCRIPT_CONFUSABLE
134 | SpoofChecker.ANY_CASE;
135 sc = new SpoofChecker.Builder().setChecks(checks).build();
137 TEST_ASSERT_EQ(checks, t);
141 * get & setAllowedChars
143 public void TestGetSetAllowedChars() {
144 SpoofChecker sc = new SpoofChecker.Builder().build();
148 uset = sc.getAllowedChars();
149 TEST_ASSERT(uset.isFrozen());
150 us = new UnicodeSet((int) 0x41, (int) 0x5A); /* [A-Z] */
151 sc = new SpoofChecker.Builder().setAllowedChars(us).build();
152 TEST_ASSERT_NE(us, sc.getAllowedChars());
153 TEST_ASSERT(us.equals(sc.getAllowedChars()));
159 public void TestGetSetChecks() {
160 SpoofChecker sc = new SpoofChecker.Builder().build();
163 boolean checkResults;
165 checks = sc.getChecks();
166 TEST_ASSERT_EQ(SpoofChecker.ALL_CHECKS, checks);
168 checks &= ~(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.MIXED_SCRIPT_CONFUSABLE);
169 sc = new SpoofChecker.Builder().setChecks(checks).build();
170 checks2 = sc.getChecks();
171 TEST_ASSERT_EQ(checks, checks2);
174 * The checks that were disabled just above are the same ones that the "scMixed" test fails. So with those tests
175 * gone checking that Identifier should now succeed
177 checkResults = sc.failsChecks(scMixed);
178 TEST_ASSERT(false == checkResults);
184 public void TestAllowedLocales() {
185 SpoofChecker sc = new SpoofChecker.Builder().build();
186 Set<ULocale> allowedLocales = new LinkedHashSet<ULocale>();
187 boolean checkResults;
189 /* Default allowed locales list should be empty */
190 allowedLocales = sc.getAllowedLocales();
191 TEST_ASSERT(allowedLocales.isEmpty());
193 /* Allow en and ru, which should enable Latin and Cyrillic only to pass */
194 ULocale enloc = new ULocale("en");
195 ULocale ruloc = new ULocale("ru_RU");
196 allowedLocales.add(enloc);
197 allowedLocales.add(ruloc);
198 sc = new SpoofChecker.Builder().setAllowedLocales(allowedLocales).build();
199 allowedLocales = sc.getAllowedLocales();
200 TEST_ASSERT(allowedLocales.contains(enloc));
201 TEST_ASSERT(allowedLocales.contains(ruloc));
204 * Limit checks to SpoofChecker.CHAR_LIMIT. Some of the test data has whole script confusables also, which we
205 * don't want to see in this test.
207 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
209 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
210 checkResults = sc.failsChecks(goodLatin);
211 TEST_ASSERT(false == checkResults);
213 checkResults = sc.failsChecks(goodGreek, result);
214 TEST_ASSERT_EQ(SpoofChecker.CHAR_LIMIT, result.checks);
216 checkResults = sc.failsChecks(goodCyrl);
217 TEST_ASSERT(false == checkResults);
219 /* Reset with an empty locale list, which should allow all characters to pass */
220 allowedLocales = new LinkedHashSet<ULocale>();
221 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
223 checkResults = sc.failsChecks(goodGreek);
224 TEST_ASSERT(false == checkResults);
228 * AllowedChars set/get the UnicodeSet of allowed characters.
230 public void TestAllowedChars() {
231 SpoofChecker sc = new SpoofChecker.Builder().build();
234 boolean checkResults;
236 /* By default, we should see no restriction; the UnicodeSet should allow all characters. */
237 set = sc.getAllowedChars();
238 tmpSet = new UnicodeSet(0, 0x10ffff);
239 TEST_ASSERT(tmpSet.equals(set));
241 /* Setting the allowed chars should enable the check. */
242 sc = new SpoofChecker.Builder().setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CHAR_LIMIT).build();
244 /* Remove a character that is in our good Latin test identifier from the allowed chars set. */
245 tmpSet.remove(goodLatin.charAt(1));
246 sc = new SpoofChecker.Builder().setAllowedChars(tmpSet).build();
248 /* Latin Identifier should now fail; other non-latin test cases should still be OK */
249 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
250 checkResults = sc.failsChecks(goodLatin, result);
251 TEST_ASSERT(checkResults);
252 TEST_ASSERT_EQ(SpoofChecker.CHAR_LIMIT, result.checks);
254 checkResults = sc.failsChecks(goodGreek, result);
255 TEST_ASSERT(checkResults);
256 TEST_ASSERT_EQ(SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, result.checks);
259 public void TestCheck() {
260 SpoofChecker sc = new SpoofChecker.Builder().build();
261 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
262 boolean checkResults;
264 result.position = 666;
265 checkResults = sc.failsChecks(goodLatin, result);
266 TEST_ASSERT(false == checkResults);
267 TEST_ASSERT_EQ(666, result.position);
269 checkResults = sc.failsChecks(goodCyrl, result);
270 TEST_ASSERT(false == checkResults);
272 result.position = 666;
273 checkResults = sc.failsChecks(scMixed, result);
274 TEST_ASSERT(true == checkResults);
275 TEST_ASSERT_EQ(SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
276 TEST_ASSERT_EQ(2, result.position);
278 result.position = 666;
279 checkResults = sc.failsChecks(han_Hiragana, result);
280 TEST_ASSERT(false == checkResults);
281 TEST_ASSERT_EQ(666, result.position);
282 TEST_ASSERT_EQ(0, result.checks);
285 public void TestAreConfusable1() {
286 SpoofChecker sc = new SpoofChecker.Builder().build();
288 checkResults = sc.areConfusable(scLatin, scMixed);
289 TEST_ASSERT_EQ(SpoofChecker.MIXED_SCRIPT_CONFUSABLE, checkResults);
291 checkResults = sc.areConfusable(goodGreek, scLatin);
292 TEST_ASSERT_EQ(0, checkResults);
294 checkResults = sc.areConfusable(lll_Latin_a, lll_Latin_b);
295 TEST_ASSERT_EQ(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, checkResults);
298 public void TestGetSkeleton() {
299 SpoofChecker sc = new SpoofChecker.Builder().build();
301 dest = sc.getSkeleton(SpoofChecker.ANY_CASE, lll_Latin_a);
302 TEST_ASSERT(lll_Skel.equals(dest));
303 TEST_ASSERT_EQ(lll_Skel.length(), dest.length());
304 TEST_ASSERT_EQ(3, dest.length());
308 * IntlTestSpoof is the top level test class for the Unicode Spoof detection tests
311 // Test the USpoofDetector API functions that require C++
312 // The pure C part of the API, which is most of it, is tested in cintltst
314 * IntlTestSpoof tests for USpoofDetector
// Smoke test of three SpoofChecker calls on default-built checkers:
//   1. failsChecks on "xyz" must pass and leave result.position at its preset value;
//   2. areConfusable(s1, s2), with s2 the Cyrillic look-alike string, must report both the
//      mixed-script and the whole-script confusable flags;
//   3. getSkeleton with ANY_CASE must produce "lllOO".
// NOTE(review): the declaration of s1 is not present in this view, and no reassignment of s
// before the getSkeleton call is visible either (presumably on a missing line - confirm).
// Recover both lines from version control before changing this method.
316 public void TestSpoofAPI() {
317 SpoofChecker sc = new SpoofChecker.Builder().build();
318 String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts.
319 // If this test starts failing, consult confusablesWholeScript.txt
320 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
321 result.position = 666;
322 boolean checkResults = sc.failsChecks(s, result);
323 TEST_ASSERT(false == checkResults);
324 TEST_ASSERT_EQ(666, result.position); // not changed
326 sc = new SpoofChecker.Builder().build();
328 String s2 = Utility.unescape("\\u0441\\u0445\\u0455"); // Cyrillic "cxs"
329 int checkResult = sc.areConfusable(s1, s2);
330 TEST_ASSERT_EQ(SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.WHOLE_SCRIPT_CONFUSABLE, checkResult);
332 sc = new SpoofChecker.Builder().build();
334 String dest = sc.getSkeleton(SpoofChecker.ANY_CASE, s);
335 TEST_ASSERT(dest.equals("lllOO"));
338 public void TestSkeleton() {
339 SpoofChecker sc = new SpoofChecker.Builder().build();
343 // testSkeleton. Spot check a number of confusable skeleton substitutions from the
344 // Unicode data file confusables.txt
345 // Test cases chosen for substitutions of various lengths, and
346 // membership in different mapping tables.
// Feeds a fixed list of (skeleton type, input, expected skeleton) cases to the four-argument
// checkSkeleton overload, using the supplied checker for every case. Inputs and expectations
// are written with doubled backslashes because that overload runs them through
// Utility.unescape. SL, MA and SA are local shorthands for the SpoofChecker flag
// combinations assigned below.
// NOTE(review): this method is incomplete in this view. ML is used but its declaration is not
// present, and the opening of the first call (the one taking the two long concatenated
// strings) is missing, so its skeleton-type argument cannot be seen. Recover those lines from
// version control before editing the test data.
347 public void checkSkeleton(SpoofChecker sc) {
349 int SL = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;
350 int MA = SpoofChecker.ANY_CASE;
351 int SA = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE;
353 // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
354 // (in the C implementation)
358 " A 1ong \\u02b9identifier' that will overflow implementation stack buffers, forcing heap allocations."
359 + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
360 + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
361 + " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
362 " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
363 + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
364 + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
365 + " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.");
367 checkSkeleton(sc, SL, "nochange", "nochange");
368 checkSkeleton(sc, MA, "love", "love");
369 checkSkeleton(sc, MA, "1ove", "love"); // Digit 1 to letter l
370 checkSkeleton(sc, ML, "OOPS", "OOPS");
371 checkSkeleton(sc, ML, "00PS", "00PS"); // Digit 0 unchanged in lower case mode.
372 checkSkeleton(sc, MA, "OOPS", "OOPS");
373 checkSkeleton(sc, MA, "00PS", "OOPS"); // Digit 0 to letter O in any case mode only
374 checkSkeleton(sc, SL, "\\u059c", "\\u0301");
375 checkSkeleton(sc, SL, "\\u2A74", "\\u003A\\u003A\\u003D");
376 checkSkeleton(sc, SL, "\\u247E", "\\u0028\\u006c\\u006c\\u0029"); // "(ll)"
377 checkSkeleton(sc, SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
379 // This mapping exists in the ML and MA tables, does not exist in SL, SA
380 // 0C83 ; 0983 ; ML # KANNADA SIGN VISARGA to
381 checkSkeleton(sc, SL, "\\u0C83", "\\u0C83");
382 checkSkeleton(sc, SA, "\\u0C83", "\\u0C83");
383 checkSkeleton(sc, ML, "\\u0C83", "\\u0983");
384 checkSkeleton(sc, MA, "\\u0C83", "\\u0983");
386 // 0391 ; 0041 ; MA # GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A
387 // This mapping exists only in the MA table.
388 checkSkeleton(sc, MA, "\\u0391", "A");
389 checkSkeleton(sc, SA, "\\u0391", "\\u0391");
390 checkSkeleton(sc, ML, "\\u0391", "\\u0391");
391 checkSkeleton(sc, SL, "\\u0391", "\\u0391");
393 // 13CF ; 0062 ; MA # CHEROKEE LETTER SI to LATIN SMALL LETTER B
394 // This mapping exists in the ML and MA tables
395 checkSkeleton(sc, ML, "\\u13CF", "b");
396 checkSkeleton(sc, MA, "\\u13CF", "b");
397 checkSkeleton(sc, SL, "\\u13CF", "\\u13CF");
398 checkSkeleton(sc, SA, "\\u13CF", "\\u13CF");
400 // 0022 ; 0027 0027 ;
402 checkSkeleton(sc, SL, "\"", "\\u0027\\u0027");
403 checkSkeleton(sc, SA, "\"", "\\u0027\\u0027");
404 checkSkeleton(sc, ML, "\"", "\\u0027\\u0027");
405 checkSkeleton(sc, MA, "\"", "\\u0027\\u0027");
408 // Internal function to run a single skeleton test case.
410 // Run a single confusable skeleton transformation test case.
412 void checkSkeleton(SpoofChecker sc, int type, String input, String expected) {
413 String uInput = Utility.unescape(input);
414 String uExpected = Utility.unescape(expected);
416 actual = sc.getSkeleton(type, uInput);
417 if (!uExpected.equals(actual)) {
418 errln("Actual and Expected skeletons differ.");
419 errln((" Actual Skeleton: \"") + actual + ("\"\n") + (" Expected Skeleton: \"") + uExpected + ("\""));
423 public void TestAreConfusable() {
424 SpoofChecker sc = new SpoofChecker.Builder().build();
425 String s1 = "A long string that will overflow stack buffers. A long string that will overflow stack buffers. "
426 + "A long string that will overflow stack buffers. A long string that will overflow stack buffers. ";
427 String s2 = "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. "
428 + "A long string that wi11 overflow stack buffers. A long string that will overflow stack buffers. ";
429 TEST_ASSERT_EQ(SpoofChecker.SINGLE_SCRIPT_CONFUSABLE, sc.areConfusable(s1, s2));
432 public void TestInvisible() {
433 SpoofChecker sc = new SpoofChecker.Builder().build();
434 String s = Utility.unescape("abcd\\u0301ef");
435 SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
436 result.position = -42;
437 TEST_ASSERT(false == sc.failsChecks(s, result));
438 TEST_ASSERT_EQ(0, result.checks);
439 TEST_ASSERT(result.position == -42); // unchanged
441 String s2 = Utility.unescape("abcd\\u0301\\u0302\\u0301ef");
442 TEST_ASSERT(true == sc.failsChecks(s2, result));
443 TEST_ASSERT_EQ(SpoofChecker.INVISIBLE, result.checks);
444 TEST_ASSERT_EQ(7, result.position);
446 // Two acute accents, one from the composed a with acute accent, \u00e1,
448 result.position = -42;
449 String s3 = Utility.unescape("abcd\\u00e1\\u0301xyz");
450 TEST_ASSERT(true == sc.failsChecks(s3, result));
451 TEST_ASSERT_EQ(SpoofChecker.INVISIBLE, result.checks);
452 TEST_ASSERT_EQ(7, result.position);
455 private String parseHex(String in) {
456 StringBuilder sb = new StringBuilder();
457 for (String oneCharAsHexString : in.split("\\s+")) {
458 if (oneCharAsHexString.length() > 0) {
459 sb.appendCodePoint(Integer.parseInt(oneCharAsHexString, 16));
462 return sb.toString();
// Builds a printable rendering of a string. In the visible branches a code point is either
// appended as a plain char, written as backslash + 'u' + four hex digits when it is at most
// 0xffff, or written as backslash + 'U' + six hex digits otherwise.
// NOTE(review): this method is incomplete in this view - the condition selecting the
// plain-char branch, the start of the final else branch and the closing braces are missing.
// NOTE(review): the loop reads with codePointAt(i) while the visible for-header advances i by
// one; confirm that the supplementary branch also steps past the trailing surrogate (that
// part of the branch is not visible here).
465 private String escapeString(String in) {
466 StringBuilder out = new StringBuilder();
467 for (int i = 0; i < in.length(); i++) {
468 int c = in.codePointAt(i);
470 out.append((char) c);
471 } else if (c <= 0xffff) {
472 out.append(String.format("\\u%04x", c));
474 out.append(String.format("\\U%06x", c));
478 return out.toString();
481 // Verify that each item from the Unicode confusables.txt file
482 // transforms into the expected skeleton.
483 public void testConfData() {
484 if (TestUtil.getJavaVendor() == JavaVendor.IBM && TestUtil.getJavaVersion() == 5) {
485 // Note: IBM Java 5 has a bug reading a large UTF-8 text contents
486 logln("Skip this test case because of the IBM Java 5 bug");
490 // Read in the confusables.txt file. (Distributed by Unicode.org)
491 String fileName = "unicode/confusables.txt";
492 BufferedReader confusablesRdr = TestUtil.getDataReader(fileName, "UTF-8");
494 // Create a default spoof checker to use in this test.
495 SpoofChecker sc = new SpoofChecker.Builder().build();
497 // Parse lines from the confusables.txt file. Example Line:
498 // FF44 ; 0064 ; SL # ( d -> d ) FULLWIDTH ....
499 // Lines have three fields. The hex fields can contain more than one character,
500 // and each character may be more than 4 digits (for supplementals)
501 // This regular expression matches lines and splits the fields into capture groups.
502 // Capture group 1: map from chars
504 // 3: table type, SL, ML, SA or MA
505 // 4: Comment Lines Only
506 // 5: Error Lines Only
507 Matcher parseLine = Pattern.compile(
508 "\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
509 + "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
511 Normalizer2 normalizer = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
514 while ((inputLine = confusablesRdr.readLine()) != null) {
516 parseLine.reset(inputLine);
517 if (!parseLine.matches()) {
518 errln("Syntax error in confusable data file at line " + lineNum);
522 if (parseLine.group(4) != null) {
523 continue; // comment line
525 String from = parseHex(parseLine.group(1));
527 if (!normalizer.isNormalized(from)) {
528 // The source character was not NFD.
529 // Skip this case; the first step in obtaining a skeleton is to NFD the input,
530 // so the mapping in this line of confusables.txt will never be applied.
534 String rawExpected = parseHex(parseLine.group(2));
535 String expected = normalizer.normalize(rawExpected);
537 int skeletonType = 0;
538 String tableType = parseLine.group(3);
539 if (tableType.equals("SL")) {
540 skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE;
541 } else if (tableType.indexOf("SA") >= 0) {
542 skeletonType = SpoofChecker.SINGLE_SCRIPT_CONFUSABLE | SpoofChecker.ANY_CASE;
543 } else if (tableType.indexOf("ML") >= 0) {
545 } else if (tableType.indexOf("MA") >= 0) {
546 skeletonType = SpoofChecker.ANY_CASE;
550 actual = sc.getSkeleton(skeletonType, from);
552 if (!actual.equals(expected)) {
553 errln("confusables.txt: " + lineNum + ": " + parseLine.group(0));
554 errln("Actual: " + escapeString(actual));
557 confusablesRdr.close();
558 } catch (IOException e) {