2 ***************************************************************************
3 * Copyright (C) 2008-2013 International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
7 * Unicode Spoof Detection
9 package com.ibm.icu.text;
11 import java.io.BufferedInputStream;
12 import java.io.DataInputStream;
13 import java.io.DataOutputStream;
14 import java.io.IOException;
15 import java.io.InputStream;
16 import java.io.LineNumberReader;
17 import java.io.Reader;
18 import java.text.ParseException;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.Comparator;
23 import java.util.HashSet;
24 import java.util.Hashtable;
25 import java.util.LinkedHashSet;
27 import java.util.Vector;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
31 import com.ibm.icu.impl.Trie2;
32 import com.ibm.icu.impl.Trie2Writable;
33 import com.ibm.icu.lang.UCharacter;
34 import com.ibm.icu.lang.UCharacterCategory;
35 import com.ibm.icu.lang.UProperty;
36 import com.ibm.icu.lang.UScript;
37 import com.ibm.icu.util.ULocale;
41 * <b>Unicode Security and Spoofing Detection.</b>
43 * <p>This class is intended to check strings, typically
44 * identifiers of some type, such as URLs, for the presence of
45 * characters that are likely to be visually confusing -
46 * for cases where the displayed form of an identifier may
47 * not be what it appears to be.
49 * <p>Unicode Technical Report #36,
50 * <a href="http://unicode.org/reports/tr36">http://unicode.org/reports/tr36</a> and
51 * Unicode Technical Standard #39,
52 * <a href="http://unicode.org/reports/tr39">http://unicode.org/reports/tr39</a>
53 * "Unicode security considerations", give more background on
54 * security and spoofing issues with Unicode identifiers.
55 * The tests and checks provided by this module implement the recommendations
56 * from these Unicode documents.
58 * <p>The tests available on identifiers fall into two general categories:
60 * <li> Single identifier tests. Check whether an identifier is
61 * potentially confusable with any other string, or is suspicious
62 * for other reasons. </li>
63 * <li> Two identifier tests. Check whether two specific identifiers are confusable.
64 * This does not consider whether either of strings is potentially
65 * confusable with any string other than the exact one specified. </li>
68 * <p>The steps to perform confusability testing are
70 * <li> Create a <code>SpoofChecker.Builder</code> </li>
71 * <li> Configure the Builder for the desired set of tests. The tests that will
72 * be performed are specified by a set of SpoofCheck flags. </li>
73 * <li> Build a <code>SpoofChecker</code> from the Builder. </li>
74 * <li> Perform the checks using the pre-configured <code>SpoofChecker</code>. The results indicate
75 * which (if any) of the selected tests have identified possible problems with the identifier.
76 * Results are reported as a set of SpoofCheck flags; this mirrors the form in which
77 * the set of tests to perform was originally specified to the SpoofChecker. </li>
80 * <p>A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number
83 * <p>Thread Safety: The methods on SpoofChecker objects are thread safe.
84 * The test functions for checking a single identifier, or for testing
85 * whether two identifiers are potentially confusable, may be called concurrently
86 * from multiple threads using the same SpoofChecker instance.
89 * <p>Descriptions of the available checks.
91 * <p>When testing whether pairs of identifiers are confusable, with <code>areConfusable()</code>
92 * the relevant tests are
95 * <li> <code>SINGLE_SCRIPT_CONFUSABLE</code>: All of the characters from the two identifiers are
96 * from a single script, and the two identifiers are visually confusable.</li>
97 * <li> <code>MIXED_SCRIPT_CONFUSABLE</code>: At least one of the identifiers contains characters
98 * from more than one script, and the two identifiers are visually confusable.</li>
99 * <li> <code>WHOLE_SCRIPT_CONFUSABLE</code>: Each of the two identifiers is of a single script, but
100 * the two identifiers are from different scripts, and they are visually confusable.</li>
103 * <p>The safest approach is to enable all three of these checks as a group.
105 * <p><code>ANY_CASE</code> is a modifier for the above tests. If the identifiers being checked can
106 * be of mixed case and are used in a case-sensitive manner, this option should be specified.
108 * <p>If the identifiers being checked are used in a case-insensitive manner, and if they are
109 * displayed to users in lower-case form only, the <code>ANY_CASE</code> option should not be
110 * specified. Confusability issues involving upper case letters will not be reported.
112 * <p>When performing tests on a single identifier, with the check() family of functions,
113 * the relevant tests are:
116 * <li><code>MIXED_SCRIPT_CONFUSABLE</code>: the identifier contains characters from multiple
117 * scripts, and there exists an identifier of a single script that is visually confusable.</li>
118 * <li><code>WHOLE_SCRIPT_CONFUSABLE</code>: the identifier consists of characters from a single
119 * script, and there exists a visually confusable identifier.
120 * The visually confusable identifier also consists of characters from a single script,
121 * but not the same script as the identifier being checked.</li>
122 * <li><code>ANY_CASE</code>: modifies the mixed script and whole script confusables tests. If
123 * specified, the checks will find confusable characters of any case.
124 * If this flag is not set, the test is performed assuming case folded identifiers.</li>
125 * <li><code>SINGLE_SCRIPT</code>: check that the identifier contains only characters from a
126 * single script. (Characters from the <em>common</em> and <em>inherited</em> scripts are ignored.)
127 * This is not a test for confusable identifiers</li>
128 * <li><code>INVISIBLE</code>: check an identifier for the presence of invisible characters,
129 * such as zero-width spaces, or character sequences that are
130 * likely not to display, such as multiple occurrences of the same
131 * non-spacing mark. This check does not test the input string as a whole
132 * for conformance to any particular syntax for identifiers.</li>
133 * <li><code>CHAR_LIMIT</code>: check that an identifier contains only characters from a specified set
134 * of acceptable characters. See <code>Builder.setAllowedChars()</code> and
135 * <code>Builder.setAllowedLocales()</code>.</li>
138 * <p>Note on Scripts:
139 * <blockquote>Characters from the Unicode Scripts "Common" and "Inherited" are ignored when considering
140 * the script of an identifier. Common characters include digits and symbols that
141 * are normally used with text from many different scripts. </blockquote>
145 public class SpoofChecker {
148 * Constants from UAX 31 for use in setRestrictionLevel.
151 public enum RestrictionLevel {
153 * Only ASCII characters: U+0000..U+007F
159 * All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
160 * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
161 * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
167 * Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Otherwise, the same as Highly Restrictive
171 MODERATELY_RESTRICTIVE,
173 * Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as
174 * Moderately Restrictive
178 MINIMALLY_RESTRICTIVE,
180 * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
189 * Security Profile constant from UAX 31 for use in setAllowedChars.
190 * Will probably be replaced by UnicodeSet property.
193 public static final UnicodeSet INCLUSION = new UnicodeSet("["
194 + "\\-.\\u00B7\\u05F3\\u05F4\\u0F0B\\u200C\\u200D\\u2019]");
197 * Security Profile constant from UAX 31 for use in setAllowedChars.
198 * Will probably be replaced by UnicodeSet property.
201 public static final UnicodeSet RECOMMENDED = new UnicodeSet("["
202 + "[0-z\\u00C0-\\u017E\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-"
203 + "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F5\\u01F8-\\u021B\\u021E"
204 + "\\u021F\\u0226-\\u0233\\u02BB\\u02BC\\u02EC\\u0300-\\u0304"
205 + "\\u0306-\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-"
206 + "\\u0328\\u032D\\u032E\\u0330\\u0331\\u0335\\u0338\\u0339"
207 + "\\u0342-\\u0345\\u037B-\\u03CE\\u03FC-\\u045F\\u048A-\\u0525"
208 + "\\u0531-\\u0586\\u05D0-\\u05F2\\u0621-\\u063F\\u0641-\\u0655"
209 + "\\u0660-\\u0669\\u0670-\\u068D\\u068F-\\u06D5\\u06E5\\u06E6"
210 + "\\u06EE-\\u06FF\\u0750-\\u07B1\\u0901-\\u0939\\u093C-\\u094D"
211 + "\\u0950\\u0960-\\u0972\\u0979-\\u0A4D\\u0A5C-\\u0A74\\u0A81-"
212 + "\\u0B43\\u0B47-\\u0B61\\u0B66-\\u0C56\\u0C60\\u0C61\\u0C66-"
213 + "\\u0CD6\\u0CE0-\\u0CEF\\u0D02-\\u0D28\\u0D2A-\\u0D39\\u0D3D-"
214 + "\\u0D43\\u0D46-\\u0D4D\\u0D57-\\u0D61\\u0D66-\\u0D8E\\u0D91-"
215 + "\\u0DA5\\u0DA7-\\u0DDE\\u0DF2\\u0E01-\\u0ED9\\u0F00\\u0F20-"
216 + "\\u0F8B\\u0F90-\\u109D\\u10D0-\\u10F0\\u10F7-\\u10FA\\u1200-"
217 + "\\u135A\\u135F\\u1380-\\u138F\\u1401-\\u167F\\u1780-\\u17A2"
218 + "\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7-"
219 + "\\u17DC\\u17E0-\\u17E9\\u1810-\\u18A8\\u18AA-\\u18F5\\u1E00-"
220 + "\\u1E99\\u1F00-\\u1FFC\\u2D30-\\u2D65\\u2D80-\\u2DDE\\u3005-"
221 + "\\u3007\\u3041-\\u31B7\\u3400-\\u9FCB\\uA000-\\uA48C\\uA67F"
222 + "\\uA717-\\uA71F\\uA788\\uAA60-\\uAA7B\\uAC00-\\uD7A3\\uFA0E-"
223 + "\\uFA29\\U00020000-"
224 + "\\U0002B734]-[[:Cn:][:nfkcqc=n:][:XIDC=n:]]]");
228 * Constants for the kinds of checks that USpoofChecker can perform. These values are used both to select the set of
229 * checks that will be performed, and to report results from the check function.
234 * Single script confusable test. When testing whether two identifiers are confusable, report that they are if both
235 * are from the same script and they are visually confusable. Note: this test is not applicable to a check of a
240 public static final int SINGLE_SCRIPT_CONFUSABLE = 1;
243 * Mixed script confusable test.
245 * When checking a single identifier, report a problem if the identifier contains multiple scripts, and is also
246 * confusable with some other identifier in a single script.
248 * When testing whether two identifiers are confusable, report that they are if the two IDs are visually confusable,
249 * and at least one contains characters from more than one script.
253 public static final int MIXED_SCRIPT_CONFUSABLE = 2;
256 * Whole script confusable test.
258 * When checking a single identifier, report a problem if the identifier is of a single script, and there exists a
259 * confusable identifier in another script.
261 * When testing whether two Identifiers are confusable, report that they are if each is of a single script, the
262 * scripts of the two identifiers are different, and the identifiers are visually confusable.
266 public static final int WHOLE_SCRIPT_CONFUSABLE = 4;
269 * Any Case Modifier for confusable identifier tests.
271 * When specified, consider all characters, of any case, when looking for confusables. If ANY_CASE is not specified,
272 * identifiers being checked are assumed to have been case folded, and upper case confusable characters will not be
277 public static final int ANY_CASE = 8;
280 * Check that an identifier is no looser than the specified RestrictionLevel.
281 * The default if this is not called is HIGHLY_RESTRICTIVE.
285 public static final int RESTRICTION_LEVEL = 16;
288 * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
289 * scripts.) Applies to checks of a single identifier only.
292 * @deprecated Use RESTRICTION_LEVEL
294 public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;
297 * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
298 * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
299 * test the input string as a whole for conformance to any particular syntax for identifiers.
303 public static final int INVISIBLE = 32;
306 * Check that an identifier contains only characters from a specified set of acceptable characters. See
307 * Builder.setAllowedChars() and Builder.setAllowedLocales().
311 public static final int CHAR_LIMIT = 64;
314 * Check that an identifier does not mix numbers.
318 public static final int MIXED_NUMBERS = 128;
321 * Enable all spoof checks.
325 public static final int ALL_CHECKS = 0xFFFFFFFF;
328 // Magic number for sanity checking spoof binary resource data.
329 static final int MAGIC = 0x3845fdef;
332 * private constructor: a SpoofChecker has to be built by the builder
334 private SpoofChecker() {
338 * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
339 * checking options on the builder, then call the build() function to create a SpoofChecker instance.
343 public static class Builder {
344 int fChecks; // Bit vector of checks to perform.
345 SpoofData fSpoofData;
346 final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters.
347 // for this Spoof Checker. Defaults to all chars.
348 final Set<ULocale> fAllowedLocales = new LinkedHashSet<ULocale>(); // The list of allowed locales.
349 private RestrictionLevel fRestrictionLevel;
352 * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
353 * LOCALE_LIMIT and CHAR_LIMIT. Note that additional checks may be added in the future, resulting in changes
354 * to the default checking behavior.
359 fChecks = ALL_CHECKS;
361 fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
365 * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
368 * The existing checker.
371 public Builder(SpoofChecker src) {
372 fChecks = src.fChecks;
373 fSpoofData = src.fSpoofData; // For the data, we will either use the source data
374 // as-is, or drop the builder's reference to it
375 // and generate new data, depending on what our
376 // caller does with the builder.
377 fAllowedCharsSet.set(src.fAllowedCharsSet);
378 fAllowedLocales.addAll(src.fAllowedLocales);
379 fRestrictionLevel = src.fRestrictionLevel;
383 * Create a SpoofChecker with current configuration.
385 * @return SpoofChecker
388 public SpoofChecker build() {
389 if (fSpoofData == null) { // read binary file
390 fSpoofData = SpoofData.getDefault();
393 // Copy all state from the builder to the new SpoofChecker.
394 // Make sure that everything is either cloned or copied, so
395 // that subsequent re-use of the builder won't modify the built
398 // One exception to this: the SpoofData is just assigned.
399 // If the builder subsequently needs to modify fSpoofData
400 // it will create a new SpoofData object first.
403 SpoofChecker result = new SpoofChecker();
404 result.fChecks = this.fChecks;
405 result.fSpoofData = this.fSpoofData;
406 result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
407 result.fAllowedCharsSet.freeze();
408 result.fAllowedLocales = new HashSet<ULocale>(this.fAllowedLocales);
409 result.fRestrictionLevel = this.fRestrictionLevel;
414 * Specify the source form of the spoof data for this Spoof Checker. The inputs correspond to the Unicode data
415 * files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source
416 * data is as described in UAX 39 for these files, and the content of these files is acceptable input.
419 * the Reader of confusable characters definitions, as found in file confusables.txt from
421 * @param confusablesWholeScript
422 * the Reader of whole script confusables definitions, as found in the file
423 * confusablesWholeScript.txt from unicode.org.
424 * @throws ParseException
425 * To report syntax errors in the input.
428 public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException,
429 java.io.IOException {
431 // Compile the binary data from the source (text) format.
432 // Drop the builder's reference to any pre-existing data, which may
433 // be in use in an already-built checker.
435 fSpoofData = new SpoofData();
436 ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData);
437 WSConfusableDataBuilder.buildWSConfusableData(confusablesWholeScript, fSpoofData);
442 * Specify the set of checks that will be performed by the check functions of this Spoof Checker.
445 * The set of checks that this spoof checker will perform. The value is an 'or' of the desired
450 public Builder setChecks(int checks) {
451 // Verify that the requested checks are all ones (bits) that
452 // are acceptable, known values.
453 if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) {
454 throw new IllegalArgumentException("Bad Spoof Checks value.");
456 this.fChecks = (checks & SpoofChecker.ALL_CHECKS);
461 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
462 * associated with the specified locales. Any previously specified list of locales is replaced by the new
465 * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
466 * determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
467 * Unicode Script categories will be permitted.
469 * Supplying an empty string removes all restrictions; characters from any script will be allowed.
471 * The CHAR_LIMIT test is automatically enabled for this SpoofChecker when calling this function with a
472 * non-empty list of locales.
474 * The Unicode Set of characters that will be allowed is accessible via the getAllowedChars() function.
475 * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters.
477 * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
478 * setAllowedLocales() by fetching the resulting set with getAllowedChars(), manipulating it with the Unicode
479 * Set API, then resetting the spoof detectors limits with setAllowedChars()
482 * A Set of ULocales, from which the language and associated script are extracted. If the locales Set
483 * is null, no restrictions will be placed on the allowed characters.
488 public Builder setAllowedLocales(Set<ULocale> locales) {
489 fAllowedCharsSet.clear();
491 for (ULocale locale : locales) {
492 // Add the script chars for this locale to the accumulating set
494 addScriptChars(locale, fAllowedCharsSet);
497 // If our caller provided an empty list of locales, we disable the
498 // allowed characters checking
499 fAllowedLocales.clear();
500 if (locales.size() == 0) {
501 fAllowedCharsSet.add(0, 0x10ffff);
502 fChecks &= ~CHAR_LIMIT;
506 // Add all common and inherited characters to the set of allowed
508 UnicodeSet tempSet = new UnicodeSet();
509 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
510 fAllowedCharsSet.addAll(tempSet);
511 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
512 fAllowedCharsSet.addAll(tempSet);
514 // Store the updated spoof checker state.
515 fAllowedLocales.clear();
516 fAllowedLocales.addAll(locales);
517 fChecks |= CHAR_LIMIT;
521 // Add (union) to the UnicodeSet all of the characters for the scripts
522 // used for the specified locale. Part of the implementation of
523 // setAllowedLocales.
524 private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
525 int scripts[] = UScript.getCode(locale);
526 UnicodeSet tmpSet = new UnicodeSet();
528 for (i = 0; i < scripts.length; i++) {
529 tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]);
530 allowedChars.addAll(tmpSet);
535 * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit
536 * is replaced by the new settings. This includes limits on characters that were set with the
537 * setAllowedLocales() function. Note that the RECOMMENDED set is useful;
539 * The CHAR_LIMIT test is automatically enabled for this SpoofChecker by this function.
542 * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by
543 * this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling
544 * this function. Note that this clears the allowedLocales set.
548 public Builder setAllowedChars(UnicodeSet chars) {
549 fAllowedCharsSet.set(chars);
550 fAllowedLocales.clear();
551 fChecks |= CHAR_LIMIT;
557 * Set the loosest restriction level allowed. The default if this is not called is HIGHLY_RESTRICTIVE.
558 * This method also sets RESTRICTION_LEVEL.
559 * @param restrictionLevel The loosest restriction level allowed.
563 public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
564 fRestrictionLevel = restrictionLevel;
565 fChecks |= RESTRICTION_LEVEL;
569 // Structure for the Whole Script Confusable Data
570 // See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
571 // Whole Script confusable data
573 // The data provides mappings from code points to a set of scripts
574 // that contain characters that might be confused with the code point.
575 // There are two mappings, one for lower case only, and one for characters
578 // The actual data consists of a utrie2 to map from a code point to an offset,
579 // and an array of UScriptSets (essentially bit maps) that is indexed
580 // by the offsets obtained from the Trie.
585 * Internal functions for compiling Whole Script confusable source data into its binary (runtime) form. The
586 * binary data format is described in uspoof_impl.h
588 private static class WSConfusableDataBuilder {
590 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
592 // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
593 // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
595 // | | | |---- Which table, Any Case or Lower Case (A or L)
596 // | | |----------Target script. We need this.
597 // | |----------------Src script. Should match the script of the source
598 // | code points. Beyond checking that, we don't keep it.
599 // |--------------------------------Source code points or range.
601 // The expression will match _all_ lines, including erroneous lines.
602 // The result of the parse is returned via the contents of the (match) groups.
603 static String parseExp =
604 "(?m)" + // Multi-line mode
605 "^([ \\t]*(?:#.*?)?)$" + // A blank or comment line. Matches Group 1.
607 "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" + // Code point range. Groups 2 and 3.
608 "\\s*([A-Za-z]+)\\s*;" + // The source script. Group 4.
609 "\\s*([A-Za-z]+)\\s*;" + // The target script. Group 5.
610 "\\s*(?:(A)|(L))" + // The table A or L. Group 6 or 7
611 "[ \\t]*(?:#.*?)?" + // Trailing commment
613 "^(.*?)$"; // An error line. Group 8.
614 // Any line not matching the preceding
615 // parts of the expression will match
616 // this, and thus be flagged as an error
619 static void readWholeFileToString(Reader reader, StringBuffer buffer) throws java.io.IOException {
620 // Convert the user input data from UTF-8 to char (UTF-16)
621 LineNumberReader lnr = new LineNumberReader(reader);
623 String line = lnr.readLine();
632 // Build the Whole Script Confusable data
634 static void buildWSConfusableData(Reader confusablesWS, SpoofData dest)
635 throws ParseException, java.io.IOException {
636 Pattern parseRegexp = null;
637 StringBuffer input = new StringBuffer();
640 ArrayList<BuilderScriptSet> scriptSets = null;
641 int rtScriptSetsCount = 2;
643 Trie2Writable anyCaseTrie = new Trie2Writable(0, 0);
644 Trie2Writable lowerCaseTrie = new Trie2Writable(0, 0);
646 // The scriptSets vector provides a mapping from TRIE values to the set
649 // Reserved TRIE values:
650 // 0: Code point has no whole script confusables.
651 // 1: Code point is of script Common or Inherited.
653 // These code points do not participate in whole script confusable detection.
654 // (This is logically equivalent to saying that they contain confusables
657 // Because Trie values are indexes into the ScriptSets vector, pre-fill
658 // vector positions 0 and 1 to avoid conflicts with the reserved values.
660 scriptSets = new ArrayList<BuilderScriptSet>();
661 scriptSets.add(null);
662 scriptSets.add(null);
664 readWholeFileToString(confusablesWS, input);
666 parseRegexp = Pattern.compile(parseExp);
668 // Zap any Byte Order Mark at the start of input. Changing it to a space
670 // given the syntax of the input.
671 if (input.charAt(0) == 0xfeff) {
672 input.setCharAt(0, (char) 0x20);
675 // Parse the input, one line per iteration of this loop.
676 Matcher matcher = parseRegexp.matcher(input);
677 while (matcher.find()) {
679 if (matcher.start(1) >= 0) {
680 // this was a blank or comment line.
683 if (matcher.start(8) >= 0) {
684 // input file syntax error.
685 throw new ParseException("ConfusablesWholeScript, line " + lineNum + ": Unrecognized input: "
686 + matcher.group(), matcher.start());
689 // Pick up the start and optional range end code points from the
691 int startCodePoint = Integer.parseInt(matcher.group(2), 16);
692 if (startCodePoint > 0x10ffff) {
693 throw new ParseException("ConfusablesWholeScript, line " + lineNum
694 + ": out of range code point: " + matcher.group(2), matcher.start(2));
696 int endCodePoint = startCodePoint;
697 if (matcher.start(3) >= 0) {
698 endCodePoint = Integer.parseInt(matcher.group(3), 16);
700 if (endCodePoint > 0x10ffff) {
701 throw new ParseException("ConfusablesWholeScript, line " + lineNum
702 + ": out of range code point: " + matcher.group(3), matcher.start(3));
705 // Extract the two script names from the source line.
706 String srcScriptName = matcher.group(4);
707 String targScriptName = matcher.group(5);
708 int srcScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, srcScriptName);
709 int targScript = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, targScriptName);
710 if (srcScript == UScript.INVALID_CODE) {
711 throw new ParseException("ConfusablesWholeScript, line " + lineNum
712 + ": Invalid script code t: " + matcher.group(4), matcher.start(4));
714 if (targScript == UScript.INVALID_CODE) {
715 throw new ParseException("ConfusablesWholeScript, line " + lineNum
716 + ": Invalid script code t: " + matcher.group(5), matcher.start(5));
719 // select the table - (A) any case or (L) lower case only
720 Trie2Writable table = anyCaseTrie;
721 if (matcher.start(7) >= 0) {
722 table = lowerCaseTrie;
725 // Build the set of scripts containing confusable characters for
726 // the code point(s) specified in this input line.
727 // Sanity check that the script of the source code point is the same
728 // as the source script indicated in the input file. Failure of this
729 // check is an error in the input file.
731 // Include the source script in the set (needed for Mixed Script
732 // Confusable detection).
735 for (cp = startCodePoint; cp <= endCodePoint; cp++) {
736 int setIndex = table.get(cp);
737 BuilderScriptSet bsset = null;
739 assert (setIndex < scriptSets.size());
740 bsset = scriptSets.get(setIndex);
742 bsset = new BuilderScriptSet();
743 bsset.codePoint = cp;
745 bsset.sset = new ScriptSet();
746 setIndex = scriptSets.size();
747 bsset.index = setIndex;
749 scriptSets.add(bsset);
750 table.set(cp, setIndex);
752 bsset.sset.Union(targScript);
753 bsset.sset.Union(srcScript);
755 int cpScript = UScript.getScript(cp);
756 if (cpScript != srcScript) {
757 // status = U_INVALID_FORMAT_ERROR;
758 throw new ParseException("ConfusablesWholeScript, line " + lineNum
759 + ": Mismatch between source script and code point " + Integer.toString(cp, 16),
765 // Eliminate duplicate script sets. At this point we have a separate
766 // script set for every code point that had data in the input file.
768 // We eliminate underlying ScriptSet objects, not the BuildScriptSets
771 // printf("Number of scriptSets: %d\n", scriptSets.size());
772 //int duplicateCount = 0;
773 rtScriptSetsCount = 2;
774 for (int outeri = 2; outeri < scriptSets.size(); outeri++) {
775 BuilderScriptSet outerSet = scriptSets.get(outeri);
776 if (outerSet.index != outeri) {
777 // This set was already identified as a duplicate.
778 // It will not be allocated a position in the runtime array
782 outerSet.rindex = rtScriptSetsCount++;
783 for (int inneri = outeri + 1; inneri < scriptSets.size(); inneri++) {
784 BuilderScriptSet innerSet = scriptSets.get(inneri);
785 if (outerSet.sset.equals(innerSet.sset) && outerSet.sset != innerSet.sset) {
786 innerSet.sset = outerSet.sset;
787 innerSet.index = outeri;
788 innerSet.rindex = outerSet.rindex;
791 // But this doesn't get all. We need to fix the TRIE.
794 // printf("Number of distinct script sets: %d\n",
795 // rtScriptSetsCount);
797 // Update the Trie values to reflect the run time script indexes (after duplicate merging).
798 // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
799 // are unused, which is why the loop index starts at 2.)
800 for (int i = 2; i < scriptSets.size(); i++) {
801 BuilderScriptSet bSet = scriptSets.get(i);
802 if (bSet.rindex != i) {
803 bSet.trie.set(bSet.codePoint, bSet.rindex);
807 // For code points with script==Common or script==Inherited,
808 // Set the reserved value of 1 into both Tries. These characters do not participate
809 // in Whole Script Confusable detection; this reserved value is the means
810 // by which they are detected.
811 UnicodeSet ignoreSet = new UnicodeSet();
812 ignoreSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
813 UnicodeSet inheritedSet = new UnicodeSet();
814 inheritedSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
815 ignoreSet.addAll(inheritedSet);
816 for (int rn = 0; rn < ignoreSet.getRangeCount(); rn++) {
817 int rangeStart = ignoreSet.getRangeStart(rn);
818 int rangeEnd = ignoreSet.getRangeEnd(rn);
819 anyCaseTrie.setRange(rangeStart, rangeEnd, 1, true);
820 lowerCaseTrie.setRange(rangeStart, rangeEnd, 1, true);
823 // Put the compiled data to the destination SpoofData
824 dest.fAnyCaseTrie = anyCaseTrie.toTrie2_16();
825 dest.fLowerCaseTrie = lowerCaseTrie.toTrie2_16();
826 dest.fScriptSets = new ScriptSet[rtScriptSetsCount];
827 dest.fScriptSets[0] = new ScriptSet();
828 dest.fScriptSets[1] = new ScriptSet();
831 for (int i = 2; i < scriptSets.size(); i++) {
832 BuilderScriptSet bSet = scriptSets.get(i);
833 if (bSet.rindex < rindex) {
834 // We have already put this script set to the output data.
837 assert (rindex == bSet.rindex);
838 dest.fScriptSets[rindex] = bSet.sset;
843 // class BuilderScriptSet. Represents the set of scripts (Script Codes)
844 // containing characters that are confusable with one specific
846 static class BuilderScriptSet {
847 int codePoint; // The source code point.
848 Trie2Writable trie; // Any-case or Lower-case Trie.
849 // These Trie tables are the final result of the
850 // build. This flag indicates which of the two
851 // this set of data is for.
853 ScriptSet sset; // The set of scripts itself.
855 int index; // Index of this set in the Build Time vector
858 int rindex; // Index of this set in the final (runtime)
861 // its underlying sset.
875 * *****************************************************************************
876 * Internal classes for compiling confusable data into its binary (runtime) form.
877 * *****************************************************************************
879 // ---------------------------------------------------------------------
881 // buildConfusableData Compile the source confusable data, as defined by
882 // the Unicode data file confusables.txt, into the binary
883 // structures used by the confusable detector.
885 // The binary structures are described in uspoof_impl.h
887 // 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
888 // tables. Each maps from a int to a String.
890 // 2. Sort all of the strings encountered by length, since they will need to
891 // be stored in that order in the final string table.
893 // 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
894 // list because that will be the ordering of our runtime table.
896 // 4. Generate the run time string table. This is generated before the key & value
897 // tables because we need the string indexes when building those tables.
899 // 5. Build the run-time key and value tables. These are parallel tables, and
900 // are built at the same time
902 // class ConfusabledataBuilder
903 // An instance of this class exists while the confusable data is being built from source.
904 // It encapsulates the intermediate data structures that are used for building.
905 // It exports one static function, to do a confusable data build.
906 private static class ConfusabledataBuilder {
908 private Hashtable<Integer, SPUString> fSLTable;
909 private Hashtable<Integer, SPUString> fSATable;
910 private Hashtable<Integer, SPUString> fMLTable;
911 private Hashtable<Integer, SPUString> fMATable;
912 private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the
913 // four mapping tables.
915 // The compiled data is first assembled into the following four collections,
916 // then output to the builder's SpoofData object.
917 private StringBuffer fStringTable;
918 private ArrayList<Integer> fKeyVec;
919 private ArrayList<Integer> fValueVec;
920 private ArrayList<Integer> fStringLengthsTable;
921 private SPUStringPool stringPool;
922 private Pattern fParseLine;
923 private Pattern fParseHexNum;
924 private int fLineNum;
// Create an empty builder. All intermediate collections start out empty;
// the string table and string-lengths table are created later, in build().
ConfusabledataBuilder() {
    // Pool that holds each distinct mapping (value) string exactly once.
    stringPool = new SPUStringPool();

    // Parallel key / value vectors, filled in by addKeyEntry().
    fValueVec = new ArrayList<Integer>();
    fKeyVec = new ArrayList<Integer>();

    // Every key code point that appears in any of the four tables.
    fKeySet = new UnicodeSet();

    // One source-char -> mapping-string table per confusable table type.
    fMATable = new Hashtable<Integer, SPUString>();
    fMLTable = new Hashtable<Integer, SPUString>();
    fSATable = new Hashtable<Integer, SPUString>();
    fSLTable = new Hashtable<Integer, SPUString>();
}
937 void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException {
938 StringBuffer fInput = new StringBuffer();
939 WSConfusableDataBuilder.readWholeFileToString(confusables, fInput);
941 // Regular Expression to parse a line from Confusables.txt. The expression will match
942 // any line. What was matched is determined by examining which capture groups have a match.
943 // Capture Group 1: the source char
944 // Capture Group 2: the replacement chars
945 // Capture Group 3-6 the table type, SL, SA, ML, or MA
946 // Capture Group 7: A blank or comment only line.
947 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
948 // Example Line from the confusables.txt source file:
949 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
950 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char
951 "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s)
952 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued)
953 "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type
954 "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment
955 "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment
956 "|^(.*?)$"); // OR match any line, which catches illegal lines.
958 // Regular expression for parsing a hex number out of a space-separated list of them.
959 // Capture group 1 gets the number, with spaces removed.
960 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)");
962 // Zap any Byte Order Mark at the start of input. Changing it to a space
963 // is benign given the syntax of the input.
964 if (fInput.charAt(0) == 0xfeff) {
965 fInput.setCharAt(0, (char) 0x20);
968 // Parse the input, one line per iteration of this loop.
969 Matcher matcher = fParseLine.matcher(fInput);
970 while (matcher.find()) {
972 if (matcher.start(7) >= 0) {
973 // this was a blank or comment line.
976 if (matcher.start(8) >= 0) {
977 // input file syntax error.
978 // status = U_PARSE_ERROR;
979 throw new ParseException("Confusables, line " + fLineNum + ": Unrecognized Line: "
980 + matcher.group(8), matcher.start(8));
983 // We have a good input line. Extract the key character and mapping
985 // put them into the appropriate mapping table.
986 int keyChar = Integer.parseInt(matcher.group(1), 16);
987 if (keyChar > 0x10ffff) {
988 throw new ParseException("Confusables, line " + fLineNum + ": Bad code point: "
989 + matcher.group(1), matcher.start(1));
991 Matcher m = fParseHexNum.matcher(matcher.group(2));
993 StringBuilder mapString = new StringBuilder();
995 int c = Integer.parseInt(m.group(1), 16);
996 if (keyChar > 0x10ffff) {
997 throw new ParseException("Confusables, line " + fLineNum + ": Bad code point: "
998 + Integer.toString(c, 16), matcher.start(2));
1000 mapString.appendCodePoint(c);
1002 assert (mapString.length() >= 1);
1004 // Put the map (value) string into the string pool
1005 // This a little like a Java intern() - any duplicates will be
1007 SPUString smapString = stringPool.addString(mapString.toString());
1009 // Add the char . string mapping to the appropriate table.
1010 Hashtable<Integer, SPUString> table = matcher.start(3) >= 0 ? fSLTable
1011 : matcher.start(4) >= 0 ? fSATable : matcher.start(5) >= 0 ? fMLTable
1012 : matcher.start(6) >= 0 ? fMATable : null;
1013 assert (table != null);
1014 table.put(keyChar, smapString);
1015 fKeySet.add(keyChar);
1018 // Input data is now all parsed and collected.
1019 // Now create the run-time binary form of the data.
1021 // This is done in two steps. First the data is assembled into vectors and strings,
1022 // for ease of construction, then the contents of these collections are copied
1023 // into the actual SpoofData object.
1025 // Build up the string array, and record the index of each string therein
1026 // in the (build time only) string pool.
1027 // Strings of length one are not entered into the strings array.
1028 // At the same time, build up the string lengths table, which records the
1029 // position in the string table of the first string of each length >= 4.
1030 // (Strings in the table are sorted by length)
1033 fStringTable = new StringBuffer();
1034 fStringLengthsTable = new ArrayList<Integer>();
1035 int previousStringLength = 0;
1036 int previousStringIndex = 0;
1037 int poolSize = stringPool.size();
1039 for (i = 0; i < poolSize; i++) {
1040 SPUString s = stringPool.getByIndex(i);
1041 int strLen = s.fStr.length();
1042 int strIndex = fStringTable.length();
1043 assert (strLen >= previousStringLength);
1045 // strings of length one do not get an entry in the string table.
1046 // Keep the single string character itself here, which is the same
1047 // convention that is used in the final run-time string table index.
1048 s.fStrTableIndex = s.fStr.charAt(0);
1050 if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
1051 fStringLengthsTable.add(previousStringIndex);
1052 fStringLengthsTable.add(previousStringLength);
1054 s.fStrTableIndex = strIndex;
1055 fStringTable.append(s.fStr);
1057 previousStringLength = strLen;
1058 previousStringIndex = strIndex;
1060 // Make the final entry to the string lengths table.
1061 // (it holds an entry for the _last_ string of each length, so adding
1063 // final one doesn't happen in the main loop because no longer string
1064 // was encountered.)
1065 if (previousStringLength >= 4) {
1066 fStringLengthsTable.add(previousStringIndex);
1067 fStringLengthsTable.add(previousStringLength);
1070 // Construct the compile-time Key and Value tables
1072 // For each key code point, check which mapping tables it applies to,
1073 // and create the final data for the key & value structures.
1075 // The four logical mapping tables are conflated into one combined
1077 // If multiple logical tables have the same mapping for some key, they
1078 // share a single entry in the combined table.
1079 // If more than one mapping exists for the same key code point, multiple
1080 // entries will be created in the table
1082 for (String keyCharStr: fKeySet) {
1083 int keyChar = keyCharStr.codePointAt(0);
1084 addKeyEntry(keyChar, fSLTable, SpoofChecker.SL_TABLE_FLAG);
1085 addKeyEntry(keyChar, fSATable, SpoofChecker.SA_TABLE_FLAG);
1086 addKeyEntry(keyChar, fMLTable, SpoofChecker.ML_TABLE_FLAG);
1087 addKeyEntry(keyChar, fMATable, SpoofChecker.MA_TABLE_FLAG);
1090 // Put the assembled data into the destination SpoofData object.
1093 // While copying the keys to the output array,
1094 // also sanity check that the keys are sorted.
1096 int numKeys = fKeyVec.size();
1097 dest.fCFUKeys = new int[numKeys];
1098 int previousKey = 0;
1099 for (i=0; i<numKeys; i++) {
1100 int key = fKeyVec.get(i);
1101 assert ((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
1102 assert ((key & 0xff000000) != 0);
1103 dest.fCFUKeys[i] = key;
1107 // The Value Table, parallels the key table
1108 int numValues = fValueVec.size();
1109 assert (numKeys == numValues);
1110 dest.fCFUValues = new short[numValues];
1112 for (int value:fValueVec) {
1113 assert (value < 0xffff);
1114 dest.fCFUValues[i++] = (short)value;
1117 // The Strings Table.
1119 dest.fCFUStrings = fStringTable.toString();
1122 // The String Lengths Table.
1124 // While copying into the runtime array do some sanity checks on the values
1125 // Each complete entry contains two fields, an index and an offset.
1126 // Lengths should increase with each entry.
1127 // Offsets should be less than the size of the string table.
1129 int lengthTableLength = fStringLengthsTable.size();
1130 int previousLength = 0;
1132 // Note: StringLengthsSize in the raw data is the number of complete entries,
1133 // each consisting of a pair of 16 bit values, hence the divide by 2.
1135 int stringLengthsSize = lengthTableLength / 2;
1136 dest.fCFUStringLengths = new SpoofData.SpoofStringLengthsElement[stringLengthsSize];
1137 for (i = 0; i < stringLengthsSize; i += 1) {
1138 int offset = fStringLengthsTable.get(i*2);
1139 int length = fStringLengthsTable.get(i*2 + 1);
1140 assert (offset < dest.fCFUStrings.length());
1141 assert (length < 40);
1142 assert (length > previousLength);
1143 dest.fCFUStringLengths[i] = new SpoofData.SpoofStringLengthsElement();
1144 dest.fCFUStringLengths[i].fLastString = offset;
1145 dest.fCFUStringLengths[i].fStrLength = length;
1146 previousLength = length;
1150 // Add an entry to the key and value tables being built
1151 // input: data from SLTable, MATable, etc.
1152 // outut: entry added to fKeyVec and fValueVec
1153 // addKeyEntry Construction of the confusable Key and Mapping Values tables.
1154 // This is an intermediate point in the building process.
1155 // We already have the mappings in the hash tables fSLTable, etc.
1156 // This function builds corresponding run-time style table entries into
1157 // fKeyVec and fValueVec
1158 void addKeyEntry(int keyChar, // The key character
1159 Hashtable<Integer, SPUString> table, // The table, one of SATable,
1161 int tableFlag) { // One of SA_TABLE_FLAG, etc.
1162 SPUString targetMapping = table.get(keyChar);
1163 if (targetMapping == null) {
1164 // No mapping for this key character.
1165 // (This function is called for all four tables for each key char
1167 // is seen anywhere, so this no entry cases are very much expected.)
1171 // Check whether there is already an entry with the correct mapping.
1172 // If so, simply set the flag in the keyTable saying that the existing
1174 // applies to the table that we're doing now.
1175 boolean keyHasMultipleValues = false;
1177 for (i = fKeyVec.size() - 1; i >= 0; i--) {
1178 int key = fKeyVec.get(i);
1179 if ((key & 0x0ffffff) != keyChar) {
1180 // We have now checked all existing key entries for this key
1182 // without finding one with the same mapping.
1185 String mapping = getMapping(i);
1186 if (mapping.equals(targetMapping.fStr)) {
1187 // The run time entry we are currently testing has the correct
1189 // Set the flag in it indicating that it applies to the new
1192 fKeyVec.set(i, key);
1195 keyHasMultipleValues = true;
1198 // Need to add a new entry to the binary data being built for this
1200 // Includes adding entries to both the key table and the parallel values
1202 int newKey = keyChar | tableFlag;
1203 if (keyHasMultipleValues) {
1204 newKey |= SpoofChecker.KEY_MULTIPLE_VALUES;
1206 int adjustedMappingLength = targetMapping.fStr.length() - 1;
1207 if (adjustedMappingLength > 3) {
1208 adjustedMappingLength = 3;
1210 newKey |= adjustedMappingLength << SpoofChecker.KEY_LENGTH_SHIFT;
1212 int newData = targetMapping.fStrTableIndex;
1214 fKeyVec.add(newKey);
1215 fValueVec.add(newData);
1217 // If the preceding key entry is for the same key character (but with a
1218 // different mapping)
1219 // set the multiple-values flag on it.
1220 if (keyHasMultipleValues) {
1221 int previousKeyIndex = fKeyVec.size() - 2;
1222 int previousKey = fKeyVec.get(previousKeyIndex);
1223 previousKey |= SpoofChecker.KEY_MULTIPLE_VALUES;
1224 fKeyVec.set(previousKeyIndex, previousKey);
1228 // From an index into fKeyVec & fValueVec
1229 // get a String with the corresponding mapping.
1230 String getMapping(int index) {
1231 int key = fKeyVec.get(index);
1232 int value = fValueVec.get(index);
1233 int length = SpoofChecker.getKeyLength(key);
1234 int lastIndexWithLen;
1237 char[] cs = { (char) value };
1238 return new String(cs);
1241 return fStringTable.substring(value, value + length + 1); // Note: +1 as optimization
1245 for (i = 0; i < fStringLengthsTable.size(); i += 2) {
1246 lastIndexWithLen = fStringLengthsTable.get(i);
1247 if (value <= lastIndexWithLen) {
1248 length = fStringLengthsTable.get(i + 1);
1252 assert (length >= 3);
1253 return fStringTable.substring(value, value + length);
1264 public static void buildConfusableData(Reader confusables, SpoofData dest) throws java.io.IOException,
1266 ConfusabledataBuilder builder = new ConfusabledataBuilder();
1267 builder.build(confusables, dest);
1271 * *****************************************************************************
1272 * Internal classes for compiling confusable data into its binary (runtime) form.
1273 * *****************************************************************************
1276 // Holds a string that is the result of one of the mappings defined
1277 // by the confusable mapping data (confusables.txt from Unicode.org)
1278 // Instances of SPUString exist during the compilation process only.
1280 private static class SPUString {
1281 String fStr; // The actual string.
1282 int fStrTableIndex; // Index into the final runtime data for this string.
1283 // (or, for length 1, the single string char itself,
1284 // there being no string table entry for it.)
1286 SPUString(String s) {
1292 // Comparison function for ordering strings in the string pool.
1293 // Compare by length first, then, within a group of the same length,
1294 // by code point order.
1296 private static class SPUStringComparator implements Comparator<SPUString> {
1297 public int compare(SPUString sL, SPUString sR) {
1298 int lenL = sL.fStr.length();
1299 int lenR = sR.fStr.length();
1302 } else if (lenL > lenR) {
1305 return sL.fStr.compareTo(sR.fStr);
1310 // String Pool A utility class for holding the strings that are the result of
1311 // the spoof mappings. These strings will utimately end up in the
1312 // run-time String Table.
1313 // This is sort of like a sorted set of strings, except that ICU's anemic
1314 // built-in collections don't support those, so it is implemented with a
1315 // combination of a uhash and a Vector.
1316 private static class SPUStringPool {
1317 public SPUStringPool() {
1318 fVec = new Vector<SPUString>();
1319 fHash = new Hashtable<String, SPUString>();
1326 // Get the n-th string in the collection.
1327 public SPUString getByIndex(int index) {
1328 SPUString retString = fVec.elementAt(index);
1332 // Add a string. Return the string from the table.
1333 // If the input parameter string is already in the table, delete the
1334 // input parameter and return the existing string.
1335 public SPUString addString(String src) {
1336 SPUString hashedString = fHash.get(src);
1337 if (hashedString == null) {
1338 hashedString = new SPUString(src);
1339 fHash.put(src, hashedString);
1340 fVec.addElement(hashedString);
1342 return hashedString;
1345 // Sort the contents; affects the ordering of getByIndex().
1346 public void sort() {
1347 Collections.sort(fVec, new SPUStringComparator());
1350 private Vector<SPUString> fVec; // Elements are SPUString *
1351 private Hashtable<String, SPUString> fHash; // Key: Value:
1358 * Get the Restriction Level that is being tested.
1360 * @return The restriction level
1363 public RestrictionLevel getRestrictionLevel() {
1364 return fRestrictionLevel;
1368 * Get the set of checks that this Spoof Checker has been configured to perform.
1370 * @return The set of checks that this spoof checker will perform.
1373 public int getChecks() {
1378 * Get a list of locales for the scripts that are acceptable in strings to be checked. If no limitations on scripts
1379 * have been specified, an empty set will be returned.
1381 * setAllowedChars() will reset the list of allowed locales to be empty.
1383 * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales();
1384 * the information other than languages from the originally specified locales may be omitted.
1386 * @return A set of locales corresponding to the acceptable scripts.
1390 public Set<ULocale> getAllowedLocales() {
1391 return fAllowedLocales;
1395 * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set
1396 * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by
1399 * The returned set will be frozen, meaning that it cannot be modified by the caller.
1401 * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test.
1404 public UnicodeSet getAllowedChars() {
1405 return fAllowedCharsSet;
1409 * A struct-like class to hold the results of a Spoof Check operation.
1410 * Tells which check(s) have failed.
1414 public static class CheckResult {
1416 * Indicate which of the spoof check(s) has failed. The value is a bitwise OR
1417 * of the constants for the tests in question, SINGLE_SCRIPT_CONFUSABLE,
1418 * MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, and so on.
1424 * The index of the first string position that failed a check.
1426 * @deprecated ICU 51. No longer supported. Always set to zero.
1428 public int position;
1430 * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null;
1434 public UnicodeSet numerics;
1436 * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null.
1440 public RestrictionLevel restrictionLevel;
1443 * Default constructor
1446 public CheckResult() {
1453 * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1454 * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1457 * A String to be checked for possible security issues.
1458 * @param checkResult
1459 * Output parameter, indicates which specific tests failed.
1460 * May be null if the information is not wanted.
1461 * @return True if any issue is found with the input string.
1464 public boolean failsChecks(String text, CheckResult checkResult) {
1465 int length = text.length();
1468 if (checkResult != null) {
1469 checkResult.position = 0;
1470 checkResult.numerics = null;
1471 checkResult.restrictionLevel = null;
1474 // Allocate an identifier info if needed.
1476 IdentifierInfo identifierInfo = null;
1477 if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) {
1478 identifierInfo = getIdentifierInfo().setIdentifier(text).setIdentifierProfile(fAllowedCharsSet);
1481 if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) {
1482 RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel();
1483 if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) {
1484 result |= RESTRICTION_LEVEL;
1486 if (checkResult != null) {
1487 checkResult.restrictionLevel = textRestrictionLevel;
1491 if (0 != ((this.fChecks) & MIXED_NUMBERS)) {
1492 UnicodeSet numerics = identifierInfo.getNumerics();
1493 if (numerics.size() > 1) {
1494 result |= MIXED_NUMBERS;
1496 if (checkResult != null) {
1497 checkResult.numerics = numerics;
1501 if (0 != (this.fChecks & CHAR_LIMIT)) {
1504 for (i = 0; i < length;) {
1505 // U16_NEXT(text, i, length, c);
1506 c = Character.codePointAt(text, i);
1507 i = Character.offsetByCodePoints(text, i, 1);
1508 if (!this.fAllowedCharsSet.contains(c)) {
1509 result |= CHAR_LIMIT;
1515 if (0 != (this.fChecks & (WHOLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | INVISIBLE))) {
1516 // These are the checks that need to be done on NFD input
1517 String nfdText = nfdNormalizer.normalize(text);
1519 if (0 != (this.fChecks & INVISIBLE)) {
1521 // scan for more than one occurence of the same non-spacing mark
1522 // in a sequence of non-spacing marks.
1525 int firstNonspacingMark = 0;
1526 boolean haveMultipleMarks = false;
1527 UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a
1528 // single combining sequence.
1529 for (i = 0; i < length;) {
1530 c = Character.codePointAt(nfdText, i);
1531 i = Character.offsetByCodePoints(nfdText, i, 1);
1532 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) {
1533 firstNonspacingMark = 0;
1534 if (haveMultipleMarks) {
1535 marksSeenSoFar.clear();
1536 haveMultipleMarks = false;
1540 if (firstNonspacingMark == 0) {
1541 firstNonspacingMark = c;
1544 if (!haveMultipleMarks) {
1545 marksSeenSoFar.add(firstNonspacingMark);
1546 haveMultipleMarks = true;
1548 if (marksSeenSoFar.contains(c)) {
1549 // report the error, and stop scanning.
1550 // No need to find more than the first failure.
1551 result |= INVISIBLE;
1554 marksSeenSoFar.add(c);
1558 if (0 != (this.fChecks & (WHOLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE))) {
1559 // The basic test is the same for both whole and mixed script confusables.
1560 // Compute the set of scripts that every input character has a confusable in.
1561 // For this computation an input character is always considered to be
1562 // confusable with itself in its own script.
1564 // If the number of such scripts is two or more, and the input consisted of
1565 // characters all from a single script, we have a whole script confusable.
1566 // (The two scripts will be the original script and the one that is confusable).
1568 // If the number of such scripts >= one, and the original input contained characters from
1569 // more than one script, we have a mixed script confusable. (We can transform
1570 // some of the characters, and end up with a visually similar string all in one script.)
1572 if (identifierInfo == null) {
1573 identifierInfo = getIdentifierInfo();
1574 identifierInfo.setIdentifier(text);
1576 int scriptCount = identifierInfo.getScriptCount();
1578 ScriptSet scripts = new ScriptSet();
1579 this.wholeScriptCheck(nfdText, scripts);
1580 int confusableScriptCount = scripts.countMembers();
1582 if ((0 != (this.fChecks & WHOLE_SCRIPT_CONFUSABLE)) && confusableScriptCount >= 2 && scriptCount == 1) {
1583 result |= WHOLE_SCRIPT_CONFUSABLE;
1586 if ((0 != (this.fChecks & MIXED_SCRIPT_CONFUSABLE)) && confusableScriptCount >= 1 && scriptCount > 1) {
1587 result |= MIXED_SCRIPT_CONFUSABLE;
1591 if (checkResult != null) {
1592 checkResult.checks = result;
1594 releaseIdentifierInfo(identifierInfo);
1595 return (0 != result);
1599 * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1600 * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1603 * A String to be checked for possible security issues.
1604 * @return True if any issue is found with the input string.
1607 public boolean failsChecks(String text) {
1608 return failsChecks(text, null);
1612 * Check whether two specified strings are visually confusable. The types of confusability to be tested - single
1613 * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
1615 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
1616 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected.
1618 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
1619 * folded for comparison and display to the user, do not select the ANY_CASE option.
1623 * The first of the two strings to be compared for confusability.
1625 * The second of the two strings to be compared for confusability.
1626 * @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
1627 * found, as defined by spoof check test constants.
1630 public int areConfusable(String s1, String s2) {
1632 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
1633 // and for definitions of the types (single, whole, mixed-script) of confusables.
1635 // We only care about a few of the check flags. Ignore the others.
1636 // If no tests relavant to this function have been specified, signal an error.
1637 // TODO: is this really the right thing to do? It's probably an error on
1638 // the caller's part, but logically we would just return 0 (no error).
1639 if ((this.fChecks & (SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE)) == 0) {
1640 throw new IllegalArgumentException("No confusable checks are enabled.");
1642 int flagsForSkeleton = this.fChecks & ANY_CASE;
1645 IdentifierInfo identifierInfo = getIdentifierInfo();
1646 identifierInfo.setIdentifier(s1);
1647 int s1ScriptCount = identifierInfo.getScriptCount();
1648 identifierInfo.setIdentifier(s2);
1649 int s2ScriptCount = identifierInfo.getScriptCount();
1650 releaseIdentifierInfo(identifierInfo);
1652 if (0 != (this.fChecks & SINGLE_SCRIPT_CONFUSABLE)) {
1653 // Do the Single Script compare.
1654 if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
1655 flagsForSkeleton |= SINGLE_SCRIPT_CONFUSABLE;
1656 String s1Skeleton = getSkeleton(flagsForSkeleton, s1);
1657 String s2Skeleton = getSkeleton(flagsForSkeleton, s2);
1658 if (s1Skeleton.equals(s2Skeleton)) {
1659 result |= SINGLE_SCRIPT_CONFUSABLE;
1664 if (0 != (result & SINGLE_SCRIPT_CONFUSABLE)) {
1665 // If the two inputs are single script confusable they cannot also be
1666 // mixed or whole script confusable, according to the UAX39 definitions.
1667 // So we can skip those tests.
1671 // Two identifiers are whole script confusable if each is of a single script
1672 // and they are mixed script confusable.
1673 boolean possiblyWholeScriptConfusables = s1ScriptCount <= 1 && s2ScriptCount <= 1
1674 && (0 != (this.fChecks & WHOLE_SCRIPT_CONFUSABLE));
1676 // Mixed Script Check
1677 if ((0 != (this.fChecks & MIXED_SCRIPT_CONFUSABLE)) || possiblyWholeScriptConfusables) {
1678 // For getSkeleton(), resetting the SINGLE_SCRIPT_CONFUSABLE flag will get us
1679 // the mixed script table skeleton, which is what we want.
1680 // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
1681 flagsForSkeleton &= ~SINGLE_SCRIPT_CONFUSABLE;
1682 String s1Skeleton = getSkeleton(flagsForSkeleton, s1);
1683 String s2Skeleton = getSkeleton(flagsForSkeleton, s2);
1684 if (s1Skeleton.equals(s2Skeleton)) {
1685 result |= MIXED_SCRIPT_CONFUSABLE;
1686 if (possiblyWholeScriptConfusables) {
1687 result |= WHOLE_SCRIPT_CONFUSABLE;
1695 * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
1696 * confusable if their skeletons are identical. See Unicode UAX 39 for additional information.
1698 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
1699 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
1702 * The type of skeleton, corresponding to which of the Unicode confusable data tables to use. The default
1703 * is Mixed-Script, Lowercase. Allowed options are SINGLE_SCRIPT_CONFUSABLE and ANY_CASE_CONFUSABLE. The
1704 * two flags may be ORed.
1706 * The input identifier whose skeleton will be generated.
1707 * @return The output skeleton string.
1711 public String getSkeleton(int type, String id) {
1715 tableMask = ML_TABLE_FLAG;
1717 case SINGLE_SCRIPT_CONFUSABLE:
1718 tableMask = SL_TABLE_FLAG;
1721 tableMask = MA_TABLE_FLAG;
1723 case SINGLE_SCRIPT_CONFUSABLE | ANY_CASE:
1724 tableMask = SA_TABLE_FLAG;
1727 // *status = U_ILLEGAL_ARGUMENT_ERROR;
1728 throw new IllegalArgumentException("SpoofChecker.getSkeleton(), bad type value.");
1731 // Apply the skeleton mapping to the NFD normalized input string
1732 // Accumulate the skeleton, possibly unnormalized, in a String.
1734 String nfdId = nfdNormalizer.normalize(id);
1735 int normalizedLen = nfdId.length();
1736 StringBuilder skelSB = new StringBuilder();
1737 for (int inputIndex = 0; inputIndex < normalizedLen;) {
1738 int c = Character.codePointAt(nfdId, inputIndex);
1739 inputIndex += Character.charCount(c);
1740 this.confusableLookup(c, tableMask, skelSB);
1742 String skelStr = skelSB.toString();
1743 skelStr = nfdNormalizer.normalize(skelStr);
1749 * Equality function. Return true if the two SpoofChecker objects
1750 * incorporate the same confusable data and have enabled the same
1753 * @param other the SpoofChecker being compared with.
1754 * @return true if the two SpoofCheckers are equal.
1757 public boolean equals(Object other) {
1758 if (!(other instanceof SpoofChecker)) {return false; }
1759 SpoofChecker otherSC = (SpoofChecker)other;
1760 if (fSpoofData != otherSC.fSpoofData &&
1761 fSpoofData != null &&
1762 !fSpoofData.equals(otherSC.fSpoofData)) {
1765 if (fChecks != otherSC.fChecks) {return false; }
1766 if (fAllowedLocales != otherSC.fAllowedLocales &&
1767 fAllowedLocales != null &&
1768 !fAllowedLocales.equals(otherSC.fAllowedLocales)) {
1771 if (fAllowedCharsSet != otherSC.fAllowedCharsSet &&
1772 fAllowedCharsSet != null &&
1773 !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) {
1776 if (fRestrictionLevel != otherSC.fRestrictionLevel) {
1784 * Append the confusable skeleton transform for a single code point to a StringBuilder.
1785 * The string to be appended will be between 1 and 18 characters.
1787 * This is the heart of the confusable skeleton generation implementation.
1789 * @param tableMask bit flag specifying which confusable table to use. One of SL_TABLE_FLAG, MA_TABLE_FLAG, etc.
1791 private void confusableLookup(int inChar, int tableMask, StringBuilder dest) {
1792 // Binary search the spoof data key table for the inChar
1795 int limit = fSpoofData.fCFUKeys.length;
1797 boolean foundChar = false;
1798 // [low, limit), i.e low is inclusive, limit is exclusive
1800 int delta = (limit - low) / 2;
1802 midc = fSpoofData.fCFUKeys[mid] & 0x1fffff;
1803 if (inChar == midc) {
1806 } else if (inChar < midc) {
1807 limit = mid; // limit is exclusive
1809 // we have checked mid is not the char we looking for, the next char
1810 // we want to check is (mid + 1)
1811 low = mid + 1; // low is inclusive
1813 } while (low < limit);
1814 if (!foundChar) { // Char not found. It maps to itself.
1815 dest.appendCodePoint(inChar);
1819 boolean foundKey = false;
1820 int keyFlags = fSpoofData.fCFUKeys[mid] & 0xff000000;
1821 if ((keyFlags & tableMask) == 0) {
1822 // We found the right key char, but the entry doesn't pertain to the
1823 // table we need. See if there is an adjacent key that does
1824 if (0 != (keyFlags & SpoofChecker.KEY_MULTIPLE_VALUES)) {
1826 for (altMid = mid - 1; (fSpoofData.fCFUKeys[altMid] & 0x00ffffff) == inChar; altMid--) {
1827 keyFlags = fSpoofData.fCFUKeys[altMid] & 0xff000000;
1828 if (0 != (keyFlags & tableMask)) {
1835 for (altMid = mid + 1; (fSpoofData.fCFUKeys[altMid] & 0x00ffffff) == inChar; altMid++) {
1836 keyFlags = fSpoofData.fCFUKeys[altMid] & 0xff000000;
1837 if (0 != (keyFlags & tableMask)) {
1846 // No key entry for this char & table.
1847 // The input char maps to itself.
1848 dest.appendCodePoint(inChar);
1853 int stringLen = getKeyLength(keyFlags) + 1;
1854 int keyTableIndex = mid;
1856 // Value is either a char (for strings of length 1) or
1857 // an index into the string table (for longer strings)
1858 short value = fSpoofData.fCFUValues[keyTableIndex];
1859 if (stringLen == 1) {
1860 dest.append((char) value);
1864 // String length of 4 from the above lookup is used for all strings of
1866 // For these, get the real length from the string lengths table,
1867 // which maps string table indexes to lengths.
1868 // All strings of the same length are stored contiguously in the string table.
1869 // 'value' from the lookup above is the starting index for the desired string.
1871 if (stringLen == 4) {
1872 boolean dataOK = false;
1873 for (SpoofData.SpoofStringLengthsElement el: fSpoofData.fCFUStringLengths) {
1874 if (el.fLastString >= value) {
1875 stringLen = el.fStrLength;
1883 dest.append(fSpoofData.fCFUStrings, value, value + stringLen);
1887 // Implementation for Whole Script tests.
1888 // Input text is already normalized to NFD
1889 // Return the set of scripts, each of which can represent something that is
1890 // confusable with the input text. The script of the input text
1891 // is included; input consisting of characters from a single script will
1892 // always produce a result consisting of a set containing that script.
1893 private void wholeScriptCheck(CharSequence text, ScriptSet result) {
1897 Trie2 table = (0 != (fChecks & ANY_CASE)) ? fSpoofData.fAnyCaseTrie : fSpoofData.fLowerCaseTrie;
1899 while (inputIdx < text.length()) {
1900 c = Character.codePointAt(text, inputIdx);
1901 inputIdx = Character.offsetByCodePoints(text, inputIdx, 1);
1902 int index = table.get(c);
1904 // No confusables in another script for this char.
1905 // TODO: we should change the data to have sets with just the single script
1906 // bit for the script of this char. Gets rid of this special case.
1907 // Until then, grab the script from the char and intersect it with the set.
1908 int cpScript = UScript.getScript(c);
1909 assert (cpScript > UScript.INHERITED);
1910 result.intersect(cpScript);
1911 } else if (index == 1) {
1912 // Script == Common or Inherited. Nothing to do.
1914 result.intersect(fSpoofData.fScriptSets[index]);
1919 // IdentifierInfo Cache. IdentifierInfo objects are somewhat expensive to create.
1920 // Maintain a one-element cache, which is sufficient to avoid repeatedly
1921 // creating new ones unless we get multi-thread concurrency collisions in spoof
1922 // check operations, which should be statistically uncommon.
1924 private IdentifierInfo fCachedIdentifierInfo = null; // Do not use this directly.
1926 private IdentifierInfo getIdentifierInfo() {
1927 IdentifierInfo returnIdInfo = null;
1928 synchronized (this) {
1929 returnIdInfo = fCachedIdentifierInfo;
1930 fCachedIdentifierInfo = null;
1932 if (returnIdInfo == null) {
1933 returnIdInfo = new IdentifierInfo();
1935 return returnIdInfo;
1939 private void releaseIdentifierInfo(IdentifierInfo idInfo) {
1940 if (idInfo != null) {
1941 synchronized (this) {
1942 if (fCachedIdentifierInfo == null) {
1943 fCachedIdentifierInfo = idInfo;
1950 private int fChecks; // Bit vector of checks to perform.
1951 private SpoofData fSpoofData;
1952 private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
1953 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
1954 private RestrictionLevel fRestrictionLevel;
1956 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
1959 // Confusable Mappings Data Structures
1961 // For the confusable data, we are essentially implementing a map,
1962 // key: a code point
1963 // value: a string. Most commonly one char in length, but can be more.
1965 // The keys are stored as a sorted array of 32 bit ints.
1966 // bits 0-23 a code point value
1968 // 24: 1 if entry applies to SL table
1969 // 25: 1 if entry applies to SA table
1970 // 26: 1 if entry applies to ML table
1971 // 27: 1 if entry applies to MA table
1972 // 28: 1 if there are multiple entries for this code point.
1973 // 29-30: length of value string, in UChars.
1974 // values are (1, 2, 3, other)
1975 // The key table is sorted in ascending code point order. (not on the
1976 // 32 bit int value, the flag bits do not participate in the sorting.)
1978 // Lookup is done by means of a binary search in the key table.
1980 // The corresponding values are kept in a parallel array of 16 bit ints.
1981 // If the value string is of length 1, it is literally in the value array.
1982 // For longer strings, the value array contains an index into the strings
1986 // The strings table contains all of the value strings (those of length two or greater)
1987 // concatenated together into one long char (UTF-16) array.
1989 // The array is arranged by length of the strings - all strings of the same length
1990 // are stored together. The sections are ordered by length of the strings -
1991 // all two char strings first, followed by all of the three Char strings, etc.
1993 // There is no nul character or other mark between adjacent strings.
1995 // String Lengths table
1996 // The length of strings from 1 to 3 is flagged in the key table.
1997 // For strings of length 4 or longer, the string length table provides a
1998 // mapping between an index into the string table and the corresponding length.
1999 // Strings of these lengths are rare, so lookup time is not an issue.
2000 // Each entry consists of
2001 // unsigned short index of the _last_ string with this length
2002 // unsigned short the length
2004 // Flag bits in the Key entries
2005 static final int SL_TABLE_FLAG = (1 << 24);
2006 static final int SA_TABLE_FLAG = (1 << 25);
2007 static final int ML_TABLE_FLAG = (1 << 26);
2008 static final int MA_TABLE_FLAG = (1 << 27);
2009 static final int KEY_MULTIPLE_VALUES = (1 << 28);
2010 static final int KEY_LENGTH_SHIFT = 29;
2012 static final int getKeyLength(int x) {
2013 return (((x) >> 29) & 3);
2017 // -------------------------------------------------------------------------------------
2021 // This class corresponds to the ICU SpoofCheck data.
2023 // The data can originate with the Binary ICU data that is generated in ICU4C,
2024 // or it can originate from source rules that are compiled in ICU4J.
2026 // This class does not include the set of checks to be performed, but only
2027 // data that is serialized into the ICU binary data.
2029 // Because Java cannot easily wrap binary data like ICU4C, the binary data is
2030 // copied into Java structures that are convenient for use by the run time code.
2032 // ---------------------------------------------------------------------------------------
2033 private static class SpoofData {
2035 // The Confusable data, Java data structures for.
2038 SpoofStringLengthsElement[] fCFUStringLengths;
2041 // Whole Script Confusable Data
2043 Trie2 fLowerCaseTrie;
2044 ScriptSet[] fScriptSets;
2046 static class SpoofStringLengthsElement {
2047 int fLastString; // index in string table of last string with this length
2048 int fStrLength; // Length of strings
2049 public boolean equals(Object other) {
2050 if (!(other instanceof SpoofStringLengthsElement)) {
2053 SpoofStringLengthsElement otherEl = (SpoofStringLengthsElement)other;
2054 return fLastString == otherEl.fLastString &&
2055 fStrLength == otherEl.fStrLength;
2061 // getDefault() - Create a SpoofData instance that is built from
2062 // the data baked into the default ICU data.
2064 static SpoofData getDefault() {
2065 // TODO: Cache it. Lazy create, keep until cleanup.
2066 SpoofData This = null;
2068 InputStream is = com.ibm.icu.impl.ICUData.getRequiredStream(com.ibm.icu.impl.ICUResourceBundle.ICU_BUNDLE
2069 + "/confusables.cfu");
2070 This = new SpoofData(is);
2073 catch (IOException e) {
2074 // Return null in this case.
2079 // SpoofChecker Data constructor for use from data builder.
2080 // Initializes a new, empty data area that will be populated later.
2084 // Constructor for use when creating from prebuilt default data.
2085 // A InputStream is what the ICU internal data loading functions provide.
2086 SpoofData(InputStream is) throws java.io.IOException {
2087 // Seek past the ICU data header.
2088 // TODO: verify that the header looks good.
2089 DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
2091 assert (dis.markSupported());
2092 dis.mark(Integer.MAX_VALUE);
2096 public boolean equals(Object other) {
2097 if (!(other instanceof SpoofData)) {
2100 SpoofData otherData = (SpoofData)other;
2101 if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) return false;
2102 if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) return false;
2103 if (!Arrays.deepEquals(fCFUStringLengths, otherData.fCFUStringLengths)) return false;
2104 if (fCFUStrings != otherData.fCFUStrings &&
2105 fCFUStrings != null &&
2106 !fCFUStrings.equals(otherData.fCFUStrings)) return false;
2107 if (fAnyCaseTrie != otherData.fAnyCaseTrie &&
2108 fAnyCaseTrie != null &&
2109 !fAnyCaseTrie.equals(otherData.fAnyCaseTrie)) return false;
2110 if (fLowerCaseTrie != otherData.fLowerCaseTrie &&
2111 fLowerCaseTrie != null &&
2112 !fLowerCaseTrie.equals(otherData.fLowerCaseTrie)) return false;
2113 if (!Arrays.deepEquals(fScriptSets, otherData.fScriptSets)) return false;
2117 // Set the SpoofChecker data from pre-built binary data on a DataInputStream.
2118 // The binary data format is as described for ICU4C spoof data.
2120 void readData(DataInputStream dis) throws java.io.IOException {
2121 int magic = dis.readInt();
2122 if (magic != 0x3845fdef) {
2123 throw new IllegalArgumentException("Bad Spoof Check Data.");
2125 @SuppressWarnings("unused")
2126 int dataFormatVersion = dis.readInt();
2127 @SuppressWarnings("unused")
2128 int dataLength = dis.readInt();
2130 int CFUKeysOffset = dis.readInt();
2131 int CFUKeysSize = dis.readInt();
2133 int CFUValuesOffset = dis.readInt();
2134 int CFUValuesSize = dis.readInt();
2136 int CFUStringTableOffset = dis.readInt();
2137 int CFUStringTableSize = dis.readInt();
2139 int CFUStringLengthsOffset = dis.readInt();
2140 int CFUStringLengthsSize = dis.readInt();
2142 int anyCaseTrieOffset = dis.readInt();
2143 @SuppressWarnings("unused")
2144 int anyCaseTrieSize = dis.readInt();
2146 int lowerCaseTrieOffset = dis.readInt();
2147 @SuppressWarnings("unused")
2148 int lowerCaseTrieLength = dis.readInt();
2150 int scriptSetsOffset = dis.readInt();
2151 int scriptSetslength = dis.readInt();
2156 fCFUStringLengths = null;
2159 // We have now read the file header, and obtained the position for each
2160 // of the data items. Now read each in turn, first seeking the
2161 // input stream to the position of the data item.
2164 dis.skip(CFUKeysOffset);
2165 fCFUKeys = new int[CFUKeysSize];
2166 for (i = 0; i < CFUKeysSize; i++) {
2167 fCFUKeys[i] = dis.readInt();
2171 dis.skip(CFUValuesOffset);
2172 fCFUValues = new short[CFUValuesSize];
2173 for (i = 0; i < CFUValuesSize; i++) {
2174 fCFUValues[i] = dis.readShort();
2178 dis.skip(CFUStringTableOffset);
2179 StringBuffer CFUStringB = new StringBuffer();
2180 for (i = 0; i < CFUStringTableSize; i++) {
2181 CFUStringB.append(dis.readChar());
2183 fCFUStrings = CFUStringB.toString();
2186 dis.skip(CFUStringLengthsOffset);
2187 fCFUStringLengths = new SpoofStringLengthsElement[CFUStringLengthsSize];
2188 for (i = 0; i < CFUStringLengthsSize; i++) {
2189 fCFUStringLengths[i] = new SpoofStringLengthsElement();
2190 fCFUStringLengths[i].fLastString = dis.readShort();
2191 fCFUStringLengths[i].fStrLength = dis.readShort();
2195 dis.skip(anyCaseTrieOffset);
2196 fAnyCaseTrie = Trie2.createFromSerialized(dis);
2199 dis.skip(lowerCaseTrieOffset);
2200 fLowerCaseTrie = Trie2.createFromSerialized(dis);
2203 dis.skip(scriptSetsOffset);
2204 fScriptSets = new ScriptSet[scriptSetslength];
2205 for (i = 0; i < scriptSetslength; i++) {
2206 fScriptSets[i] = new ScriptSet(dis);
2212 // -------------------------------------------------------------------------------
2214 // ScriptSet - Script code bit sets. Used with the whole script confusable data.
2215 // Used both at data build and at run time.
2216 // Could almost be a Java BitSet, except that the input and output would
2219 // -------------------------------------------------------------------------------
2220 static class ScriptSet {
2221 public ScriptSet() {
2224 public ScriptSet(DataInputStream dis) throws java.io.IOException {
2225 for (int j = 0; j < bits.length; j++) {
2226 bits[j] = dis.readInt();
2230 public void output(DataOutputStream os) throws java.io.IOException {
2231 for (int i = 0; i < bits.length; i++) {
2232 os.writeInt(bits[i]);
2236 public boolean equals(Object other) {
2237 if (!(other instanceof ScriptSet)) {
2240 ScriptSet otherSet = (ScriptSet)other;
2241 return Arrays.equals(bits, otherSet.bits);
2244 public void Union(int script) {
2245 int index = script / 32;
2246 int bit = 1 << (script & 31);
2247 assert (index < bits.length * 4 * 4);
2251 @SuppressWarnings("unused")
2252 public void Union(ScriptSet other) {
2253 for (int i = 0; i < bits.length; i++) {
2254 bits[i] |= other.bits[i];
2258 public void intersect(ScriptSet other) {
2259 for (int i = 0; i < bits.length; i++) {
2260 bits[i] &= other.bits[i];
2264 public void intersect(int script) {
2265 int index = script / 32;
2266 int bit = 1 << (script & 31);
2267 assert (index < bits.length * 4 * 4);
2269 for (i = 0; i < index; i++) {
2273 for (i = index + 1; i < bits.length; i++) {
2278 public void setAll() {
2279 for (int i = 0; i < bits.length; i++) {
2280 bits[i] = 0xffffffff;
2284 @SuppressWarnings("unused")
2285 public void resetAll() {
2286 for (int i = 0; i < bits.length; i++) {
2291 public int countMembers() {
2292 // This bit counter is good for sparse numbers of '1's, which is
2293 // very much the case that we will usually have.
2295 for (int i = 0; i < bits.length; i++) {
2299 x &= (x - 1); // AND off the least significant one bit.
2300 // Note - Java integer over/underflow behavior is well defined.
2301 // 0x80000000 - 1 = 0x7fffffff
2307 private int[] bits = new int[6];