X-Git-Url: http://gitweb.fperrin.net/?a=blobdiff_plain;f=jars%2Ficu4j-4_2_1-src%2Fsrc%2Fcom%2Fibm%2Ficu%2Ftext%2FUnicodeSet.java;h=a28fb46c81e7b23b0d0b10fb7c04ef4fb13ed31a;hb=127973afabe0c34015667c599d68bf9453d85652;hp=83d50d4122603d88f4a2b604f0e40d64790e56e5;hpb=92dfc8b7d39cbc2e55f3c547c0c265bc7ae3af86;p=Dictionary.git diff --git a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/UnicodeSet.java b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/UnicodeSet.java old mode 100755 new mode 100644 index 83d50d4..a28fb46 --- a/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/UnicodeSet.java +++ b/jars/icu4j-4_2_1-src/src/com/ibm/icu/text/UnicodeSet.java @@ -1,3773 +1,3773 @@ -//##header -/* - ******************************************************************************* - * Copyright (C) 1996-2009, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* - */ -package com.ibm.icu.text; - -import java.text.*; -import com.ibm.icu.lang.*; - -import java.io.IOException; - -import com.ibm.icu.impl.NormalizerImpl; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.impl.UCharacterProperty; -import com.ibm.icu.impl.UBiDiProps; -import com.ibm.icu.impl.UCaseProps; -import com.ibm.icu.impl.UPropertyAliases; -import com.ibm.icu.impl.SortedSetRelation; -import com.ibm.icu.impl.RuleCharacterIterator; - -import com.ibm.icu.util.Freezable; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - -import com.ibm.icu.text.BreakIterator; - -import java.util.MissingResourceException; -import java.util.TreeSet; -import java.util.Iterator; -import java.util.Collection; - -/** - * A mutable set of Unicode characters and multicharacter strings. Objects of this class - * represent character classes used in regular expressions. - * A character specifies a subset of Unicode code points. Legal - * code points are U+0000 to U+10FFFF, inclusive. - * - *
The UnicodeSet class is not designed to be subclassed. - * - *
UnicodeSet
supports two APIs. The first is the
- * operand API that allows the caller to modify the value of
- * a UnicodeSet
object. It conforms to Java 2's
- * java.util.Set
interface, although
- * UnicodeSet
does not actually implement that
- * interface. All methods of Set
are supported, with the
- * modification that they take a character range or single character
- * instead of an Object
, and they take a
- * UnicodeSet
instead of a Collection
. The
- * operand API may be thought of in terms of boolean logic: a boolean
- * OR is implemented by add
, a boolean AND is implemented
- * by retain
, a boolean XOR is implemented by
- * complement
taking an argument, and a boolean NOT is
- * implemented by complement
with no argument. In terms
- * of traditional set theory function names, add
is a
- * union, retain
is an intersection, remove
- * is an asymmetric difference, and complement
with no
- * argument is a set complement with respect to the superset range
- * MIN_VALUE-MAX_VALUE
- *
- *
The second API is the
- * applyPattern()
/toPattern()
API from the
- * java.text.Format
-derived classes. Unlike the
- * methods that add characters, add categories, and control the logic
- * of the set, the method applyPattern()
sets all
- * attributes of a UnicodeSet
at once, based on a
- * string pattern.
- *
- *
Pattern syntax
- * - * Patterns are accepted by the constructors and the - *applyPattern()
methods and returned by the
- * toPattern()
method. These patterns follow a syntax
- * similar to that employed by version 8 regular expression character
- * classes. Here are some simple examples:
- *
- * - *- * - * Any character may be preceded by a backslash in order to remove any special - * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are - * ignored, unless they are escaped. - * - *- *
- *- * - * []
No characters - *- * - * [a]
The character 'a' - *- * - *- * [ae]
The characters 'a' and 'e' - *- * - *- * [a-e]
The characters 'a' through 'e' inclusive, in Unicode code - * point order - *- * - *- * [\\u4E01]
The character U+4E01 - *- * - *- * [a{ab}{ac}]
The character 'a' and the multicharacter strings "ab" and - * "ac" - *- * - *- * [\p{Lu}]
All characters in the general category Uppercase Letter - *
Property patterns specify a set of characters having a certain - * property as defined by the Unicode standard. Both the POSIX-like - * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a - * complete list of supported property patterns, see the User's Guide - * for UnicodeSet at - * - * http://www.icu-project.org/userguide/unicodeSet.html. - * Actual determination of property data is defined by the underlying - * Unicode database as implemented by UCharacter. - * - *
Patterns specify individual characters, ranges of characters, and - * Unicode property sets. When elements are concatenated, they - * specify their union. To complement a set, place a '^' immediately - * after the opening '['. Property patterns are inverted by modifying - * their delimiters; "[:^foo:]" and "\P{foo}". In any other location, - * '^' has no special meaning. - * - *
Ranges are indicated by placing a '-' between two - * characters, as in "a-z". This specifies the range of all - * characters from the left to the right, in Unicode order. If the - * left character is greater than or equal to the - * right character it is a syntax error. If a '-' occurs as the first - * character after the opening '[' or '[^', or if it occurs as the - * last character before the closing ']', then it is taken as a - * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same - * set of three characters, 'a', 'b', and '-'. - * - *
Sets may be intersected using the '&' operator or the asymmetric - * set difference may be taken using the '-' operator, for example, - * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters - * with values less than 4096. Operators ('&' and '-') have equal - * precedence and bind left-to-right. Thus - * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to - * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for - * difference; intersection is commutative. - * - *
[a] | The set containing 'a' - * |
[a-z] | The set containing 'a' - * through 'z' and all letters in between, in Unicode order - * |
[^a-z] | The set containing - * all characters but 'a' through 'z', - * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF - * |
[[pat1][pat2]]
- * | The union of sets specified by pat1 and pat2 - * |
[[pat1]&[pat2]]
- * | The intersection of sets specified by pat1 and pat2 - * |
[[pat1]-[pat2]]
- * | The asymmetric difference of sets specified by pat1 and - * pat2 - * |
[:Lu:] or \p{Lu}
- * | The set of characters having the specified - * Unicode property; in - * this case, Unicode uppercase letters - * |
[:^Lu:] or \P{Lu}
- * | The set of characters not having the given - * Unicode property - * |
Warning: you cannot add an empty string ("") to a UnicodeSet.
- * - *Formal syntax
- * - *- *- *- *
- *- * - *- * pattern :=
- * ('[' '^'? item* ']') | - * property
- * - *- * item :=
- * char | (char '-' char) | pattern-expr
- *- * - *- * pattern-expr :=
- * pattern | pattern-expr pattern | - * pattern-expr op pattern
- *- * - *- * op :=
- * '&' | '-'
- *- * - *- * special :=
- * '[' | ']' | '-'
- *- * - *- * char :=
any character that is not - *special
any character
- * | ('\\')
- * | ('\u' hex hex hex hex)
- *- * - *- * hex :=
any character for which - * - *Character.digit(c, 16)
- * returns a non-negative result- * - *- * property :=
a Unicode property set pattern - *
- *- *
- *- * - *Legend: - *- *
- *- * - *- * a := b
- * - * a
may be replaced byb
- * - *- * a?
- * zero or one instance of - *a
- *- * - *- * a*
- * zero or more instances of - *a
- *- * - *- * a | b
- * either - *a
orb
- *- * - *- * 'a'
- * the literal string between the quotes - *
To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
- *
- * @author Alan Liu
- * @stable ICU 2.0
- * @see UnicodeSetIterator
- */
-public class UnicodeSet extends UnicodeFilter implements Freezable {
-
- // Sentinel values bounding everything stored in the inversion list.
- private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
- private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
- // 110000 for codepoints
-
- /**
- * Minimum value that can be stored in a UnicodeSet.
- * @stable ICU 2.0
- */
- public static final int MIN_VALUE = LOW;
-
- /**
- * Maximum value that can be stored in a UnicodeSet
- * (one less than the HIGH sentinel).
- * @stable ICU 2.0
- */
- public static final int MAX_VALUE = HIGH - 1;
-
- // Inversion list storage: entries are read in pairs as range start (inclusive)
- // and range limit (exclusive), in ascending order; see indexOf() and charAt().
- private int len; // number of entries of list in use; list may be longer to minimize reallocs
- private int[] list; // MUST be terminated with HIGH
- private int[] rangeList; // internal scratch buffer
- private int[] buffer; // internal scratch buffer
-
- // NOTE: normally this field would be of type SortedSet, but that interface lacks a public clone().
- // The field is package-private (not private) so that UnicodeSetIterator can access it.
- TreeSet strings = new TreeSet();
-
- /**
- * The pattern representation of this set. This may not be the
- * most economical pattern. It is the pattern supplied to
- * applyPattern(), with variables substituted and whitespace
- * removed. For sets constructed without applyPattern(), or
- * modified using the non-pattern API, this string will be null,
- * indicating that toPattern() must generate a pattern
- * representation from the inversion list.
- */
- private String pat = null;
-
- private static final int START_EXTRA = 16; // initial spare capacity of list. Must be >= 0
- private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
-
- // Special property set IDs
- private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF]
- private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
- private static final String ASSIGNED = "Assigned"; // [:^Cn:]
-
- /**
- * A set of all characters _except_ the second through last characters of
- * certain ranges. These ranges are ranges of characters whose
- * properties are all exactly alike, e.g. CJK Ideographs from
- * U+4E00 to U+9FA5.
- * NOTE(review): starts out null, so it is presumably filled in lazily
- * elsewhere in the class -- confirm.
- */
- private static UnicodeSet INCLUSIONS[] = null;
-
- //----------------------------------------------------------------
- // Public API
- //----------------------------------------------------------------
-
- /**
- * Constructs an empty set.
- * @stable ICU 2.0
- */
- public UnicodeSet() {
- // One slot for the mandatory HIGH terminator plus START_EXTRA spare slots.
- list = new int[1 + START_EXTRA];
- // An empty set is just the terminator; len becomes 1.
- list[len++] = HIGH;
- }
-
- /**
- * Constructs a copy of an existing set.
- * @param other the set to copy
- * @stable ICU 2.0
- */
- public UnicodeSet(UnicodeSet other) {
- // set() clones other's list and strings, so the two sets share no mutable storage.
- set(other);
- }
-
- /**
- * Constructs a set containing the given range. If The UnicodeSet class is not designed to be subclassed.
+ *
+ * The second API is the
+ * Pattern syntax Property patterns specify a set of characters having a certain
+ * property as defined by the Unicode standard. Both the POSIX-like
+ * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
+ * complete list of supported property patterns, see the User's Guide
+ * for UnicodeSet at
+ *
+ * http://www.icu-project.org/userguide/unicodeSet.html.
+ * Actual determination of property data is defined by the underlying
+ * Unicode database as implemented by UCharacter.
+ *
+ * Patterns specify individual characters, ranges of characters, and
+ * Unicode property sets. When elements are concatenated, they
+ * specify their union. To complement a set, place a '^' immediately
+ * after the opening '['. Property patterns are inverted by modifying
+ * their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
+ * '^' has no special meaning.
+ *
+ * Ranges are indicated by placing a '-' between two
+ * characters, as in "a-z". This specifies the range of all
+ * characters from the left to the right, in Unicode order. If the
+ * left character is greater than or equal to the
+ * right character it is a syntax error. If a '-' occurs as the first
+ * character after the opening '[' or '[^', or if it occurs as the
+ * last character before the closing ']', then it is taken as a
+ * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
+ * set of three characters, 'a', 'b', and '-'.
+ *
+ * Sets may be intersected using the '&' operator or the asymmetric
+ * set difference may be taken using the '-' operator, for example,
+ * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
+ * with values less than 4096. Operators ('&' and '-') have equal
+ * precedence and bind left-to-right. Thus
+ * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
+ * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
+ * difference; intersection is commutative.
+ *
+ * Warning: you cannot add an empty string ("") to a UnicodeSet. Formal syntax To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
+ *
+ * @author Alan Liu
+ * @stable ICU 2.0
+ * @see UnicodeSetIterator
+ */
+public class UnicodeSet extends UnicodeFilter implements Freezable {
+
+ // Sentinel values bounding everything stored in the inversion list.
+ private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
+ private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
+ // 110000 for codepoints
+
+ /**
+ * Minimum value that can be stored in a UnicodeSet.
+ * @stable ICU 2.0
+ */
+ public static final int MIN_VALUE = LOW;
+
+ /**
+ * Maximum value that can be stored in a UnicodeSet
+ * (one less than the HIGH sentinel).
+ * @stable ICU 2.0
+ */
+ public static final int MAX_VALUE = HIGH - 1;
+
+ // Inversion list storage: entries are read in pairs as range start (inclusive)
+ // and range limit (exclusive), in ascending order.
+ private int len; // number of entries of list in use; list may be longer to minimize reallocs
+ private int[] list; // MUST be terminated with HIGH
+ private int[] rangeList; // internal scratch buffer
+ private int[] buffer; // internal scratch buffer
+
+ // NOTE: normally this field would be of type SortedSet, but that interface lacks a public clone().
+ // The field is package-private (not private) so that UnicodeSetIterator can access it.
+ TreeSet strings = new TreeSet();
+
+ /**
+ * The pattern representation of this set. This may not be the
+ * most economical pattern. It is the pattern supplied to
+ * applyPattern(), with variables substituted and whitespace
+ * removed. For sets constructed without applyPattern(), or
+ * modified using the non-pattern API, this string will be null,
+ * indicating that toPattern() must generate a pattern
+ * representation from the inversion list.
+ */
+ private String pat = null;
+
+ private static final int START_EXTRA = 16; // initial spare capacity of list. Must be >= 0
+ private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
+
+ // Special property set IDs
+ private static final String ANY_ID = "ANY"; // [\u0000-\U0010FFFF]
+ private static final String ASCII_ID = "ASCII"; // [\u0000-\u007F]
+ private static final String ASSIGNED = "Assigned"; // [:^Cn:]
+
+ /**
+ * A set of all characters _except_ the second through last characters of
+ * certain ranges. These ranges are ranges of characters whose
+ * properties are all exactly alike, e.g. CJK Ideographs from
+ * U+4E00 to U+9FA5.
+ * NOTE(review): starts out null, so it is presumably filled in lazily
+ * elsewhere in the class -- confirm.
+ */
+ private static UnicodeSet INCLUSIONS[] = null;
+
+ //----------------------------------------------------------------
+ // Public API
+ //----------------------------------------------------------------
+
+ /**
+ * Constructs an empty set.
+ * @stable ICU 2.0
+ */
+ public UnicodeSet() {
+ // One slot for the mandatory HIGH terminator plus START_EXTRA spare slots.
+ list = new int[1 + START_EXTRA];
+ // An empty set is just the terminator; len becomes 1.
+ list[len++] = HIGH;
+ }
+
+ /**
+ * Constructs a copy of an existing set.
+ * @param other the set to copy
+ * @stable ICU 2.0
+ */
+ public UnicodeSet(UnicodeSet other) {
+ // Delegates the whole copy to set(other).
+ set(other);
+ }
+
+ /**
+ * Constructs a set containing the given range. If end >
- * start
then an empty set is created.
- *
- * @param start first character, inclusive, of range
- * @param end last character, inclusive, of range
- * @stable ICU 2.0
- */
- public UnicodeSet(int start, int end) {
- this();
- // The set starts out empty, so complementing [start, end] is the same as adding it.
- // complement(int, int) leaves the set untouched when start > end.
- complement(start, end);
- }
-
- /**
- * Constructs a set from the given pattern. See the class description
- * for the syntax of the pattern language. Whitespace is ignored.
- * @param pattern a string specifying what characters are in the set
- * @exception java.lang.IllegalArgumentException if the pattern contains
- * a syntax error.
- * @stable ICU 2.0
- */
- public UnicodeSet(String pattern) {
- this();
- // No parse position and no symbol table; IGNORE_SPACE is the whitespace-skipping option.
- applyPattern(pattern, null, null, IGNORE_SPACE);
- }
-
- /**
- * Constructs a set from the given pattern. See the class description
- * for the syntax of the pattern language.
- * @param pattern a string specifying what characters are in the set
- * @param ignoreWhitespace if true, ignore characters for which
- * UCharacterProperty.isRuleWhiteSpace() returns true
- * @exception java.lang.IllegalArgumentException if the pattern contains
- * a syntax error.
- * @stable ICU 2.0
- */
- public UnicodeSet(String pattern, boolean ignoreWhitespace) {
- this();
- // Translate the boolean into the option bitmask: IGNORE_SPACE or no options at all.
- applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
- }
-
- /**
- * Constructs a set from the given pattern. See the class description
- * for the syntax of the pattern language.
- * @param pattern a string specifying what characters are in the set
- * @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
- * @exception java.lang.IllegalArgumentException if the pattern contains
- * a syntax error.
- * @stable ICU 3.8
- */
- public UnicodeSet(String pattern, int options) {
- this();
- // No parse position and no symbol table; the options are passed through unchanged.
- applyPattern(pattern, null, null, options);
- }
-
- /**
- * Constructs a set from the given pattern. See the class description
- * for the syntax of the pattern language. This overload always applies
- * the IGNORE_SPACE option.
- * @param pattern a string specifying what characters are in the set
- * @param pos on input, the position in pattern at which to start parsing.
- * On output, the position after the last character parsed.
- * @param symbols a symbol table mapping variables to char[] arrays
- * and chars to UnicodeSets
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
- * @stable ICU 2.0
- */
- public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
- this();
- // Same as the four-argument constructor with options fixed to IGNORE_SPACE.
- applyPattern(pattern, pos, symbols, IGNORE_SPACE);
- }
-
- /**
- * Constructs a set from the given pattern. See the class description
- * for the syntax of the pattern language. This is the most general
- * pattern constructor: every argument is passed straight to applyPattern().
- * @param pattern a string specifying what characters are in the set
- * @param pos on input, the position in pattern at which to start parsing.
- * On output, the position after the last character parsed.
- * @param symbols a symbol table mapping variables to char[] arrays
- * and chars to UnicodeSets
- * @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
- * @stable ICU 3.2
- */
- public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
- this();
- applyPattern(pattern, pos, symbols, options);
- }
-
-
- /**
-  * Creates a new set equivalent to this one. The copy is built with the
-  * copy constructor and then takes over this set's frozen flag.
-  * @return the newly created copy
-  * @stable ICU 2.0
-  */
- public Object clone() {
-     final UnicodeSet copy = new UnicodeSet(this);
-     copy.frozen = frozen;
-     return copy;
- }
-
- /**
- * Make this object represent the range <code>start - end</code>.
- * If <code>start &gt; end</code> then this object is set to
- * an empty range.
- *
- * @param start first character in the set, inclusive
- * @param end last character in the set, inclusive
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public UnicodeSet set(int start, int end) {
- checkFrozen();
- clear();
- // On the now-empty set, complementing [start, end] inserts exactly that range;
- // complement(int, int) does nothing when start > end, leaving the set empty.
- complement(start, end);
- return this;
- }
-
- /**
- * Make this object represent the same set as <code>other</code>.
- * @param other a <code>UnicodeSet</code> whose value will be
- * copied to this object
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public UnicodeSet set(UnicodeSet other) {
- checkFrozen();
- // Clone the backing array so later edits to either set do not affect the other.
- list = (int[]) other.list.clone();
- len = other.len;
- // The cached pattern string is shared as-is.
- pat = other.pat;
- // Clone the string collection for the same reason as the array.
- strings = (TreeSet)other.strings.clone();
- return this;
- }
-
- /**
- * Modifies this set to represent the set specified by the given pattern.
- * See the class description for the syntax of the pattern language.
- * Whitespace is ignored.
- * @param pattern a string specifying what characters are in the set
- * @return the result of the four-argument applyPattern() call
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
- * @stable ICU 2.0
- */
- public final UnicodeSet applyPattern(String pattern) {
- checkFrozen();
- // No parse position and no symbol table; IGNORE_SPACE is the whitespace-skipping option.
- return applyPattern(pattern, null, null, IGNORE_SPACE);
- }
-
- /**
- * Modifies this set to represent the set specified by the given pattern,
- * optionally ignoring whitespace.
- * See the class description for the syntax of the pattern language.
- * @param pattern a string specifying what characters are in the set
- * @param ignoreWhitespace if true then characters for which
- * UCharacterProperty.isRuleWhiteSpace() returns true are ignored
- * @return the result of the four-argument applyPattern() call
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
- * @stable ICU 2.0
- */
- public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) {
- checkFrozen();
- // Translate the boolean into the option bitmask: IGNORE_SPACE or no options at all.
- return applyPattern(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
- }
-
- /**
- * Modifies this set to represent the set specified by the given pattern,
- * applying the given options.
- * See the class description for the syntax of the pattern language.
- * @param pattern a string specifying what characters are in the set
- * @param options a bitmask indicating which options to apply.
- * Valid options are IGNORE_SPACE and CASE.
- * @return the result of the four-argument applyPattern() call
- * @exception java.lang.IllegalArgumentException if the pattern
- * contains a syntax error.
- * @stable ICU 3.8
- */
- public UnicodeSet applyPattern(String pattern, int options) {
- checkFrozen();
- // No parse position and no symbol table; the options are passed through unchanged.
- return applyPattern(pattern, null, null, options);
- }
-
- /**
-  * Reports whether the text at the given position looks like the beginning
-  * of a UnicodeSet pattern: either a '[' that is followed by at least one
-  * more character, or something accepted by resemblesPropertyPattern().
-  * @param pattern the text to inspect
-  * @param pos the index in pattern to inspect
-  * @return true if a set pattern appears to start at pos
-  * @stable ICU 2.0
-  */
- public static boolean resemblesPattern(String pattern, int pos) {
-     final boolean hasFollowingChar = (pos + 1) < pattern.length();
-     if (hasFollowingChar && pattern.charAt(pos) == '[') {
-         return true;
-     }
-     return resemblesPropertyPattern(pattern, pos);
- }
-
- /**
-  * Append the <code>toPattern()</code> representation of a
-  * string to the given <code>StringBuffer</code>, one code point at a time.
-  * @param buf the buffer to append to
-  * @param s the string whose code points are appended
-  * @param escapeUnprintable passed through to the per-character overload
-  */
- private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
-     int cp;
-     // Advance by the width of the code point just read. The previous code
-     // advanced by UTF16.getCharCount(i), i.e. the width of the *index*,
-     // so it stepped one char at a time even across surrogate pairs.
-     for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
-         cp = UTF16.charAt(s, i);
-         _appendToPat(buf, cp, escapeUnprintable);
-     }
- }
-
- /**
- * Append the toPattern()
representation of a
- * character to the given StringBuffer
.
- */
- private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // Use hex escape notation (charAt()
.
- * @return an index from 0..size()-1, or -1
- * @stable ICU 2.0
- */
- public int indexOf(int c) {
-     if (c < MIN_VALUE || c > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
-     }
-     // Number of code points contained in the ranges already walked past.
-     int preceding = 0;
-     // Walk the (start, limit) pairs. The loop always exits through a return:
-     // list is terminated with HIGH, which is greater than every valid c.
-     for (int pos = 0; ; pos += 2) {
-         final int rangeStart = list[pos];
-         if (c < rangeStart) {
-             return -1;
-         }
-         final int rangeLimit = list[pos + 1];
-         if (c < rangeLimit) {
-             return preceding + c - rangeStart;
-         }
-         preceding += rangeLimit - rangeStart;
-     }
- }
-
- /**
-  * Returns the character at the given index within this set, where
-  * the set is ordered by ascending code point. If the index is
-  * out of range, return -1. The inverse of this method is
-  * <code>indexOf()</code>.
-  * @param index an index from 0..size()-1
-  * @return the character at the given index, or -1.
-  * @stable ICU 2.0
-  */
- public int charAt(int index) {
-     if (index < 0) {
-         return -1;
-     }
-     // Largest even value <= len: when len is odd the final entry is the
-     // HIGH terminator, which does not start a range.
-     final int pairedLen = len & ~1;
-     int remaining = index;
-     for (int pos = 0; pos < pairedLen; pos += 2) {
-         final int rangeStart = list[pos];
-         final int rangeSize = list[pos + 1] - rangeStart;
-         if (remaining < rangeSize) {
-             return rangeStart + remaining;
-         }
-         remaining -= rangeSize;
-     }
-     return -1;
- }
-
- /**
- * Adds the specified range to this set if it is not already
- * present. If this set already contains the specified range,
- * the call leaves this set unchanged. If <code>start &gt; end</code>
- * then an empty range is added, leaving the set unchanged.
- *
- * @param start first character, inclusive, of range to be added
- * to this set.
- * @param end last character, inclusive, of range to be added
- * to this set.
- * @return this object, for chaining
- * @exception java.lang.IllegalArgumentException if start or end is
- * outside MIN_VALUE..MAX_VALUE
- * @stable ICU 2.0
- */
- public UnicodeSet add(int start, int end) {
- checkFrozen();
- // Argument validation and the actual insertion live in add_unchecked().
- return add_unchecked(start, end);
- }
-
- // Internal worker for add(int, int); callers must already have called checkFrozen().
- // Validates both end points, then adds the range; does nothing when start > end.
- private UnicodeSet add_unchecked(int start, int end) {
-     if (start < MIN_VALUE || start > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
-     }
-     if (end < MIN_VALUE || end > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
-     }
-     if (start == end) {
-         // A one-element range goes through the single-character add.
-         add(start);
-     } else if (start < end) {
-         add(range(start, end), 2, 0);
-     }
-     return this;
- }
-
-// /**
-// * Format out the inversion list as a string, for debugging. Uncomment when
-// * needed.
-// */
-// public final String dump() {
-// StringBuffer buf = new StringBuffer("[");
-// for (int i=0; i
Warning: you cannot add an empty string ("") to a UnicodeSet.
- * @param s the source string
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public final UnicodeSet add(String s) {
-     checkFrozen();
-     final int singleCodePoint = getSingleCP(s);
-     if (singleCodePoint >= 0) {
-         // s is exactly one code point: store it as a character, not as a string.
-         add_unchecked(singleCodePoint, singleCodePoint);
-         return this;
-     }
-     strings.add(s);
-     // The cached pattern no longer describes the set.
-     pat = null;
-     return this;
- }
-
- /**
-  * Decides whether a string consists of exactly one code point.
-  * @param s the string to test
-  * @return the code point if s consists of a single one (one char, or
-  * two chars that UTF16.charAt() reads as a value above 0xFFFF);
-  * otherwise -1.
-  * @exception java.lang.IllegalArgumentException if s is empty
-  */
- private static int getSingleCP(String s) {
-     final int length = s.length();
-     if (length < 1) {
-         throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
-     }
-     if (length == 1) {
-         return s.charAt(0);
-     }
-     if (length == 2) {
-         final int first = UTF16.charAt(s, 0);
-         // Above 0xFFFF means the two chars were read as one surrogate pair.
-         return (first > 0xFFFF) ? first : -1;
-     }
-     return -1;
- }
-
- /**
-  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
-  * If this set already contains any particular character, adding it has no effect.
-  * @param s the source string
-  * @return this object, for chaining
-  * @stable ICU 2.0
-  */
- public final UnicodeSet addAll(String s) {
-     checkFrozen();
-     int offset = 0;
-     while (offset < s.length()) {
-         final int codePoint = UTF16.charAt(s, offset);
-         add_unchecked(codePoint, codePoint);
-         // Step over one or two chars, depending on the code point just read.
-         offset += UTF16.getCharCount(codePoint);
-     }
-     return this;
- }
-
- /**
- * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
- * Equivalent to <code>retainAll(fromAll(s))</code>.
- * @param s the source string
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public final UnicodeSet retainAll(String s) {
- // fromAll(s) builds a temporary set holding each character of s.
- return retainAll(fromAll(s));
- }
-
- /**
- * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
- * Equivalent to <code>complementAll(fromAll(s))</code>.
- * @param s the source string
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public final UnicodeSet complementAll(String s) {
- // fromAll(s) builds a temporary set holding each character of s.
- return complementAll(fromAll(s));
- }
-
- /**
- * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
- * Equivalent to <code>removeAll(fromAll(s))</code>.
- * @param s the source string
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public final UnicodeSet removeAll(String s) {
- // fromAll(s) builds a temporary set holding each character of s.
- return removeAll(fromAll(s));
- }
-
- /**
-  * Removes every multicharacter string from this UnicodeSet; the
-  * characters stored in the range list are not touched. The cached
-  * pattern is discarded only when something was actually removed.
-  * @return this object, for chaining
-  * @draft ICU 4.2
-  * @provisional This API might change or be removed in a future release.
-  */
- public final UnicodeSet removeAllStrings() {
-     checkFrozen();
-     if (!strings.isEmpty()) {
-         strings.clear();
-         pat = null;
-     }
-     return this;
- }
-
- /**
- * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
- * @param s the source string
- * @return a newly created set containing the given string
- * @exception java.lang.IllegalArgumentException if s is empty
- * @stable ICU 2.0
- */
- public static UnicodeSet from(String s) {
- // add(String) stores s as one element (or as a character if s is a single code point).
- return new UnicodeSet().add(s);
- }
-
-
- /**
- * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
- * @param s the source string
- * @return a newly created set containing the given characters
- * @stable ICU 2.0
- */
- public static UnicodeSet fromAll(String s) {
- // addAll(String) adds every code point of s individually.
- return new UnicodeSet().addAll(s);
- }
-
-
- /**
-  * Retain only the elements in this set that are contained in the
-  * specified range. If <code>start &gt; end</code> then an empty range is
-  * retained, leaving the set empty.
-  *
-  * @param start first character, inclusive, of range to be retained
-  * to this set.
-  * @param end last character, inclusive, of range to be retained
-  * to this set.
-  * @return this object, for chaining
-  * @exception java.lang.IllegalArgumentException if start or end is
-  * outside MIN_VALUE..MAX_VALUE
-  * @stable ICU 2.0
-  */
- public UnicodeSet retain(int start, int end) {
-     checkFrozen();
-     if (start < MIN_VALUE || start > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
-     }
-     if (end < MIN_VALUE || end > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
-     }
-     if (start > end) {
-         // Intersecting with an empty range leaves nothing.
-         clear();
-     } else {
-         retain(range(start, end), 2, 0);
-     }
-     return this;
- }
-
- /**
- * Retain the specified character from this set if it is present.
- * Upon return this set will be empty if it did not contain c, or
- * will only contain c if it did contain c.
- * @param c the character to be retained
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public final UnicodeSet retain(int c) {
- // A single character is the one-element range [c, c].
- return retain(c, c);
- }
-
- /**
-  * Retain the specified string in this set if it is present.
-  * Upon return this set will be empty if it did not contain s, or
-  * will only contain s if it did contain s.
-  * @param s the string to be retained
-  * @return this object, for chaining
-  * @exception java.lang.IllegalArgumentException if s is empty
-  * @stable ICU 2.0
-  */
- public final UnicodeSet retain(String s) {
-     int cp = getSingleCP(s);
-     if (cp < 0) {
-         // Mutating path: refuse up front on a frozen set, like add(String)
-         // and complement(String) do.
-         checkFrozen();
-         boolean isIn = strings.contains(s);
-         if (isIn && size() == 1) {
-             // s is already the only element; nothing to do.
-             return this;
-         }
-         clear();
-         // Put s back only if it was a member. The previous code re-added it
-         // unconditionally, so retaining an absent string inserted it
-         // instead of leaving the set empty.
-         if (isIn) {
-             strings.add(s);
-         }
-         pat = null;
-     } else {
-         // s is a single code point; retain(int, int) does its own frozen check.
-         retain(cp, cp);
-     }
-     return this;
- }
-
- /**
-  * Removes the specified range from this set if it is present.
-  * The set will not contain the specified range once the call
-  * returns. If <code>start &gt; end</code> then an empty range is
-  * removed, leaving the set unchanged.
-  *
-  * @param start first character, inclusive, of range to be removed
-  * from this set.
-  * @param end last character, inclusive, of range to be removed
-  * from this set.
-  * @return this object, for chaining
-  * @exception java.lang.IllegalArgumentException if start or end is
-  * outside MIN_VALUE..MAX_VALUE
-  * @stable ICU 2.0
-  */
- public UnicodeSet remove(int start, int end) {
-     checkFrozen();
-     if (start < MIN_VALUE || start > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
-     }
-     if (end < MIN_VALUE || end > MAX_VALUE) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
-     }
-     if (start > end) {
-         // Empty range: nothing to remove.
-         return this;
-     }
-     // Same worker as retain(int, int), but with a final argument of 2 instead of 0.
-     retain(range(start, end), 2, 2);
-     return this;
- }
-
- /**
- * Removes the specified character from this set if it is present.
- * The set will not contain the specified character once the call
- * returns.
- * @param c the character to be removed
- * @return this object, for chaining
- * @stable ICU 2.0
- */
- public final UnicodeSet remove(int c) {
- // A single character is the one-element range [c, c].
- return remove(c, c);
- }
-
- /**
-  * Removes the specified string from this set if it is present.
-  * The set will not contain the specified string once the call
-  * returns.
-  * @param s the string to be removed
-  * @return this object, for chaining
-  * @exception java.lang.IllegalArgumentException if s is empty
-  * @stable ICU 2.0
-  */
- public final UnicodeSet remove(String s) {
-     int cp = getSingleCP(s);
-     if (cp < 0) {
-         // This branch edits strings and pat directly, so it needs its own
-         // frozen check; previously a frozen set could be modified here.
-         checkFrozen();
-         strings.remove(s);
-         pat = null;
-     } else {
-         // s is a single code point; remove(int, int) does its own frozen check.
-         remove(cp, cp);
-     }
-     return this;
- }
-
- /**
-  * Complements the specified range in this set. Any character in
-  * the range will be removed if it is in this set, or will be
-  * added if it is not in this set. If <code>start &gt; end</code>
-  * then an empty range is complemented, leaving the set unchanged.
-  *
-  * @param start first character, inclusive, of range to be complemented
-  * in this set.
-  * @param end last character, inclusive, of range to be complemented
-  * in this set.
-  * @return this object, for chaining
-  * @exception java.lang.IllegalArgumentException if start or end is
-  * outside MIN_VALUE..MAX_VALUE
-  * @stable ICU 2.0
-  */
- public UnicodeSet complement(int start, int end) {
-     checkFrozen();
-     final boolean startValid = (start >= MIN_VALUE) && (start <= MAX_VALUE);
-     if (!startValid) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
-     }
-     final boolean endValid = (end >= MIN_VALUE) && (end <= MAX_VALUE);
-     if (!endValid) {
-         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
-     }
-     if (start <= end) {
-         xor(range(start, end), 2, 0);
-     }
-     // The cached pattern is dropped even when the range was empty.
-     pat = null;
-     return this;
- }
-
-    /**
-     * Toggles membership of a single code point: it is removed if it
-     * is in this set and added if it is not. This is the one-element
-     * form of complement(int, int).
-     * @param c the character to be complemented
-     * @return this object, for chaining
-     * @stable ICU 2.0
-     */
-    public final UnicodeSet complement(int c) {
-        final int only = c;
-        return complement(only, only);
-    }
-
-    /**
-     * Inverts the code point membership of this set. This is
-     * equivalent to <code>complement(MIN_VALUE, MAX_VALUE)</code>.
-     * The inversion list is toggled by dropping a leading LOW entry,
-     * or by inserting one when it is absent.
-     * @return this object, for chaining
-     * @stable ICU 2.0
-     */
-    public UnicodeSet complement() {
-        checkFrozen();
-        final boolean startsAtLow = (list[0] == LOW);
-        if (startsAtLow) {
-            // Drop the leading LOW: shift the remaining entries down by one.
-            --len;
-            System.arraycopy(list, 1, list, 0, len);
-        } else {
-            // Prepend LOW: shift everything up by one slot first.
-            ensureCapacity(len+1);
-            System.arraycopy(list, 0, list, 1, len);
-            list[0] = LOW;
-            ++len;
-        }
-        pat = null;
-        return this;
-    }
-
-    /**
-     * Complement the specified string in this set: the string is
-     * removed if the set currently contains it, and added otherwise.
-     * A string that getSingleCP maps to a single code point is
-     * complemented as that code point.
-     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
-     * @param s the string to complement
-     * @return this object, for chaining
-     * @stable ICU 2.0
-     */
-    public final UnicodeSet complement(String s) {
-        checkFrozen();
-        final int single = getSingleCP(s);
-        if (single >= 0) {
-            complement(single, single);
-            return this;
-        }
-        // Multicharacter string: toggle its presence in the string collection.
-        final boolean present = strings.contains(s);
-        if (present) {
-            strings.remove(s);
-        } else {
-            strings.add(s);
-        }
-        pat = null;
-        return this;
-    }
-
-    /**
-     * Returns true if this set contains the given character.
-     * @param c character to be checked for containment
-     * @return true if the test condition is met
-     * @exception IllegalArgumentException if c lies outside
-     * MIN_VALUE..MAX_VALUE
-     * @stable ICU 2.0
-     */
-    public boolean contains(int c) {
-        if (!(MIN_VALUE <= c && c <= MAX_VALUE)) {
-            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
-        }
-        // findCodePoint yields the index of the first list entry greater
-        // than c; an odd index means c falls inside a range.
-        final int slot = findCodePoint(c);
-        return (slot & 1) != 0;
-    }
-
-    /**
-     * Binary search for the smallest index i such that c < list[i].
-     * The search relies on list[len - 1] == HIGH, so the caller must
-     * pass a legal code point (0..HIGH-1).
-     * @param c a character in the range MIN_VALUE..MAX_VALUE
-     * inclusive
-     * @return the smallest integer i in the range 0..len-1,
-     * inclusive, such that c < list[i]
-     */
-    private final int findCodePoint(int c) {
-        /* Examples:
-                                           findCodePoint(c)
-           set              list[]         c=0 1 3 4 7 8
-           ===              ============== ===========
-           []               [110000]         0 0 0 0 0 0
-           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
-           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
-           [:all:]          [0, 110000]      1 1 1 1 1 1
-         */
-
-        // Below the first boundary: index 0 is the answer.
-        if (c < list[0]) {
-            return 0;
-        }
-        // Shortcut for the frequent case where c lies at or beyond the
-        // last real boundary: only the terminating HIGH is greater.
-        if (len >= 2 && c >= list[len-2]) {
-            return len-1;
-        }
-        int below = 0;        // invariant: c >= list[below]
-        int above = len - 1;  // invariant: c <  list[above]
-        while (above - below > 1) {
-            final int mid = (below + above) >>> 1;
-            if (c < list[mid]) {
-                above = mid;
-            } else {
-                below = mid;
-            }
-        }
-        return above;
-    }
-
-// //----------------------------------------------------------------
-// // Unrolled binary search
-// //----------------------------------------------------------------
-//
-// private int validLen = -1; // validated value of len
-// private int topOfLow;
-// private int topOfHigh;
-// private int power;
-// private int deltaStart;
-//
-// private void validate() {
-// if (len <= 1) {
-// throw new IllegalArgumentException("list.len==" + len + "; must be >1");
-// }
-//
-// // find greatest power of 2 less than or equal to len
-// for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
-//
-// // assert(exp2[power] <= len);
-//
-// // determine the starting points
-// topOfLow = exp2[power] - 1;
-// topOfHigh = len - 1;
-// deltaStart = exp2[power-1];
-// validLen = len;
-// }
-//
-// private static final int exp2[] = {
-// 0x1, 0x2, 0x4, 0x8,
-// 0x10, 0x20, 0x40, 0x80,
-// 0x100, 0x200, 0x400, 0x800,
-// 0x1000, 0x2000, 0x4000, 0x8000,
-// 0x10000, 0x20000, 0x40000, 0x80000,
-// 0x100000, 0x200000, 0x400000, 0x800000,
-// 0x1000000, 0x2000000, 0x4000000, 0x8000000,
-// 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
-// };
-//
-// /**
-// * Unrolled lowest index GT.
-// */
-// private final int leastIndexGT(int searchValue) {
-//
-// if (len != validLen) {
-// if (len == 1) return 0;
-// validate();
-// }
-// int temp;
-//
-// // set up initial range to search. Each subrange is a power of two in length
-// int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
-//
-// // Completely unrolled binary search, following "Programming Pearls"
-// // Each case deliberately falls through to the next
-// // Logically, list[-1] < all_search_values && list[count] > all_search_values
-// // although the values -1 and count are never actually touched.
-//
-// // The bounds at each point are low & high,
-// // where low == high - delta*2
-// // so high - delta is the midpoint
-//
-// // The invariant AFTER each line is that list[low] < searchValue <= list[high]
-//
-// switch (power) {
-// //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
-// case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
-// case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
-//
-// case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
-// case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
-// case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
-// case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
-//
-// case 24: if (searchValue < list[temp = high- 0x800000]) high = temp;
-// case 23: if (searchValue < list[temp = high- 0x400000]) high = temp;
-// case 22: if (searchValue < list[temp = high- 0x200000]) high = temp;
-// case 21: if (searchValue < list[temp = high- 0x100000]) high = temp;
-//
-// case 20: if (searchValue < list[temp = high- 0x80000]) high = temp;
-// case 19: if (searchValue < list[temp = high- 0x40000]) high = temp;
-// case 18: if (searchValue < list[temp = high- 0x20000]) high = temp;
-// case 17: if (searchValue < list[temp = high- 0x10000]) high = temp;
-//
-// case 16: if (searchValue < list[temp = high- 0x8000]) high = temp;
-// case 15: if (searchValue < list[temp = high- 0x4000]) high = temp;
-// case 14: if (searchValue < list[temp = high- 0x2000]) high = temp;
-// case 13: if (searchValue < list[temp = high- 0x1000]) high = temp;
-//
-// case 12: if (searchValue < list[temp = high- 0x800]) high = temp;
-// case 11: if (searchValue < list[temp = high- 0x400]) high = temp;
-// case 10: if (searchValue < list[temp = high- 0x200]) high = temp;
-// case 9: if (searchValue < list[temp = high- 0x100]) high = temp;
-//
-// case 8: if (searchValue < list[temp = high- 0x80]) high = temp;
-// case 7: if (searchValue < list[temp = high- 0x40]) high = temp;
-// case 6: if (searchValue < list[temp = high- 0x20]) high = temp;
-// case 5: if (searchValue < list[temp = high- 0x10]) high = temp;
-//
-// case 4: if (searchValue < list[temp = high- 0x8]) high = temp;
-// case 3: if (searchValue < list[temp = high- 0x4]) high = temp;
-// case 2: if (searchValue < list[temp = high- 0x2]) high = temp;
-// case 1: if (searchValue < list[temp = high- 0x1]) high = temp;
-// }
-//
-// return high;
-// }
-//
-// // For debugging only
-// public int len() {
-// return len;
-// }
-//
-// //----------------------------------------------------------------
-// //----------------------------------------------------------------
-
-    /**
-     * Returns true if this set contains every character
-     * of the given range.
-     * @param start first character, inclusive, of the range
-     * @param end last character, inclusive, of the range
-     * @return true if the test condition is met
-     * @exception IllegalArgumentException if either bound lies outside
-     * MIN_VALUE..MAX_VALUE
-     * @stable ICU 2.0
-     */
-    public boolean contains(int start, int end) {
-        if (!(MIN_VALUE <= start && start <= MAX_VALUE)) {
-            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
-        }
-        if (!(MIN_VALUE <= end && end <= MAX_VALUE)) {
-            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
-        }
-        // start must fall inside a range (odd index), and end must come
-        // before that range's limit.
-        final int slot = findCodePoint(start);
-        final boolean startInside = (slot & 1) != 0;
-        return startInside && end < list[slot];
-    }
-
-    /**
-     * Returns true if this set contains the given string. A string
-     * that getSingleCP maps to a single code point is tested as that
-     * code point; any other string is looked up among the
-     * multicharacter strings.
-     * @param s string to be checked for containment
-     * @return true if this set contains the specified string
-     * @stable ICU 2.0
-     */
-    public final boolean contains(String s) {
-        final int single = getSingleCP(s);
-        return (single < 0) ? strings.contains(s) : contains(single);
-    }
-
-    /**
-     * Returns true if this set contains all the characters and strings
-     * of the given set.
-     * @param b set to be checked for containment
-     * @return true if the test condition is met
-     * @stable ICU 2.0
-     */
-    public boolean containsAll(UnicodeSet b) {
-        // b is a subset when each of its ranges lies inside one range of
-        // this set. Both inversion lists are walked in parallel, reading
-        // the arrays directly for speed.
-        final int[] theirs = b.list;
-        boolean fetchMine = true;
-        boolean fetchTheirs = true;
-        int mineIdx = 0;
-        int theirsIdx = 0;
-        final int mineEnd = len - 1;
-        final int theirsEnd = b.len - 1;
-        int mineStart = 0, theirStart = 0, mineLimit = 0, theirLimit = 0;
-        for (;;) {
-            if (fetchMine) {
-                if (mineIdx >= mineEnd) {
-                    // Out of ranges here. That is acceptable only when
-                    // b is exhausted as well and no b range is pending.
-                    if (fetchTheirs && theirsIdx >= theirsEnd) {
-                        break;
-                    }
-                    return false;
-                }
-                mineStart = list[mineIdx++];
-                mineLimit = list[mineIdx++];
-            }
-            if (fetchTheirs) {
-                if (theirsIdx >= theirsEnd) {
-                    // Every range of b has been matched.
-                    break;
-                }
-                theirStart = theirs[theirsIdx++];
-                theirLimit = theirs[theirsIdx++];
-            }
-            if (theirStart >= mineLimit) {
-                // b's range begins after the current range: advance this set.
-                fetchMine = true;
-                fetchTheirs = false;
-            } else if (theirStart >= mineStart && theirLimit <= mineLimit) {
-                // b's range is wholly contained: advance b.
-                fetchMine = false;
-                fetchTheirs = true;
-            } else {
-                // Any other arrangement leaves part of b uncovered.
-                return false;
-            }
-        }
-
-        return strings.containsAll(b.strings);
-    }
-
-// /**
-// * Returns true if this set contains all the characters and strings
-// * of the given set.
-// * @param c set to be checked for containment
-// * @return true if the test condition is met
-// * @stable ICU 2.0
-// */
-// public boolean containsAllOld(UnicodeSet c) {
-// // The specified set is a subset if all of its pairs are contained in
-// // this set. It's possible to code this more efficiently in terms of
-// // direct manipulation of the inversion lists if the need arises.
-// int n = c.getRangeCount();
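-// for (int i=0; i<n; ++i) {
-// if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
-// return false;
-// }
-// }
-// if (!strings.containsAll(c.strings)) return false;
-// return true;
-// }
-
- /**
- * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
- * For example, for the Unicode set [a{bc}{cd}]
- * containsAll is true for each of: "a", "bc", "cdbca"
- * containsAll is false for each of: "acb", "bcda", "bcx"
- * @param s string containing characters to be checked for containment
- * @return true if the test condition is met
- * @stable ICU 2.0
- */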
-    public boolean containsAll(String s) {
-        // Fast path: walk the code points one by one. Only when one is
-        // missing, and the set holds strings, fall back to the recursive
-        // partition search starting from the beginning of s.
-        int pos = 0;
-        while (pos < s.length()) {
-            final int cp = UTF16.charAt(s, pos);
-            if (!contains(cp)) {
-                return strings.size() != 0 && containsAll(s, 0);
-            }
-            pos += UTF16.getCharCount(cp);
-        }
-        return true;
-    }
-
-    /**
-     * Recursive helper for containsAll(String), used once a code point
-     * of the string is missing and the set holds strings. At position i
-     * it tries the single code point first, then every string of the
-     * set that occurs at i, recursing on the remainder each time.
-     * @param s source string
-     * @param i index from which the rest of s must be matched
-     * @return true if s from i to its end can be partitioned into
-     * members of this set
-     */
-    private boolean containsAll(String s, int i) {
-        if (i >= s.length()) {
-            return true;
-        }
-        final int lead = UTF16.charAt(s, i);
-        if (contains(lead) && containsAll(s, i+UTF16.getCharCount(lead))) {
-            return true;
-        }
-
-        for (Iterator candidates = strings.iterator(); candidates.hasNext(); ) {
-            final String piece = (String)candidates.next();
-            if (s.startsWith(piece, i) && containsAll(s, i+piece.length())) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Get the Regex equivalent for this UnicodeSet. Without strings
-     * this is simply toString(); otherwise the result is a
-     * non-capturing group holding the generated character pattern
-     * followed by each string as a '|' alternative.
-     * @return regex pattern equivalent to this UnicodeSet
-     * @internal
-     * @deprecated This API is ICU internal only.
-     */
-    public String getRegexEquivalent() {
-        if (strings.size() == 0) {
-            return toString();
-        }
-        StringBuffer regex = new StringBuffer("(?:");
-        _generatePattern(regex, true, false);
-        for (Iterator each = strings.iterator(); each.hasNext(); ) {
-            regex.append('|');
-            _appendToPat(regex, (String) each.next(), true);
-        }
-        regex.append(")");
-        return regex.toString();
-    }
-
-    /**
-     * Returns true if this set contains none of the characters
-     * of the given range.
-     * @param start first character, inclusive, of the range
-     * @param end last character, inclusive, of the range
-     * @return true if the test condition is met
-     * @exception IllegalArgumentException if either bound lies outside
-     * MIN_VALUE..MAX_VALUE
-     * @stable ICU 2.0
-     */
-    public boolean containsNone(int start, int end) {
-        if (start < MIN_VALUE || start > MAX_VALUE) {
-            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
-        }
-        if (end < MIN_VALUE || end > MAX_VALUE) {
-            throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
-        }
-        // Use the binary search that contains(int, int) uses instead of
-        // a linear scan; both find the smallest i with start < list[i].
-        int i = findCodePoint(start);
-        // An even index means start lies in a gap; the range is disjoint
-        // when end also falls before the start of the next range.
-        return ((i & 1) == 0 && end < list[i]);
-    }
-
-    /**
-     * Returns true if this set and the given set have no element in
-     * common: no code point range of one overlaps a range of the
-     * other, and their string collections are disjoint.
-     * @param b set to be checked for containment
-     * @return true if the test condition is met
-     * @stable ICU 2.0
-     */
-    public boolean containsNone(UnicodeSet b) {
-        // Walk both inversion lists in parallel, reading the arrays
-        // directly for speed; any overlapping pair of ranges fails.
-        final int[] theirs = b.list;
-        boolean fetchMine = true;
-        boolean fetchTheirs = true;
-        int mineIdx = 0;
-        int theirsIdx = 0;
-        final int mineEnd = len - 1;
-        final int theirsEnd = b.len - 1;
-        int mineStart = 0, theirStart = 0, mineLimit = 0, theirLimit = 0;
-        for (;;) {
-            if (fetchMine) {
-                if (mineIdx >= mineEnd) {
-                    // This set is exhausted: go on to the string test.
-                    break;
-                }
-                mineStart = list[mineIdx++];
-                mineLimit = list[mineIdx++];
-            }
-            if (fetchTheirs) {
-                if (theirsIdx >= theirsEnd) {
-                    // b is exhausted: go on to the string test.
-                    break;
-                }
-                theirStart = theirs[theirsIdx++];
-                theirLimit = theirs[theirsIdx++];
-            }
-            if (theirStart >= mineLimit) {
-                // b's range lies entirely above the current range: advance this set.
-                fetchMine = true;
-                fetchTheirs = false;
-            } else if (mineStart >= theirLimit) {
-                // The current range lies entirely above b's range: advance b.
-                fetchMine = false;
-                fetchTheirs = true;
-            } else {
-                // The two ranges overlap.
-                return false;
-            }
-        }
-
-        return SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings);
-    }
-
-// /**
-// * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
-// * For example, for the Unicode set [a{bc}{cd}]
-// * containsNone is true for: "xy", "cb"
-// * containsNone is false for: "a", "bc", "bcd"
-// * @param c set to be checked for containment
-// * @return true if the test condition is met
-// * @stable ICU 2.0
-// */
-// public boolean containsNoneOld(UnicodeSet c) {
-// // The specified set is a subset if all of its pairs are contained in
-// // this set. It's possible to code this more efficiently in terms of
-// // direct manipulation of the inversion lists if the need arises.
-// int n = c.getRangeCount();
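-// for (int i=0; i<n; ++i) {
-// [the remainder of this commented-out method and the declarations that followed it are missing from this listing]
-
- /**
- * Iteration method that returns the first character in the
- * specified range of this set.
- * @exception ArrayIndexOutOfBoundsException if index is outside
- * the range 0..getRangeCount()-1
- * @see #getRangeCount
- * @see #getRangeEnd
- * @stable ICU 2.0
- */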
-    public int getRangeStart(int index) {
-        // Range starts occupy the even slots of the inversion list.
-        final int slot = 2 * index;
-        return list[slot];
-    }
-
-    /**
-     * Iteration method that returns the last character in the
-     * specified range of this set.
-     * @param index index of the range
-     * @return the last character, inclusive, of that range
-     * @exception ArrayIndexOutOfBoundsException if index is outside
-     * the range <code>0..getRangeCount()-1</code>
-     * @see #getRangeCount
-     * @see #getRangeStart
-     * @stable ICU 2.0
-     */
-    public int getRangeEnd(int index) {
-        // The odd slot holds the exclusive limit; step back one to get
-        // the inclusive end.
-        final int limit = list[2 * index + 1];
-        return limit - 1;
-    }
-
-    /**
-     * Reallocate this objects internal structures to take up the least
-     * possible space, without changing this object's value. The
-     * inversion list is trimmed to exactly len entries and the scratch
-     * arrays (rangeList, buffer) are released.
-     * @return this object, for chaining
-     * @stable ICU 2.0
-     */
-    public UnicodeSet compact() {
-        checkFrozen();
-        if (list.length != len) {
-            final int[] trimmed = new int[len];
-            System.arraycopy(list, 0, trimmed, 0, len);
-            list = trimmed;
-        }
-        buffer = null;
-        rangeList = null;
-        return this;
-    }
-
-    /**
-     * Compares the specified object with this set for equality. Returns
-     * <tt>true</tt> if the specified object is also a UnicodeSet whose
-     * inversion list has the same length and entries as this one and
-     * whose string collection equals this set's string collection.
-     *
-     * @param o Object to be compared for equality with this set.
-     * @return <tt>true</tt> if the specified Object is equal to this set.
-     * @stable ICU 2.0
-     */
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        // Explicit type test (also rejects null) instead of relying on a
-        // caught ClassCastException / NullPointerException, which also
-        // hid any unrelated exception raised during the comparison.
-        if (!(o instanceof UnicodeSet)) {
-            return false;
-        }
-        UnicodeSet that = (UnicodeSet) o;
-        if (len != that.len) return false;
-        for (int i = 0; i < len; ++i) {
-            if (list[i] != that.list[i]) return false;
-        }
-        return strings.equals(that.strings);
-    }
-
-    /**
-     * Returns the hash code value for this set. The value is computed
-     * from len and the entries of the inversion list only; the string
-     * collection does not contribute.
-     *
-     * @return the hash code value for this set.
-     * @see java.lang.Object#hashCode()
-     * @stable ICU 2.0
-     */
-    public int hashCode() {
-        int hash = len;
-        int idx = 0;
-        while (idx < len) {
-            hash = hash * 1000003 + list[idx];
-            idx++;
-        }
-        return hash;
-    }
-
-    /**
-     * Return a programmer-readable string representation of this
-     * object, namely the result of <code>toPattern(true)</code>.
-     * @return the pattern string for this set
-     * @stable ICU 2.0
-     */
-    public String toString() {
-        final String readable = toPattern(true);
-        return readable;
-    }
-
- //----------------------------------------------------------------
- // Implementation: Pattern parsing
- //----------------------------------------------------------------
-
-    /**
-     * Replaces the contents of this set with the result of parsing the
-     * given pattern, starting at the given position. Parsing is
-     * delegated to the RuleCharacterIterator based applyPattern, and
-     * the pattern text it rebuilds is stored in pat.
-     *
-     * If pos is null, parsing starts at index 0 and the whole pattern
-     * must be consumed: anything left after the parsed set (apart from
-     * trailing whitespace when IGNORE_SPACE is set) is an error. If pos
-     * is supplied, no such end-of-pattern check is made.
-     *
-     * @param pattern the string containing the pattern to be parsed.
-     * @param pos the position at which to begin parsing, or null to
-     * parse the entire string from its start.
-     * @param symbols symbol table handed to the parser; may be null.
-     * @param options bit mask of parse options, for example
-     * IGNORE_SPACE and CASE.
-     * @return this object, for chaining
-     * @exception java.lang.IllegalArgumentException if the parse fails.
-     * @internal
-     * @deprecated - for internal use only
-     */
-    public UnicodeSet applyPattern(String pattern,
-                                   ParsePosition pos,
-                                   SymbolTable symbols,
-                                   int options) {
-
-        // A caller that passes no position wants the whole string parsed.
-        final boolean ownsPosition = (pos == null);
-        if (ownsPosition) {
-            pos = new ParsePosition(0);
-        }
-
-        // The pattern is assembled in a temporary buffer because the
-        // parser calls add() etc., which reset pat.
-        StringBuffer rebuilt = new StringBuffer();
-        RuleCharacterIterator iter =
-            new RuleCharacterIterator(pattern, symbols, pos);
-        applyPattern(iter, symbols, rebuilt, options);
-        if (iter.inVariable()) {
-            syntaxError(iter, "Extra chars in variable value");
-        }
-        pat = rebuilt.toString();
-
-        if (!ownsPosition) {
-            return this;
-        }
-
-        int stop = pos.getIndex();
-        if ((options & IGNORE_SPACE) != 0) {
-            // Skip over trailing whitespace
-            stop = Utility.skipWhitespace(pattern, stop);
-        }
-        if (stop != pattern.length()) {
-            throw new IllegalArgumentException("Parse of \"" + pattern +
-                                               "\" failed at " + stop);
-        }
-        return this;
-    }
-
- /**
- * Parse the pattern from the given RuleCharacterIterator. The
- * iterator is advanced over the parsed pattern.
- * The current contents of this set are cleared before parsing starts.
- * When the CASE option is set, closeOver(CASE) is applied before a
- * leading '^' inversion is carried out.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param symbols symbol table to use to parse and dereference
- * variables, or null if none.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
- * @exception java.lang.IllegalArgumentException raised through
- * syntaxError when the pattern is malformed, for example when the
- * opening '[' or the closing ']' is missing.
- */
- void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
- StringBuffer rebuiltPat, int options) {
-
- // Syntax characters: [ ] ^ - & { }
-
- // Recognized special forms for chars, sets: c-c s-s s&s
-
- int opts = RuleCharacterIterator.PARSE_VARIABLES |
- RuleCharacterIterator.PARSE_ESCAPES;
- if ((options & IGNORE_SPACE) != 0) {
- opts |= RuleCharacterIterator.SKIP_WHITESPACE;
- }
-
- StringBuffer patBuf = new StringBuffer(), buf = null;
- boolean usePat = false;
- UnicodeSet scratch = null;
- Object backup = null;
-
- // mode: 0=before [, 1=between [...], 2=after ]
- // lastItem: 0=none, 1=char, 2=set
- int lastItem = 0, lastChar = 0, mode = 0;
- char op = 0;
-
- boolean invert = false;
-
- clear(); // discard the current contents; the set is rebuilt from the pattern
-
- while (mode != 2 && !chars.atEnd()) {
- if (false) { // disabled debugging check; this branch never executes
- // Debugging assertion
- if (!((lastItem == 0 && op == 0) ||
- (lastItem == 1 && (op == 0 || op == '-')) ||
- (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
- throw new IllegalArgumentException();
- }
- }
-
- int c = 0;
- boolean literal = false;
- UnicodeSet nested = null;
-
- // -------- Check for property pattern
-
- // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
- int setMode = 0;
- if (resemblesPropertyPattern(chars, opts)) {
- setMode = 2;
- }
-
- // -------- Parse '[' of opening delimiter OR nested set.
- // If there is a nested set, use `setMode' to define how
- // the set should be parsed. If the '[' is part of the
- // opening delimiter for this pattern, parse special
- // strings "[", "[^", "[-", and "[^-". Check for stand-in
- // characters representing a nested set in the symbol
- // table.
-
- else {
- // Prepare to backup if necessary
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
-
- if (c == '[' && !literal) {
- if (mode == 1) {
- chars.setPos(backup); // backup
- setMode = 1;
- } else {
- // Handle opening '[' delimiter
- mode = 1;
- patBuf.append('[');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '^' && !literal) {
- invert = true;
- patBuf.append('^');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- }
- // Fall through to handle special leading '-';
- // otherwise restart loop for nested [], \p{}, etc.
- if (c == '-') {
- literal = true;
- // Fall through to handle literal '-' below
- } else {
- chars.setPos(backup); // backup
- continue;
- }
- }
- } else if (symbols != null) {
- UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
- if (m != null) {
- try {
- nested = (UnicodeSet) m;
- setMode = 3;
- } catch (ClassCastException e) {
- syntaxError(chars, "Syntax error");
- }
- }
- }
- }
-
- // -------- Handle a nested set. This either is inline in
- // the pattern or represented by a stand-in that has
- // previously been parsed and was looked up in the symbol
- // table.
-
- if (setMode != 0) {
- if (lastItem == 1) {
- if (op != 0) {
- syntaxError(chars, "Char expected after operator");
- }
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastItem = op = 0;
- }
-
- if (op == '-' || op == '&') {
- patBuf.append(op);
- }
-
- if (nested == null) {
- if (scratch == null) scratch = new UnicodeSet();
- nested = scratch;
- }
- switch (setMode) {
- case 1:
- nested.applyPattern(chars, symbols, patBuf, options);
- break;
- case 2:
- chars.skipIgnored(opts);
- nested.applyPropertyPattern(chars, patBuf, symbols);
- break;
- case 3: // `nested' already parsed
- nested._toPattern(patBuf, false);
- break;
- }
-
- usePat = true;
-
- if (mode == 0) {
- // Entire pattern is a category; leave parse loop
- set(nested);
- mode = 2;
- break;
- }
-
- switch (op) {
- case '-':
- removeAll(nested);
- break;
- case '&':
- retainAll(nested);
- break;
- case 0:
- addAll(nested);
- break;
- }
-
- op = 0;
- lastItem = 2;
-
- continue;
- }
-
- if (mode == 0) {
- syntaxError(chars, "Missing '['");
- }
-
- // -------- Parse special (syntax) characters. If the
- // current character is not special, or if it is escaped,
- // then fall through and handle it below.
-
- if (!literal) {
- switch (c) {
- case ']':
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- // Treat final trailing '-' as a literal
- if (op == '-') {
- add_unchecked(op, op);
- patBuf.append(op);
- } else if (op == '&') {
- syntaxError(chars, "Trailing '&'");
- }
- patBuf.append(']');
- mode = 2;
- continue;
- case '-':
- if (op == 0) {
- if (lastItem != 0) {
- op = (char) c;
- continue;
- } else {
- // Treat final trailing '-' as a literal
- add_unchecked(c, c);
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == ']' && !literal) {
- patBuf.append("-]");
- mode = 2;
- continue;
- }
- }
- }
- syntaxError(chars, "'-' not after char or set"); // syntaxError always throws, so control never reaches the next case
- case '&':
- if (lastItem == 2 && op == 0) {
- op = (char) c;
- continue;
- }
- syntaxError(chars, "'&' not after set");
- case '^':
- syntaxError(chars, "'^' not after '['");
- case '{':
- if (op != 0) {
- syntaxError(chars, "Missing operand after operator");
- }
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- lastItem = 0;
- if (buf == null) {
- buf = new StringBuffer();
- } else {
- buf.setLength(0);
- }
- boolean ok = false;
- while (!chars.atEnd()) {
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '}' && !literal) {
- ok = true;
- break;
- }
- UTF16.append(buf, c);
- }
- if (buf.length() < 1 || !ok) {
- syntaxError(chars, "Invalid multicharacter string");
- }
- // We have new string. Add it to set and continue;
- // we don't need to drop through to the further
- // processing
- add(buf.toString());
- patBuf.append('{');
- _appendToPat(patBuf, buf.toString(), false);
- patBuf.append('}');
- continue;
- case SymbolTable.SYMBOL_REF:
- // symbols nosymbols
- // [a-$] error error (ambiguous)
- // [a$] anchor anchor
- // [a-$x] var "x"* literal '$'
- // [a-$.] error literal '$'
- // *We won't get here in the case of var "x"
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
- boolean anchor = (c == ']' && !literal);
- if (symbols == null && !anchor) {
- c = SymbolTable.SYMBOL_REF;
- chars.setPos(backup);
- break; // literal '$'
- }
- if (anchor && op == 0) {
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- add_unchecked(UnicodeMatcher.ETHER);
- usePat = true;
- patBuf.append(SymbolTable.SYMBOL_REF).append(']');
- mode = 2;
- continue;
- }
- syntaxError(chars, "Unquoted '$'");
- default:
- break;
- }
- }
-
- // -------- Parse literal characters. This includes both
- // escaped chars ("\u4E01") and non-syntax characters
- // ("a").
-
- switch (lastItem) {
- case 0:
- lastItem = 1;
- lastChar = c;
- break;
- case 1:
- if (op == '-') {
- if (lastChar >= c) {
- // Don't allow redundant (a-a) or empty (b-a) ranges;
- // these are most likely typos.
- syntaxError(chars, "Invalid range");
- }
- add_unchecked(lastChar, c);
- _appendToPat(patBuf, lastChar, false);
- patBuf.append(op);
- _appendToPat(patBuf, c, false);
- lastItem = op = 0;
- } else {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastChar = c;
- }
- break;
- case 2:
- if (op != 0) {
- syntaxError(chars, "Set expected after operator");
- }
- lastChar = c;
- lastItem = 1;
- break;
- }
- }
-
- if (mode != 2) { // the loop ended without reaching the state "after ]"
- syntaxError(chars, "Missing ']'");
- }
-
- chars.skipIgnored(opts);
-
- /**
- * Handle global flags (invert, case insensitivity). If this
- * pattern should be compiled case-insensitive, then we need
- * to close over case BEFORE COMPLEMENTING. This makes
- * patterns like /[^abc]/i work.
- */
- if ((options & CASE) != 0) {
- closeOver(CASE);
- }
- if (invert) {
- complement();
- }
-
- // Use the rebuilt pattern (pat) only if necessary. Prefer the
- // generated pattern.
- if (usePat) {
- rebuiltPat.append(patBuf.toString());
- } else {
- _generatePattern(rebuiltPat, false, true);
- }
- }
-
-    // Reports a pattern syntax error. Always throws
-    // IllegalArgumentException; the message combines msg with the
-    // escaped string form of the iterator.
-    private static void syntaxError(RuleCharacterIterator chars, String msg) {
-        final String where = Utility.escape(chars.toString());
-        throw new IllegalArgumentException("Error: " + msg + " at \"" + where + '"');
-    }
-
-    /**
-     * Add the contents of the UnicodeSet (as strings) into a collection.
-     * Each element produced by a UnicodeSetIterator over this set is
-     * added in its string form.
-     * @param target collection to add into
-     * @stable ICU 2.8
-     */
-    public void addAllTo(Collection target) {
-        for (UnicodeSetIterator walker = new UnicodeSetIterator(this); walker.next(); ) {
-            target.add(walker.getString());
-        }
-    }
-
-    /**
-     * Add the contents of the collection (as strings) into this UnicodeSet.
-     * Each element is converted with toString() and passed to add(String).
-     * @param source the collection to add
-     * @stable ICU 2.8
-     */
-    public void addAll(Collection source) {
-        checkFrozen();
-        for (Iterator items = source.iterator(); items.hasNext(); ) {
-            final Object item = items.next();
-            add(item.toString());
-        }
-    }
-
- //----------------------------------------------------------------
- // Implementation: Utility methods
- //----------------------------------------------------------------
-
-    // Grows list[] so it can hold at least newLen entries, keeping the
-    // first len values. GROW_EXTRA slots of headroom are added.
-    private void ensureCapacity(int newLen) {
-        if (list.length >= newLen) {
-            return;
-        }
-        final int[] grown = new int[newLen + GROW_EXTRA];
-        System.arraycopy(list, 0, grown, 0, len);
-        list = grown;
-    }
-
-    // Makes sure the scratch buffer[] holds at least newLen entries.
-    // Existing buffer contents are not preserved on reallocation.
-    private void ensureBufferCapacity(int newLen) {
-        final boolean bigEnough = (buffer != null) && (buffer.length >= newLen);
-        if (!bigEnough) {
-            buffer = new int[newLen + GROW_EXTRA];
-        }
-    }
-
-    /**
-     * Returns the reusable three-entry inversion list
-     * { start, end+1, HIGH } describing the single range start..end.
-     * The same array is handed out on every call, so a previous result
-     * is overwritten. Assumes start <= end.
-     */
-    private int[] range(int start, int end) {
-        if (rangeList != null) {
-            // Reuse the cached array; its last slot is left untouched.
-            rangeList[0] = start;
-            rangeList[1] = end+1;
-            return rangeList;
-        }
-        rangeList = new int[] { start, end+1, HIGH };
-        return rangeList;
-    }
-
- //----------------------------------------------------------------
- // Implementation: Fundamental operations
- //----------------------------------------------------------------
-
- // Merges list[] with other[] into buffer[], dropping values present in both. polarity = 0, 3 is normal: x xor y
- // polarity = 1, 2: x xor ~y == x === y
-
- private UnicodeSet xor(int[] other, int otherLen, int polarity) {
- ensureBufferCapacity(len + otherLen); // the merged result cannot have more than len + otherLen entries
- int i = 0, j = 0, k = 0;
- int a = list[i++];
- int b;
- if (polarity == 1 || polarity == 2) {
- b = LOW;
- if (other[j] == LOW) { // skip base if already LOW
- ++j;
- b = other[j]; // NOTE(review): j is not advanced past this entry, so the next other[j++] reads it again -- confirm this is intended
- }
- } else {
- b = other[j++];
- }
- // simplest of all the routines
- // sort the values, discarding identicals!
- while (true) {
- if (a < b) {
- buffer[k++] = a;
- a = list[i++];
- } else if (b < a) {
- buffer[k++] = b;
- b = other[j++];
- } else if (a != HIGH) { // at this point, a == b
- // discard both values!
- a = list[i++];
- b = other[j++];
- } else { // DONE!
- buffer[k++] = HIGH;
- len = k;
- break;
- }
- }
- // swap list and buffer
- int[] temp = list;
- list = buffer;
- buffer = temp;
- pat = null; // the contents changed, so the stored pattern string is dropped
- return this;
- }
-
- // polarity = 0 is normal: x union y
- // polarity = 2: x union ~y
- // polarity = 1: ~x union y
- // polarity = 3: ~x union ~y
-
- /**
- * Replaces this set's inversion list with the union of <code>list</code>
- * and <code>other</code>, each optionally complemented according to
- * <code>polarity</code> (see the table above). The merged result is
- * written to <code>buffer</code>, which is then swapped with
- * <code>list</code>. Assumes both lists are sorted and end with HIGH.
- *
- * While merging, bit 1 of <code>polarity</code> records whether the next
- * value <code>a</code> from <code>list</code> is the second value of a
- * pair, and bit 2 records the same for <code>b</code> from
- * <code>other</code>; each bit is toggled whenever the corresponding
- * list advances.
- *
- * @param other the inversion list to merge in
- * @param otherLen length of <code>other</code>; used only to size the buffer
- * @param polarity initial polarity, 0..3
- * @return this object
- */
- private UnicodeSet add(int[] other, int otherLen, int polarity) {
- ensureBufferCapacity(len + otherLen);
- int i = 0, j = 0, k = 0;
- int a = list[i++];
- int b = other[j++];
- // change from xor is that we have to check overlapping pairs
- // polarity bit 1 means a is second, bit 2 means b is.
- main:
- while (true) {
- switch (polarity) {
- case 0: // both first; take lower if unequal
- if (a < b) { // take a
- // Back up over overlapping ranges in buffer[]
- if (k > 0 && a <= buffer[k-1]) {
- // Pick latter end value in buffer[] vs. list[]
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
- buffer[k++] = a;
- a = list[i];
- }
- i++; // Common if/else code factored out
- polarity ^= 1;
- } else if (b < a) { // take b
- // Same back-up logic as above, applied to other[]
- if (k > 0 && b <= buffer[k-1]) {
- b = max(other[j], buffer[--k]);
- } else {
- buffer[k++] = b;
- b = other[j];
- }
- j++;
- polarity ^= 2;
- } else { // a == b, take a, drop b
- if (a == HIGH) break main;
- // This is symmetrical; it doesn't matter if
- // we backtrack with a or b. - liu
- if (k > 0 && a <= buffer[k-1]) {
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
- buffer[k++] = a;
- a = list[i];
- }
- i++;
- polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take higher if unequal, and drop other
- if (b <= a) { // take a
- if (a == HIGH) break main;
- buffer[k++] = a;
- } else { // take b
- if (b == HIGH) break main;
- buffer[k++] = b;
- }
- a = list[i++]; polarity ^= 1; // factored common code
- b = other[j++]; polarity ^= 2;
- break;
- case 1: // a second, b first; if b < a, overlap
- if (a < b) { // no overlap, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- }
- }
- buffer[k++] = HIGH; // terminate
- len = k;
- // swap list and buffer
- int[] temp = list;
- list = buffer;
- buffer = temp;
- // The cached pattern no longer describes the set.
- pat = null;
- return this;
- }
-
- // polarity = 0 is normal: x intersect y
- // polarity = 2: x intersect ~y == set-minus
- // polarity = 1: ~x intersect y
- // polarity = 3: ~x intersect ~y
-
- /**
- * Replaces this set's inversion list with the intersection of
- * <code>list</code> and <code>other</code>, each optionally complemented
- * according to <code>polarity</code> (see the table above). The merged
- * result is written to <code>buffer</code>, which is then swapped with
- * <code>list</code>. Assumes both lists are sorted and end with HIGH.
- *
- * As in the union routine, bit 1 of <code>polarity</code> is toggled each
- * time <code>list</code> advances and bit 2 each time <code>other</code>
- * advances.
- *
- * @param other the inversion list to intersect with
- * @param otherLen length of <code>other</code>; used only to size the buffer
- * @param polarity initial polarity, 0..3
- * @return this object
- */
- private UnicodeSet retain(int[] other, int otherLen, int polarity) {
- ensureBufferCapacity(len + otherLen);
- int i = 0, j = 0, k = 0;
- int a = list[i++];
- int b = other[j++];
- // change from xor is that we have to check overlapping pairs
- // polarity bit 1 means a is second, bit 2 means b is.
- main:
- while (true) {
- switch (polarity) {
- case 0: // both first; drop the smaller
- if (a < b) { // drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take lower if unequal
- if (a < b) { // take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 1: // a second, b first;
- if (a < b) { // NO OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, drop b
- b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- }
- }
- buffer[k++] = HIGH; // terminate
- len = k;
- // swap list and buffer
- int[] temp = list;
- list = buffer;
- buffer = temp;
- // The cached pattern no longer describes the set.
- pat = null;
- return this;
- }
-
- /** Returns the larger of the two arguments. */
- private static final int max(int a, int b) {
-     return Math.max(a, b);
- }
-
- //----------------------------------------------------------------
- // Generic filter-based scanning code
- //----------------------------------------------------------------
-
- /**
-  * Predicate over code points, implemented by the property filters below.
-  */
- private interface Filter {
-     /** Returns true if the given code point passes this filter. */
-     boolean contains(int codePoint);
- }
-
- private static class NumericValueFilter implements Filter {
- double value;
- NumericValueFilter(double value) { this.value = value; }
- public boolean contains(int ch) {
- return UCharacter.getUnicodeNumericValue(ch) == value;
- }
- }
-
- private static class GeneralCategoryMaskFilter implements Filter {
- int mask;
- GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
- public boolean contains(int ch) {
- return ((1 << UCharacter.getType(ch)) & mask) != 0;
- }
- }
-
- private static class IntPropertyFilter implements Filter {
- int prop;
- int value;
- IntPropertyFilter(int prop, int value) {
- this.prop = prop;
- this.value = value;
- }
- public boolean contains(int ch) {
- return UCharacter.getIntPropertyValue(ch, prop) == value;
- }
- }
-
- // VersionInfo for unassigned characters: version 0.0.0.0.
- // VersionFilter compares against this object by reference.
- static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
-
- /**
-  * Accepts code points whose age is assigned (not <code>NO_VERSION</code>)
-  * and is less than or equal to a given version.
-  */
- private static class VersionFilter implements Filter {
-     VersionInfo version;
-
-     VersionFilter(VersionInfo version) {
-         this.version = version;
-     }
-
-     public boolean contains(int ch) {
-         VersionInfo age = UCharacter.getAge(ch);
-         // Reference comparison ok; VersionInfo caches and reuses
-         // unique objects.
-         if (age == NO_VERSION) {
-             return false;
-         }
-         return age.compareTo(version) <= 0;
-     }
- }
-
- /**
-  * Returns the inclusions set for the given property source, building it
-  * on first request and caching it in <code>INCLUSIONS</code> afterwards.
-  * The set is filled by the <code>addPropertyStarts</code> routine(s) of
-  * the data source selected by <code>src</code>.
-  *
-  * @param src one of the <code>UCharacterProperty.SRC_*</code> constants
-  * @return the cached inclusions set for <code>src</code>
-  * @throws MissingResourceException if the case or bidi data cannot be
-  *         loaded; the underlying <code>IOException</code> is its cause
-  * @throws IllegalStateException if <code>src</code> is not a handled source
-  */
- private static synchronized UnicodeSet getInclusions(int src) {
-     if (INCLUSIONS == null) {
-         INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
-     }
-     if (INCLUSIONS[src] == null) {
-         UnicodeSet incl = new UnicodeSet();
-         switch (src) {
-         case UCharacterProperty.SRC_CHAR:
-             UCharacterProperty.getInstance().addPropertyStarts(incl);
-             break;
-         case UCharacterProperty.SRC_PROPSVEC:
-             UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
-             break;
-         case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
-             UCharacterProperty.getInstance().addPropertyStarts(incl);
-             UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
-             break;
-         case UCharacterProperty.SRC_HST:
-             UCharacterProperty.getInstance().uhst_addPropertyStarts(incl);
-             break;
-         case UCharacterProperty.SRC_NORM:
-             NormalizerImpl.addPropertyStarts(incl);
-             break;
-         case UCharacterProperty.SRC_CASE:
-             try {
-                 UCaseProps.getSingleton().addPropertyStarts(incl);
-             } catch (IOException e) {
-                 throw missingInclusionData(e);
-             }
-             break;
-         case UCharacterProperty.SRC_BIDI:
-             try {
-                 UBiDiProps.getSingleton().addPropertyStarts(incl);
-             } catch (IOException e) {
-                 throw missingInclusionData(e);
-             }
-             break;
-         default:
-             throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
-         }
-         // Cache only after the set was built successfully.
-         INCLUSIONS[src] = incl;
-     }
-     return INCLUSIONS[src];
- }
-
- /**
-  * Wraps a data-loading failure in a MissingResourceException while keeping
-  * the original exception as the cause, so its stack trace is not lost.
-  */
- private static MissingResourceException missingInclusionData(IOException e) {
-     MissingResourceException wrapped = new MissingResourceException(e.getMessage(),"","");
-     wrapped.initCause(e);
-     return wrapped;
- }
-
- /**
- * Generic filter-based scanning code for UCD property UnicodeSets.
- */
- private UnicodeSet applyFilter(Filter filter, int src) {
- // Walk through all Unicode characters, noting the start
- // and end of each range for which filter.contain(c) is
- // true. Add each range to a set.
- //
- // To improve performance, use the INCLUSIONS set, which
- // encodes information about character ranges that are known
- // to have identical properties, such as the CJK Ideographs
- // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
- // except the first characters of such ranges.
- //
- // TODO Where possible, instead of scanning over code points,
- // use internal property data to initialize UnicodeSets for
- // those properties. Scanning code points is slow.
-
- clear();
-
- int startHasProperty = -1;
- UnicodeSet inclusions = getInclusions(src);
- int limitRange = inclusions.getRangeCount();
-
- for (int j=0; jUnicodeSet
supports two APIs. The first is the
+ * operand API that allows the caller to modify the value of
+ * a <code>UnicodeSet</code> object. It conforms to Java 2's
+ * <code>java.util.Set</code> interface, although
+ * <code>UnicodeSet</code> does not actually implement that
+ * interface. All methods of <code>Set</code> are supported, with the
+ * modification that they take a character range or single character
+ * instead of an <code>Object</code>, and they take a
+ * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
+ * operand API may be thought of in terms of boolean logic: a boolean
+ * OR is implemented by <code>add</code>, a boolean AND is implemented
+ * by <code>retain</code>, a boolean XOR is implemented by
+ * <code>complement</code> taking an argument, and a boolean NOT is
+ * implemented by <code>complement</code> with no argument. In terms
+ * of traditional set theory function names, <code>add</code> is a
+ * union, <code>retain</code> is an intersection, <code>remove</code>
+ * is an asymmetric difference, and <code>complement</code> with no
+ * argument is a set complement with respect to the superset range
+ * <code>MIN_VALUE-MAX_VALUE</code>.
+ *
+ * <p>The second API is the
+ * <code>applyPattern()</code>/<code>toPattern()</code> API from the
+ * <code>java.text.Format</code>-derived classes. Unlike the
+ * methods that add characters, add categories, and control the logic
+ * of the set, the method <code>applyPattern()</code> sets all
+ * attributes of a <code>UnicodeSet</code> at once, based on a
+ * string pattern.
+ *
+ * <p>Patterns are accepted by the constructors and the
+ * <code>applyPattern()</code> methods and returned by the
+ * <code>toPattern()</code> method. These patterns follow a syntax
+ * similar to that employed by version 8 regular expression character
+ * classes. Here are some simple examples:
+ *
+ *
+ *
+ *
+ * Any character may be preceded by a backslash in order to remove any special
+ * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
+ * ignored, unless they are escaped.
+ *
+ *
+ *
+ *
+ *
+ * []
No characters
+ *
+ *
+ * [a]
The character 'a'
+ *
+ *
+ *
+ * [ae]
The characters 'a' and 'e'
+ *
+ *
+ *
+ * [a-e]
The characters 'a' through 'e' inclusive, in Unicode code
+ * point order
+ *
+ *
+ *
+ * [\\u4E01]
The character U+4E01
+ *
+ *
+ *
+ * [a{ab}{ac}]
The character 'a' and the multicharacter strings "ab" and
+ * "ac"
+ *
+ *
+ *
+ * [\p{Lu}]
All characters in the general category Uppercase Letter
+ *
+ *
+ *
+ * [a]
The set containing 'a'
+ * [a-z]
The set containing 'a'
+ * through 'z' and all letters in between, in Unicode order
+ * [^a-z]
The set containing
+ * all characters but 'a' through 'z',
+ * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
+ * [[pat1][pat2]]
+ * The union of sets specified by pat1 and pat2
+ * [[pat1]&[pat2]]
+ * The intersection of sets specified by pat1 and pat2
+ * [[pat1]-[pat2]]
+ * The asymmetric difference of sets specified by pat1 and
+ * pat2
+ * [:Lu:] or \p{Lu}
+ * The set of characters having the specified
+ * Unicode property; in
+ * this case, Unicode uppercase letters
+ * [:^Lu:] or \P{Lu}
+ * The set of characters not having the given
+ * Unicode property
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ * pattern :=
+ * ('[' '^'? item* ']') |
+ * property
+ *
+ *
+ * item :=
+ * char | (char '-' char) | pattern-expr
+ *
+ *
+ *
+ * pattern-expr :=
+ * pattern | pattern-expr pattern |
+ * pattern-expr op pattern
+ *
+ *
+ *
+ * op :=
+ * '&' | '-'
+ *
+ *
+ *
+ * special :=
+ * '[' | ']' | '-'
+ *
+ *
+ *
+ * char :=
any character that is not
+ * special
any character
+ * | ('\\' )
+ * | ('\u' hex hex hex hex)
+ *
+ *
+ *
+ * hex :=
any character for which
+ *
+ * Character.digit(c, 16)
+ * returns a non-negative result
+ *
+ *
+ * property :=
a Unicode property set pattern
+ *
+ *
+ *
+ *
+ *
+ * Legend:
+ *
+ *
+ *
+ *
+ *
+ * a := b
+ *
+ * a
may be replaced by b
+ *
+ *
+ * a?
+ * zero or one instance of
+ * a
+ *
+ *
+ *
+ * a*
+ * zero or more instances of
+ * a
+ *
+ *
+ *
+ * a | b
+ * either
+ * a
or b
+ *
+ *
+ *
+ * 'a'
+ * the literal string between the quotes
+ * If <code>start &gt; end</code> then an empty set is created.
+ *
+ * @param start first character, inclusive, of range
+ * @param end last character, inclusive, of range
+ * @stable ICU 2.0
+ */
+ public UnicodeSet(int start, int end) {
+ // Start from the set built by the no-argument constructor
+ // (presumably empty -- confirm against that constructor).
+ this();
+ // complement() toggles membership of start..end; it does nothing
+ // when start > end, which is why such a call yields no range.
+ complement(start, end);
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language. Whitespace is ignored.
+ * @param pattern a string specifying what characters are in the set
+ * @exception java.lang.IllegalArgumentException if the pattern contains
+ * a syntax error.
+ * @stable ICU 2.0
+ */
+ public UnicodeSet(String pattern) {
+     // Whole-string parse, no symbol table, whitespace ignored.
+     this(pattern, null, null, IGNORE_SPACE);
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param ignoreWhitespace if true, ignore characters for which
+ * UCharacterProperty.isRuleWhiteSpace() returns true
+ * @exception java.lang.IllegalArgumentException if the pattern contains
+ * a syntax error.
+ * @stable ICU 2.0
+ */
+ public UnicodeSet(String pattern, boolean ignoreWhitespace) {
+     // Translate the flag into the options bitmask used by the parser.
+     this(pattern, null, null, ignoreWhitespace ? IGNORE_SPACE : 0);
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param options a bitmask indicating which options to apply.
+ * Valid options are IGNORE_SPACE and CASE.
+ * @exception java.lang.IllegalArgumentException if the pattern contains
+ * a syntax error.
+ * @stable ICU 3.8
+ */
+ public UnicodeSet(String pattern, int options) {
+     // Whole-string parse with no symbol table and caller-supplied options.
+     this(pattern, null, null, options);
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param pos on input, the position in pattern at which to start parsing.
+ * On output, the position after the last character parsed.
+ * @param symbols a symbol table mapping variables to char[] arrays
+ * and chars to UnicodeSets
+ * @exception java.lang.IllegalArgumentException if the pattern
+ * contains a syntax error.
+ * @stable ICU 2.0
+ */
+ public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols) {
+     // Same as the four-argument form with whitespace ignored.
+     this(pattern, pos, symbols, IGNORE_SPACE);
+ }
+
+ /**
+ * Constructs a set from the given pattern. See the class description
+ * for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param pos on input, the position in pattern at which to start parsing.
+ * On output, the position after the last character parsed.
+ * @param symbols a symbol table mapping variables to char[] arrays
+ * and chars to UnicodeSets
+ * @param options a bitmask indicating which options to apply.
+ * Valid options are IGNORE_SPACE and CASE.
+ * @exception java.lang.IllegalArgumentException if the pattern
+ * contains a syntax error.
+ * @stable ICU 3.2
+ */
+ public UnicodeSet(String pattern, ParsePosition pos, SymbolTable symbols, int options) {
+ // Initialize via the no-argument constructor, then let the pattern
+ // parser populate the set; all arguments are passed through unchanged.
+ this();
+ applyPattern(pattern, pos, symbols, options);
+ }
+
+
+ /**
+  * Return a new set that is equivalent to this one. The copy is made with
+  * the copy constructor and carries over this set's frozen flag.
+  * @stable ICU 2.0
+  */
+ public Object clone() {
+     UnicodeSet copy = new UnicodeSet(this);
+     copy.frozen = frozen;
+     return copy;
+ }
+
+ /**
+ * Make this object represent the range <code>start - end</code>.
+ * If <code>start &gt; end</code> then this object is set to an
+ * empty range.
+ *
+ * @param start first character in the set, inclusive
+ * @param end last character in the set, inclusive
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public UnicodeSet set(int start, int end) {
+ checkFrozen();
+ // Empty the set, then toggle the requested range on. complement()
+ // is a no-op when start > end, leaving the set empty.
+ clear();
+ complement(start, end);
+ return this;
+ }
+
+ /**
+ * Make this object represent the same set as <code>other</code>.
+ * The inversion list and the string set are cloned, so later changes
+ * to <code>other</code> do not affect this object.
+ * @param other a <code>UnicodeSet</code> whose value will be
+ * copied to this object
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public UnicodeSet set(UnicodeSet other) {
+ checkFrozen();
+ // Clones the whole backing array, not just the first len entries.
+ list = (int[]) other.list.clone();
+ len = other.len;
+ // The cached pattern string is shared as is.
+ pat = other.pat;
+ strings = (TreeSet)other.strings.clone();
+ return this;
+ }
+
+ /**
+ * Modifies this set to represent the set specified by the given pattern.
+ * See the class description for the syntax of the pattern language.
+ * Whitespace is ignored.
+ * @param pattern a string specifying what characters are in the set
+ * @exception java.lang.IllegalArgumentException if the pattern
+ * contains a syntax error.
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet applyPattern(String pattern) {
+ checkFrozen();
+ // No parse position, no symbol table; whitespace is ignored.
+ return applyPattern(pattern, null, null, IGNORE_SPACE);
+ }
+
+ /**
+ * Modifies this set to represent the set specified by the given pattern,
+ * optionally ignoring whitespace.
+ * See the class description for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param ignoreWhitespace if true then characters for which
+ * UCharacterProperty.isRuleWhiteSpace() returns true are ignored
+ * @exception java.lang.IllegalArgumentException if the pattern
+ * contains a syntax error.
+ * @stable ICU 2.0
+ */
+ public UnicodeSet applyPattern(String pattern, boolean ignoreWhitespace) {
+     checkFrozen();
+     // Translate the flag into the options bitmask used by the parser.
+     int options;
+     if (ignoreWhitespace) {
+         options = IGNORE_SPACE;
+     } else {
+         options = 0;
+     }
+     return applyPattern(pattern, null, null, options);
+ }
+
+ /**
+ * Modifies this set to represent the set specified by the given pattern,
+ * optionally ignoring whitespace.
+ * See the class description for the syntax of the pattern language.
+ * @param pattern a string specifying what characters are in the set
+ * @param options a bitmask indicating which options to apply.
+ * Valid options are IGNORE_SPACE and CASE.
+ * @exception java.lang.IllegalArgumentException if the pattern
+ * contains a syntax error.
+ * @stable ICU 3.8
+ */
+ public UnicodeSet applyPattern(String pattern, int options) {
+ checkFrozen();
+ // No parse position and no symbol table; options are passed through.
+ return applyPattern(pattern, null, null, options);
+ }
+
+ /**
+ * Return true if the given position, in the given pattern, appears
+ * to be the start of a UnicodeSet pattern.
+ * @stable ICU 2.0
+ */
+ public static boolean resemblesPattern(String pattern, int pos) {
+ return ((pos+1) < pattern.length() &&
+ pattern.charAt(pos) == '[') ||
+ resemblesPropertyPattern(pattern, pos);
+ }
+
+ /**
+  * Append the <code>toPattern()</code> representation of a
+  * string to the given <code>StringBuffer</code>. The string is walked
+  * code point by code point, so a supplementary character (surrogate
+  * pair) is appended exactly once.
+  *
+  * @param buf the buffer to append to
+  * @param s the string whose code points are appended
+  * @param escapeUnprintable passed through to the per-character overload
+  */
+ private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
+     int cp;
+     // Advance by the width of the code point just read. The step must be
+     // computed from the code point, not from the index: an index is always
+     // below 0x10000, so getCharCount(i) would always be 1.
+     for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+         cp = UTF16.charAt(s, i);
+         _appendToPat(buf, cp, escapeUnprintable);
+     }
+ }
+
+ /**
+ * Append the toPattern()
representation of a
+ * character to the given StringBuffer
.
+ */
+ private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
+ if (escapeUnprintable && Utility.isUnprintable(c)) {
+ // Use hex escape notation (charAt()
.
+ * @return an index from 0..size()-1, or -1
+ * @stable ICU 2.0
+ */
+ public int indexOf(int c) {
+     if (c < MIN_VALUE || c > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
+     }
+     // Walk the inversion list one [start, limit) pair at a time, counting
+     // the members of every range that lies wholly below c.
+     int pos = 0;
+     int preceding = 0;
+     while (true) {
+         int rangeStart = list[pos];
+         if (c < rangeStart) {
+             // c falls in a gap before this range: not in the set.
+             return -1;
+         }
+         int rangeLimit = list[pos + 1];
+         if (c < rangeLimit) {
+             return preceding + c - rangeStart;
+         }
+         preceding += rangeLimit - rangeStart;
+         pos += 2;
+     }
+ }
+
+ /**
+  * Returns the character at the given index within this set, where
+  * the set is ordered by ascending code point. If the index is
+  * out of range, return -1. The inverse of this method is
+  * <code>indexOf()</code>.
+  * @param index an index from 0..size()-1
+  * @return the character at the given index, or -1.
+  * @stable ICU 2.0
+  */
+ public int charAt(int index) {
+     if (index < 0) {
+         return -1;
+     }
+     // pairedLen is the largest even integer <= len: len itself when len
+     // is even, len-1 when it is odd. With an odd len the last entry is
+     // the HIGH terminator and is not part of a pair.
+     int pairedLen = len & ~1;
+     int remaining = index;
+     for (int i = 0; i < pairedLen; i += 2) {
+         int start = list[i];
+         int count = list[i + 1] - start;
+         if (remaining < count) {
+             return start + remaining;
+         }
+         remaining -= count;
+     }
+     return -1;
+ }
+
+ /**
+ * Adds the specified range to this set if it is not already
+ * present. If this set already contains the specified range,
+ * the call leaves this set unchanged. If <code>start &gt; end</code>
+ * then an empty range is added, leaving the set unchanged.
+ *
+ * @param start first character, inclusive, of range to be added
+ * to this set.
+ * @param end last character, inclusive, of range to be added
+ * to this set.
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public UnicodeSet add(int start, int end) {
+ // Reject modification of a frozen set, then do the real work.
+ checkFrozen();
+ return add_unchecked(start, end);
+ }
+
+ // For internal use, after checkFrozen has been called.
+ // Validates both end points, then adds the range start..end;
+ // nothing is added when start > end.
+ private UnicodeSet add_unchecked(int start, int end) {
+     if (start < MIN_VALUE || start > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+     }
+     if (end < MIN_VALUE || end > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+     }
+     if (start == end) {
+         // Single code point: use the single-character add.
+         add(start);
+     } else if (start < end) {
+         add(range(start, end), 2, 0);
+     }
+     return this;
+ }
+
+// /**
+// * Format out the inversion list as a string, for debugging. Uncomment when
+// * needed.
+// */
+// public final String dump() {
+// StringBuffer buf = new StringBuffer("[");
+// for (int i=0; i
Warning: you cannot add an empty string ("") to a UnicodeSet.
+ * @param s the source string
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet add(String s) {
+     checkFrozen();
+     int cp = getSingleCP(s);
+     if (cp >= 0) {
+         // The string is exactly one code point: store it as a character.
+         add_unchecked(cp, cp);
+         return this;
+     }
+     // Genuine multicharacter string: keep it in the string set and
+     // discard the cached pattern.
+     strings.add(s);
+     pat = null;
+     return this;
+ }
+
+ /**
+ * @return a code point IF the string consists of a single one.
+ * otherwise returns -1.
+ * @param string to test
+ */
+ private static int getSingleCP(String s) {
+ if (s.length() < 1) {
+ throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
+ }
+ if (s.length() > 2) return -1;
+ if (s.length() == 1) return s.charAt(0);
+
+ // at this point, len = 2
+ int cp = UTF16.charAt(s, 0);
+ if (cp > 0xFFFF) { // is surrogate pair
+ return cp;
+ }
+ return -1;
+ }
+
+ /**
+ * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
+ * If this set already any particular character, it has no effect on that character.
+ * @param s the source string
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet addAll(String s) {
+ checkFrozen();
+ int cp;
+ for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+ cp = UTF16.charAt(s, i);
+ add_unchecked(cp, cp);
+ }
+ return this;
+ }
+
+ /**
+ * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * Delegates to <code>retainAll(UnicodeSet)</code> with the set that
+ * <code>fromAll(s)</code> builds from the characters of the string.
+ * @param s the source string
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet retainAll(String s) {
+ return retainAll(fromAll(s));
+ }
+
+ /**
+ * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * Delegates to <code>complementAll(UnicodeSet)</code> with the set that
+ * <code>fromAll(s)</code> builds from the characters of the string.
+ * @param s the source string
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet complementAll(String s) {
+ return complementAll(fromAll(s));
+ }
+
+ /**
+ * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
+ * Delegates to <code>removeAll(UnicodeSet)</code> with the set that
+ * <code>fromAll(s)</code> builds from the characters of the string.
+ * @param s the source string
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet removeAll(String s) {
+ return removeAll(fromAll(s));
+ }
+
+ /**
+  * Remove all strings from this UnicodeSet. Single characters and ranges
+  * are left alone; the cached pattern is discarded only if a string was
+  * actually removed.
+  * @return this object, for chaining
+  * @draft ICU 4.2
+  * @provisional This API might change or be removed in a future release.
+  */
+ public final UnicodeSet removeAllStrings() {
+     checkFrozen();
+     if (!strings.isEmpty()) {
+         strings.clear();
+         pat = null;
+     }
+     return this;
+ }
+
+ /**
+ * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
+ *
Warning: you cannot add an empty string ("") to a UnicodeSet.
+ * @param s the source string
+ * @return a newly created set containing the given string
+ * @stable ICU 2.0
+ */
+ public static UnicodeSet from(String s) {
+ return new UnicodeSet().add(s);
+ }
+
+
+ /**
+ * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
+ * @param s the source string
+ * @return a newly created set containing the given characters
+ * @stable ICU 2.0
+ */
+ public static UnicodeSet fromAll(String s) {
+ return new UnicodeSet().addAll(s);
+ }
+
+
+ /**
+  * Retain only the elements in this set that are contained in the
+  * specified range. If <code>start &gt; end</code> then an empty range is
+  * retained, leaving the set empty.
+  *
+  * @param start first character, inclusive, of range to be retained
+  * to this set.
+  * @param end last character, inclusive, of range to be retained
+  * to this set.
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public UnicodeSet retain(int start, int end) {
+     checkFrozen();
+     if (start < MIN_VALUE || start > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+     }
+     if (end < MIN_VALUE || end > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+     }
+     if (start > end) {
+         // Intersecting with an empty range leaves nothing.
+         clear();
+     } else {
+         retain(range(start, end), 2, 0);
+     }
+     return this;
+ }
+
+ /**
+ * Retain the specified character from this set if it is present.
+ * Upon return this set will be empty if it did not contain c, or
+ * will only contain c if it did contain c.
+ * @param c the character to be retained
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet retain(int c) {
+ // A single character is the one-element range c..c.
+ return retain(c, c);
+ }
+
+ /**
+  * Retain the specified string in this set if it is present.
+  * Upon return this set will be empty if it did not contain s, or
+  * will only contain s if it did contain s.
+  * @param s the string to be retained
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public final UnicodeSet retain(String s) {
+     int cp = getSingleCP(s);
+     if (cp < 0) {
+         // Same frozen-set guard as add(String) and complement(String).
+         checkFrozen();
+         boolean isIn = strings.contains(s);
+         if (isIn && size() == 1) {
+             // The set already consists of exactly this string.
+             return this;
+         }
+         clear();
+         // Put the string back only if it was a member; retaining a
+         // string that was absent must leave the set empty.
+         if (isIn) {
+             strings.add(s);
+         }
+         pat = null;
+     } else {
+         retain(cp, cp);
+     }
+     return this;
+ }
+
+ /**
+  * Removes the specified range from this set if it is present.
+  * The set will not contain the specified range once the call
+  * returns. If <code>start &gt; end</code> then an empty range is
+  * removed, leaving the set unchanged.
+  *
+  * @param start first character, inclusive, of range to be removed
+  * from this set.
+  * @param end last character, inclusive, of range to be removed
+  * from this set.
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public UnicodeSet remove(int start, int end) {
+     checkFrozen();
+     if (start < MIN_VALUE || start > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+     }
+     if (end < MIN_VALUE || end > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+     }
+     if (end >= start) {
+         // Set-minus: intersect with the complement of the range (polarity 2).
+         retain(range(start, end), 2, 2);
+     }
+     return this;
+ }
+
+ /**
+ * Removes the specified character from this set if it is present.
+ * The set will not contain the specified character once the call
+ * returns.
+ * @param c the character to be removed
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet remove(int c) {
+ // A single character is the one-element range c..c.
+ return remove(c, c);
+ }
+
+ /**
+  * Removes the specified string from this set if it is present.
+  * The set will not contain the specified string once the call
+  * returns.
+  * @param s the string to be removed
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public final UnicodeSet remove(String s) {
+     int cp = getSingleCP(s);
+     if (cp < 0) {
+         // The string set is modified directly here, so the frozen check
+         // must be made explicitly, as add(String) and complement(String) do.
+         checkFrozen();
+         strings.remove(s);
+         pat = null;
+     } else {
+         // Single code point: remove(int, int) performs its own frozen check.
+         remove(cp, cp);
+     }
+     return this;
+ }
+
+ /**
+  * Complements the specified range in this set. Any character in
+  * the range will be removed if it is in this set, or will be
+  * added if it is not in this set. If <code>start &gt; end</code>
+  * then an empty range is complemented, leaving the set unchanged.
+  *
+  * @param start first character, inclusive, of range to be complemented
+  * in this set.
+  * @param end last character, inclusive, of range to be complemented
+  * in this set.
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public UnicodeSet complement(int start, int end) {
+     checkFrozen();
+     if (start < MIN_VALUE || start > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+     }
+     if (end < MIN_VALUE || end > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+     }
+     if (end >= start) {
+         xor(range(start, end), 2, 0);
+     }
+     // The cached pattern is discarded even when nothing was toggled.
+     pat = null;
+     return this;
+ }
+
+ /**
+ * Complements the specified character in this set. The character
+ * will be removed if it is in this set, or will be added if it is
+ * not in this set.
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet complement(int c) {
+ // A single character is the one-element range c..c.
+ return complement(c, c);
+ }
+
+ /**
+  * This is equivalent to
+  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
+  * The inversion list is complemented in place by toggling a leading
+  * <code>LOW</code> entry.
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public UnicodeSet complement() {
+     checkFrozen();
+     if (list[0] != LOW) {
+         // No leading LOW: shift everything up by one and insert it.
+         ensureCapacity(len + 1);
+         System.arraycopy(list, 0, list, 1, len);
+         list[0] = LOW;
+         len++;
+     } else {
+         // Leading LOW present: shift everything down by one to drop it.
+         System.arraycopy(list, 1, list, 0, len - 1);
+         len--;
+     }
+     pat = null;
+     return this;
+ }
+
+ /**
+ * Complement the specified string in this set.
+ * The set will not contain the specified string once the call
+ * returns.
+ *
Warning: you cannot add an empty string ("") to a UnicodeSet.
+ * @param s the string to complement
+ * @return this object, for chaining
+ * @stable ICU 2.0
+ */
+ public final UnicodeSet complement(String s) {
+ checkFrozen();
+ int cp = getSingleCP(s);
+ if (cp < 0) {
+ if (strings.contains(s)) strings.remove(s);
+ else strings.add(s);
+ pat = null;
+ } else {
+ complement(cp, cp);
+ }
+ return this;
+ }
+
+ /**
+  * Returns true if this set contains the given character.
+  * @param c character to be checked for containment
+  * @return true if the test condition is met
+  * @stable ICU 2.0
+  */
+ public boolean contains(int c) {
+     if (c < MIN_VALUE || c > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
+     }
+     // findCodePoint() gives the index of the first list entry above c.
+     // An odd index means that entry is a range limit, i.e. c is inside
+     // a range.
+     final int boundary = findCodePoint(c);
+     return (boundary & 1) == 1;
+ }
+
+ /**
+ * Returns the smallest value i such that c < list[i]. Caller
+ * must ensure that c is a legal value or this method will enter
+ * an infinite loop. This method performs a binary search.
+ * An even result means c lies before a range start (not in the set);
+ * an odd result means c lies before a range limit (in the set).
+ * @param c a character in the range MIN_VALUE..MAX_VALUE
+ * inclusive
+ * @return the smallest integer i in the range 0..len-1,
+ * inclusive, such that c < list[i]
+ */
+ private final int findCodePoint(int c) {
+ /* Examples:
+ findCodePoint(c)
+ set list[] c=0 1 3 4 7 8
+ === ============== ===========
+ [] [110000] 0 0 0 0 0 0
+ [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
+ [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
+ [:all:] [0, 110000] 1 1 1 1 1 1
+ */
+
+ // Return the smallest i such that c < list[i]. Assume
+ // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
+ if (c < list[0]) return 0;
+ // High runner test. c is often after the last range, so an
+ // initial check for this condition pays off.
+ if (len >= 2 && c >= list[len-2]) return len-1;
+ int lo = 0;
+ int hi = len - 1;
+ // invariant: c >= list[lo]
+ // invariant: c < list[hi]
+ for (;;) {
+ // Unsigned shift keeps the midpoint correct even if lo + hi overflowed.
+ int i = (lo + hi) >>> 1;
+ // lo and hi are adjacent, so hi is the first entry above c.
+ if (i == lo) return hi;
+ if (c < list[i]) {
+ hi = i;
+ } else {
+ lo = i;
+ }
+ }
+ }
+
+// //----------------------------------------------------------------
+// // Unrolled binary search
+// //----------------------------------------------------------------
+//
+// private int validLen = -1; // validated value of len
+// private int topOfLow;
+// private int topOfHigh;
+// private int power;
+// private int deltaStart;
+//
+// private void validate() {
+// if (len <= 1) {
+// throw new IllegalArgumentException("list.len==" + len + "; must be >1");
+// }
+//
+// // find greatest power of 2 less than or equal to len
+// for (power = exp2.length-1; power > 0 && exp2[power] > len; power--) {}
+//
+// // assert(exp2[power] <= len);
+//
+// // determine the starting points
+// topOfLow = exp2[power] - 1;
+// topOfHigh = len - 1;
+// deltaStart = exp2[power-1];
+// validLen = len;
+// }
+//
+// private static final int exp2[] = {
+// 0x1, 0x2, 0x4, 0x8,
+// 0x10, 0x20, 0x40, 0x80,
+// 0x100, 0x200, 0x400, 0x800,
+// 0x1000, 0x2000, 0x4000, 0x8000,
+// 0x10000, 0x20000, 0x40000, 0x80000,
+// 0x100000, 0x200000, 0x400000, 0x800000,
+// 0x1000000, 0x2000000, 0x4000000, 0x8000000,
+// 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java
+// };
+//
+// /**
+// * Unrolled lowest index GT.
+// */
+// private final int leastIndexGT(int searchValue) {
+//
+// if (len != validLen) {
+// if (len == 1) return 0;
+// validate();
+// }
+// int temp;
+//
+// // set up initial range to search. Each subrange is a power of two in length
+// int high = searchValue < list[topOfLow] ? topOfLow : topOfHigh;
+//
+// // Completely unrolled binary search, following "Programming Pearls"
+// // Each case deliberately falls through to the next
+// // Logically, list[-1] < all_search_values && list[count] > all_search_values
+// // although the values -1 and count are never actually touched.
+//
+// // The bounds at each point are low & high,
+// // where low == high - delta*2
+// // so high - delta is the midpoint
+//
+// // The invariant AFTER each line is that list[low] < searchValue <= list[high]
+//
+// switch (power) {
+// //case 31: if (searchValue < list[temp = high-0x40000000]) high = temp; // no unsigned int in Java
+// case 30: if (searchValue < list[temp = high-0x20000000]) high = temp;
+// case 29: if (searchValue < list[temp = high-0x10000000]) high = temp;
+//
+// case 28: if (searchValue < list[temp = high- 0x8000000]) high = temp;
+// case 27: if (searchValue < list[temp = high- 0x4000000]) high = temp;
+// case 26: if (searchValue < list[temp = high- 0x2000000]) high = temp;
+// case 25: if (searchValue < list[temp = high- 0x1000000]) high = temp;
+//
+// case 24: if (searchValue < list[temp = high- 0x800000]) high = temp;
+// case 23: if (searchValue < list[temp = high- 0x400000]) high = temp;
+// case 22: if (searchValue < list[temp = high- 0x200000]) high = temp;
+// case 21: if (searchValue < list[temp = high- 0x100000]) high = temp;
+//
+// case 20: if (searchValue < list[temp = high- 0x80000]) high = temp;
+// case 19: if (searchValue < list[temp = high- 0x40000]) high = temp;
+// case 18: if (searchValue < list[temp = high- 0x20000]) high = temp;
+// case 17: if (searchValue < list[temp = high- 0x10000]) high = temp;
+//
+// case 16: if (searchValue < list[temp = high- 0x8000]) high = temp;
+// case 15: if (searchValue < list[temp = high- 0x4000]) high = temp;
+// case 14: if (searchValue < list[temp = high- 0x2000]) high = temp;
+// case 13: if (searchValue < list[temp = high- 0x1000]) high = temp;
+//
+// case 12: if (searchValue < list[temp = high- 0x800]) high = temp;
+// case 11: if (searchValue < list[temp = high- 0x400]) high = temp;
+// case 10: if (searchValue < list[temp = high- 0x200]) high = temp;
+// case 9: if (searchValue < list[temp = high- 0x100]) high = temp;
+//
+// case 8: if (searchValue < list[temp = high- 0x80]) high = temp;
+// case 7: if (searchValue < list[temp = high- 0x40]) high = temp;
+// case 6: if (searchValue < list[temp = high- 0x20]) high = temp;
+// case 5: if (searchValue < list[temp = high- 0x10]) high = temp;
+//
+// case 4: if (searchValue < list[temp = high- 0x8]) high = temp;
+// case 3: if (searchValue < list[temp = high- 0x4]) high = temp;
+// case 2: if (searchValue < list[temp = high- 0x2]) high = temp;
+// case 1: if (searchValue < list[temp = high- 0x1]) high = temp;
+// }
+//
+// return high;
+// }
+//
+// // For debugging only
+// public int len() {
+// return len;
+// }
+//
+// //----------------------------------------------------------------
+// //----------------------------------------------------------------
+
+ /**
+  * Tests whether every character of the given range is a member of
+  * this set.
+  * @param start first character, inclusive, of the range
+  * @param end last character, inclusive, of the range
+  * @return true if the test condition is met
+  * @exception IllegalArgumentException if start or end is outside the
+  * range MIN_VALUE..MAX_VALUE
+  * @stable ICU 2.0
+  */
+ public boolean contains(int start, int end) {
+     if (start < MIN_VALUE || start > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+     }
+     if (end < MIN_VALUE || end > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+     }
+     final int index = findCodePoint(start);
+     if ((index & 1) == 0) {
+         // Even index: start itself is not inside any range.
+         return false;
+     }
+     // start is inside a range; the whole span is contained only if
+     // end stays below that range's limit.
+     return end < list[index];
+ }
+
+ /**
+  * Tests whether this set contains the given string. A string for which
+  * getSingleCP returns a non-negative value is looked up as that code
+  * point; any other string is looked up among the multicharacter strings.
+  * @param s string to be checked for containment
+  * @return true if this set contains the specified string
+  * @stable ICU 2.0
+  */
+ public final boolean contains(String s) {
+     final int single = getSingleCP(s);
+     return (single < 0) ? strings.contains(s) : contains(single);
+ }
+
+ /**
+  * Tests whether every character range and every string of the given
+  * set is also present in this set.
+  * @param b set to be checked for containment
+  * @return true if the test condition is met
+  * @stable ICU 2.0
+  */
+ public boolean containsAll(UnicodeSet b) {
+     // b is a subset when each of its ranges lies inside one range of
+     // this set. Both inversion lists are walked directly for speed.
+     // TODO: this could be faster if size() were cached. But that would affect building speed
+     // so it needs investigation.
+     final int[] otherList = b.list;
+     final int thisEnd = len - 1;    // stop before the trailing entry
+     final int otherEnd = b.len - 1;
+     int thisIdx = 0;
+     int otherIdx = 0;
+     int thisStart = 0, thisLimit = 0;
+     int otherStart = 0, otherLimit = 0;
+     boolean fetchThis = true;
+     boolean fetchOther = true;
+     for (;;) {
+         if (fetchThis) {
+             if (thisIdx >= thisEnd) {
+                 // This set is exhausted; that is only acceptable when
+                 // b has no range left to place either.
+                 if (fetchOther && otherIdx >= otherEnd) {
+                     break;
+                 }
+                 return false;
+             }
+             thisStart = list[thisIdx++];
+             thisLimit = list[thisIdx++];
+         }
+         if (fetchOther) {
+             if (otherIdx >= otherEnd) {
+                 // Every range of b has been placed.
+                 break;
+             }
+             otherStart = otherList[otherIdx++];
+             otherLimit = otherList[otherIdx++];
+         }
+         if (otherStart >= thisLimit) {
+             // b's range starts past the current range: advance this set.
+             fetchThis = true;
+             fetchOther = false;
+         } else if (otherStart >= thisStart && otherLimit <= thisLimit) {
+             // b's range is wholly contained: advance b.
+             fetchThis = false;
+             fetchOther = true;
+         } else {
+             // Partial overlap or a gap: not a subset.
+             return false;
+         }
+     }
+
+     return strings.containsAll(b.strings);
+ }
+
+// /**
+// * Returns true if this set contains all the characters and strings
+// * of the given set.
+// * @param c set to be checked for containment
+// * @return true if the test condition is met
+// * @stable ICU 2.0
+// */
+// public boolean containsAllOld(UnicodeSet c) {
+// // The specified set is a subset if all of its pairs are contained in
+// // this set. It's possible to code this more efficiently in terms of
+// // direct manipulation of the inversion lists if the need arises.
+// int n = c.getRangeCount();
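+// for (int i=0; i<n; ++i) {
+// if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
+// return false;
+// }
+// }
+// if (!strings.containsAll(c.strings)) return false;
+// return true;
+// }
+
+ /**
+ * Returns true if there is a partition of the string such that this set contains each of the partitioned strings.
+ * For example, for the Unicode set [a{bc}{cd}]
+ * containsAll is true for each of: "a", "bc", "cdbca"
+ * containsAll is false for each of: "acb", "bcda", "bcx"
+ * @param s string containing characters to be checked for containment
+ * @return true if the test condition is met
+ * @stable ICU 2.0
+ */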
+ public boolean containsAll(String s) {
+ int cp;
+ for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
+ cp = UTF16.charAt(s, i);
+ if (!contains(cp)) {
+ if (strings.size() == 0) {
+ return false;
+ }
+ return containsAll(s, 0);
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Recursive routine called if we fail to find a match in containsAll, and there are strings
+ * @param s source string
+ * @param i point to match to the end on
+ * @return true if ok
+ */
+ private boolean containsAll(String s, int i) {
+ if (i >= s.length()) {
+ return true;
+ }
+ int cp= UTF16.charAt(s, i);
+ if (contains(cp) && containsAll(s, i+UTF16.getCharCount(cp))) {
+ return true;
+ }
+
+ Iterator it = strings.iterator();
+ while (it.hasNext()) {
+ String setStr = (String)it.next();
+ if (s.startsWith(setStr, i) && containsAll(s, i+setStr.length())) {
+ return true;
+ }
+ }
+ return false;
+
+ }
+
+ /**
+  * Get the Regex equivalent for this UnicodeSet. A set without strings
+  * yields toString(); otherwise the result is a non-capturing group
+  * "(?:...)" holding the generated pattern followed by one
+  * '|'-separated alternative per string.
+  * @return regex pattern equivalent to this UnicodeSet
+  * @internal
+  * @deprecated This API is ICU internal only.
+  */
+ public String getRegexEquivalent() {
+     if (strings.size() == 0) {
+         return toString();
+     }
+     final StringBuffer regex = new StringBuffer("(?:");
+     _generatePattern(regex, true, false);
+     for (Iterator it = strings.iterator(); it.hasNext();) {
+         regex.append('|');
+         _appendToPat(regex, (String) it.next(), true);
+     }
+     regex.append(")");
+     return regex.toString();
+ }
+
+ /**
+  * Returns true if this set contains none of the characters
+  * of the given range.
+  * @param start first character, inclusive, of the range
+  * @param end last character, inclusive, of the range
+  * @return true if the test condition is met
+  * @exception IllegalArgumentException if start or end is outside the
+  * range MIN_VALUE..MAX_VALUE
+  * @stable ICU 2.0
+  */
+ public boolean containsNone(int start, int end) {
+     if (start < MIN_VALUE || start > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
+     }
+     if (end < MIN_VALUE || end > MAX_VALUE) {
+         throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
+     }
+     // Smallest i with start < list[i], located by binary search exactly
+     // as contains(int, int) does, instead of scanning from index 0.
+     final int i = findCodePoint(start);
+     // An even index means start sits in a gap between ranges; the span
+     // is disjoint from the set only if end also stays below the start
+     // of the next range.
+     return ((i & 1) == 0 && end < list[i]);
+ }
+
+ /**
+  * Tests whether this set and the given set are disjoint: no character
+  * range of one overlaps a range of the other, and the string members
+  * satisfy SortedSetRelation.DISJOINT.
+  * @param b set to be checked for containment
+  * @return true if the test condition is met
+  * @stable ICU 2.0
+  */
+ public boolean containsNone(UnicodeSet b) {
+     // The sets fail the test as soon as any pair of ranges overlaps.
+     // Both inversion lists are walked directly for speed.
+     final int[] otherList = b.list;
+     final int thisEnd = len - 1;    // stop before the trailing entry
+     final int otherEnd = b.len - 1;
+     int thisIdx = 0;
+     int otherIdx = 0;
+     int thisStart = 0, thisLimit = 0;
+     int otherStart = 0, otherLimit = 0;
+     boolean fetchThis = true;
+     boolean fetchOther = true;
+     for (;;) {
+         if (fetchThis) {
+             if (thisIdx >= thisEnd) {
+                 // No more ranges here: go on to compare the strings.
+                 break;
+             }
+             thisStart = list[thisIdx++];
+             thisLimit = list[thisIdx++];
+         }
+         if (fetchOther) {
+             if (otherIdx >= otherEnd) {
+                 // No more ranges in b: go on to compare the strings.
+                 break;
+             }
+             otherStart = otherList[otherIdx++];
+             otherLimit = otherList[otherIdx++];
+         }
+         if (otherStart >= thisLimit) {
+             // b's range lies entirely above the current range: advance this set.
+             fetchThis = true;
+             fetchOther = false;
+         } else if (thisStart >= otherLimit) {
+             // The current range lies entirely above b's range: advance b.
+             fetchThis = false;
+             fetchOther = true;
+         } else {
+             // The two ranges overlap.
+             return false;
+         }
+     }
+
+     return SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, b.strings);
+ }
+
+// /**
+// * Returns true if none of the characters or strings in this UnicodeSet appears in the string.
+// * For example, for the Unicode set [a{bc}{cd}]
+// * containsNone is true for: "xy", "cb"
+// * containsNone is false for: "a", "bc", "bcd"
+// * @param c set to be checked for containment
+// * @return true if the test condition is met
+// * @stable ICU 2.0
+// */
+// public boolean containsNoneOld(UnicodeSet c) {
+// // The specified set is a subset if all of its pairs are contained in
+// // this set. It's possible to code this more efficiently in terms of
+// // direct manipulation of the inversion lists if the need arises.
+// int n = c.getRangeCount();
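+// for (int i=0; i<n; ++i) {
+// if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
+// return false;
+// }
+// }
+// if (!SortedSetRelation.hasRelation(strings, SortedSetRelation.DISJOINT, c.strings)) return false;
+// return true;
+// }
+
+ // NOTE(review): the definitions that originally stood between the commented-out
+ // method above and getRangeStart() appear to be missing from this listing.
+
+ /**
+ * Iteration method that returns the first character in the
+ * specified range of this set.
+ * @exception ArrayIndexOutOfBoundsException if index is outside
+ * the range 0..getRangeCount()-1
+ * @see #getRangeCount
+ * @see #getRangeEnd
+ * @stable ICU 2.0
+ */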
+ public int getRangeStart(int index) {
+     // Range boundaries are stored pairwise; the even slot of pair
+     // number index holds the first character of the range.
+     final int slot = index * 2;
+     return list[slot];
+ }
+
+ /**
+  * Iteration method that returns the last character in the
+  * specified range of this set.
+  * @param index zero-based index of the range
+  * @return the last character, inclusive, of that range
+  * @exception ArrayIndexOutOfBoundsException if index is outside
+  * the range 0..getRangeCount()-1
+  * @see #getRangeCount
+  * @see #getRangeStart
+  * @stable ICU 2.0
+  */
+ public int getRangeEnd(int index) {
+     // The odd slot of each pair holds the exclusive limit of the range,
+     // so the last member is one below it.
+     final int limit = list[index * 2 + 1];
+     return limit - 1;
+ }
+
+ /**
+  * Reallocates this object's internal structures to take up the least
+  * possible space, without changing this object's value. The list array
+  * is trimmed to exactly len entries and the rangeList and buffer
+  * scratch arrays are released.
+  * @return this object, for chaining
+  * @stable ICU 2.0
+  */
+ public UnicodeSet compact() {
+     checkFrozen();
+     if (list.length != len) {
+         // Copy the entries in use into an exactly-sized array.
+         final int[] trimmed = new int[len];
+         System.arraycopy(list, 0, trimmed, 0, len);
+         list = trimmed;
+     }
+     rangeList = null;
+     buffer = null;
+     return this;
+ }
+
+ /**
+  * Compares the specified object with this set for equality. Returns
+  * true if the specified object is also a UnicodeSet, both
+  * inversion lists have the same length and the same entries, and the
+  * two string collections are equal.
+  *
+  * @param o Object to be compared for equality with this set.
+  * @return true if the specified Object is equal to this set.
+  * @stable ICU 2.0
+  */
+ public boolean equals(Object o) {
+     if (o == this) {
+         return true;
+     }
+     // Explicit type test instead of catching the NullPointerException or
+     // ClassCastException a blind cast would raise; instanceof is false
+     // for null as well.
+     if (!(o instanceof UnicodeSet)) {
+         return false;
+     }
+     final UnicodeSet that = (UnicodeSet) o;
+     if (len != that.len) {
+         return false;
+     }
+     for (int i = 0; i < len; ++i) {
+         if (list[i] != that.list[i]) {
+             return false;
+         }
+     }
+     return strings.equals(that.strings);
+ }
+
+ /**
+  * Returns the hash code value for this set. The value is computed from
+  * len and the first len entries of the inversion list; the
+  * string members do not contribute.
+  *
+  * @return the hash code value for this set.
+  * @see java.lang.Object#hashCode()
+  * @stable ICU 2.0
+  */
+ public int hashCode() {
+     int hash = len;
+     int i = 0;
+     while (i < len) {
+         // Multiplicative mix; int overflow wraps, as intended for a hash.
+         hash = hash * 1000003 + list[i++];
+     }
+     return hash;
+ }
+
+ /**
+  * Return a programmer-readable string representation of this object,
+  * as produced by toPattern(true).
+  * @return the string representation of this set
+  * @stable ICU 2.0
+  */
+ public String toString() {
+     final String pattern = toPattern(true);
+     return pattern;
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Pattern parsing
+ //----------------------------------------------------------------
+
+ /**
+  * Parses the given pattern, starting at the given position, and replaces
+  * the contents of this set with the result. The character
+  * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
+  * Parsing continues until the corresponding closing ']'. If a syntax error
+  * is encountered between the opening and closing brace, the parse fails.
+  * Upon return from a successful parse, the ParsePosition is updated to
+  * point to the character following the closing ']'. The parsing itself
+  * is delegated to the RuleCharacterIterator overload of applyPattern.
+  *
+  * @param pattern the string containing the pattern to be parsed. The
+  * portion of the string from pos.getIndex(), which must be a '[', to the
+  * corresponding closing ']', is parsed.
+  * @param pos upon entry, the position at which to begin parsing, or null
+  * to parse from offset 0. When null is passed, the whole pattern must be
+  * consumed (apart from trailing whitespace if IGNORE_SPACE is set),
+  * otherwise the parse fails. When non-null, upon return
+  * from a successful parse, pos.getIndex() is either the character after the
+  * closing ']' of the parsed pattern, or pattern.length() if the closing ']'
+  * is the last character of the pattern string.
+  * @param symbols symbol table handed to the character iterator and to
+  * the parser; may be null
+  * @param options bit mask of parse options; IGNORE_SPACE is examined
+  * here and the whole mask is passed on to the parser
+  * @return this object, for chaining
+  * @exception java.lang.IllegalArgumentException if the parse fails.
+  * @internal
+  * @deprecated - for internal use only
+  */
+ public UnicodeSet applyPattern(String pattern,
+                                ParsePosition pos,
+                                SymbolTable symbols,
+                                int options) {
+
+     // Without a caller-supplied position the entire pattern has to be
+     // consumed by the parse.
+     final boolean mustConsumeAll = (pos == null);
+     if (mustConsumeAll) {
+         pos = new ParsePosition(0);
+     }
+
+     // The pattern text is collected in a scratch buffer because the
+     // parser calls add() etc., which reset pat.
+     final StringBuffer rebuilt = new StringBuffer();
+     final RuleCharacterIterator iter =
+         new RuleCharacterIterator(pattern, symbols, pos);
+     applyPattern(iter, symbols, rebuilt, options);
+     if (iter.inVariable()) {
+         syntaxError(iter, "Extra chars in variable value");
+     }
+     pat = rebuilt.toString();
+
+     if (!mustConsumeAll) {
+         return this;
+     }
+
+     int stop = pos.getIndex();
+     // Skip over trailing whitespace
+     if ((options & IGNORE_SPACE) != 0) {
+         stop = Utility.skipWhitespace(pattern, stop);
+     }
+     if (stop != pattern.length()) {
+         throw new IllegalArgumentException("Parse of \"" + pattern +
+                                            "\" failed at " + stop);
+     }
+     return this;
+ }
+
+ /**
+ * Parse the pattern from the given RuleCharacterIterator. The
+ * iterator is advanced over the parsed pattern.
+ * This set is cleared first and then rebuilt from the items found in
+ * the pattern: single characters, character ranges, multicharacter
+ * strings in braces, nested sets, property patterns and stand-ins
+ * looked up in the symbol table, combined with the '-' and '&'
+ * operators. After the closing ']' the set is closed over case when
+ * CASE is set in options and complemented when the pattern opened
+ * with '[' followed by '^'.
+ * @param chars iterator over the pattern characters. Upon return
+ * it will be advanced to the first character after the parsed
+ * pattern, or the end of the iteration if all characters are
+ * parsed.
+ * @param symbols symbol table to use to parse and dereference
+ * variables, or null if none.
+ * @param rebuiltPat the pattern that was parsed, rebuilt or
+ * copied from the input pattern, as appropriate.
+ * @param options a bit mask of zero or more of the following:
+ * IGNORE_SPACE, CASE.
+ * @exception IllegalArgumentException if the pattern is malformed
+ * (raised through syntaxError).
+ */
+ void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
+ StringBuffer rebuiltPat, int options) {
+
+ // Syntax characters: [ ] ^ - & { }
+
+ // Recognized special forms for chars, sets: c-c s-s s&s
+
+ int opts = RuleCharacterIterator.PARSE_VARIABLES |
+ RuleCharacterIterator.PARSE_ESCAPES;
+ if ((options & IGNORE_SPACE) != 0) {
+ opts |= RuleCharacterIterator.SKIP_WHITESPACE;
+ }
+
+ StringBuffer patBuf = new StringBuffer(), buf = null;
+ boolean usePat = false;
+ UnicodeSet scratch = null;
+ Object backup = null;
+
+ // mode: 0=before [, 1=between [...], 2=after ]
+ // lastItem: 0=none, 1=char, 2=set
+ int lastItem = 0, lastChar = 0, mode = 0;
+ char op = 0;
+
+ boolean invert = false;
+
+ clear(); // discard the current contents; the set is rebuilt from the pattern
+
+ while (mode != 2 && !chars.atEnd()) {
+ if (false) { // disabled debugging check; this branch never executes
+ // Debugging assertion
+ if (!((lastItem == 0 && op == 0) ||
+ (lastItem == 1 && (op == 0 || op == '-')) ||
+ (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
+ throw new IllegalArgumentException();
+ }
+ }
+
+ int c = 0;
+ boolean literal = false;
+ UnicodeSet nested = null;
+
+ // -------- Check for property pattern
+
+ // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
+ int setMode = 0;
+ if (resemblesPropertyPattern(chars, opts)) {
+ setMode = 2;
+ }
+
+ // -------- Parse '[' of opening delimiter OR nested set.
+ // If there is a nested set, use `setMode' to define how
+ // the set should be parsed. If the '[' is part of the
+ // opening delimiter for this pattern, parse special
+ // strings "[", "[^", "[-", and "[^-". Check for stand-in
+ // characters representing a nested set in the symbol
+ // table.
+
+ else {
+ // Prepare to backup if necessary
+ backup = chars.getPos(backup);
+ c = chars.next(opts);
+ literal = chars.isEscaped();
+
+ if (c == '[' && !literal) {
+ if (mode == 1) {
+ chars.setPos(backup); // backup
+ setMode = 1;
+ } else {
+ // Handle opening '[' delimiter
+ mode = 1;
+ patBuf.append('[');
+ backup = chars.getPos(backup); // prepare to backup
+ c = chars.next(opts);
+ literal = chars.isEscaped();
+ if (c == '^' && !literal) {
+ invert = true;
+ patBuf.append('^');
+ backup = chars.getPos(backup); // prepare to backup
+ c = chars.next(opts);
+ literal = chars.isEscaped();
+ }
+ // Fall through to handle special leading '-';
+ // otherwise restart loop for nested [], \p{}, etc.
+ if (c == '-') {
+ literal = true;
+ // Fall through to handle literal '-' below
+ } else {
+ chars.setPos(backup); // backup
+ continue;
+ }
+ }
+ } else if (symbols != null) {
+ UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
+ if (m != null) {
+ try {
+ nested = (UnicodeSet) m;
+ setMode = 3;
+ } catch (ClassCastException e) {
+ syntaxError(chars, "Syntax error");
+ }
+ }
+ }
+ }
+
+ // -------- Handle a nested set. This either is inline in
+ // the pattern or represented by a stand-in that has
+ // previously been parsed and was looked up in the symbol
+ // table.
+
+ if (setMode != 0) {
+ if (lastItem == 1) {
+ if (op != 0) {
+ syntaxError(chars, "Char expected after operator");
+ }
+ add_unchecked(lastChar, lastChar);
+ _appendToPat(patBuf, lastChar, false);
+ lastItem = op = 0;
+ }
+
+ if (op == '-' || op == '&') {
+ patBuf.append(op);
+ }
+
+ if (nested == null) {
+ if (scratch == null) scratch = new UnicodeSet();
+ nested = scratch;
+ }
+ switch (setMode) {
+ case 1:
+ nested.applyPattern(chars, symbols, patBuf, options);
+ break;
+ case 2:
+ chars.skipIgnored(opts);
+ nested.applyPropertyPattern(chars, patBuf, symbols);
+ break;
+ case 3: // `nested' already parsed
+ nested._toPattern(patBuf, false);
+ break;
+ }
+
+ usePat = true;
+
+ if (mode == 0) {
+ // Entire pattern is a category; leave parse loop
+ set(nested);
+ mode = 2;
+ break;
+ }
+
+ switch (op) {
+ case '-':
+ removeAll(nested);
+ break;
+ case '&':
+ retainAll(nested);
+ break;
+ case 0:
+ addAll(nested);
+ break;
+ }
+
+ op = 0;
+ lastItem = 2;
+
+ continue;
+ }
+
+ if (mode == 0) {
+ syntaxError(chars, "Missing '['");
+ }
+
+ // -------- Parse special (syntax) characters. If the
+ // current character is not special, or if it is escaped,
+ // then fall through and handle it below.
+
+ if (!literal) {
+ switch (c) {
+ case ']':
+ if (lastItem == 1) {
+ add_unchecked(lastChar, lastChar);
+ _appendToPat(patBuf, lastChar, false);
+ }
+ // Treat final trailing '-' as a literal
+ if (op == '-') {
+ add_unchecked(op, op);
+ patBuf.append(op);
+ } else if (op == '&') {
+ syntaxError(chars, "Trailing '&'");
+ }
+ patBuf.append(']');
+ mode = 2;
+ continue;
+ case '-':
+ if (op == 0) {
+ if (lastItem != 0) {
+ op = (char) c;
+ continue;
+ } else {
+ // Treat final trailing '-' as a literal
+ add_unchecked(c, c);
+ c = chars.next(opts);
+ literal = chars.isEscaped();
+ if (c == ']' && !literal) {
+ patBuf.append("-]");
+ mode = 2;
+ continue;
+ }
+ }
+ }
+ syntaxError(chars, "'-' not after char or set"); // always throws, so control never reaches the next case
+ case '&':
+ if (lastItem == 2 && op == 0) {
+ op = (char) c;
+ continue;
+ }
+ syntaxError(chars, "'&' not after set"); // always throws
+ case '^':
+ syntaxError(chars, "'^' not after '['"); // always throws
+ case '{':
+ if (op != 0) {
+ syntaxError(chars, "Missing operand after operator");
+ }
+ if (lastItem == 1) {
+ add_unchecked(lastChar, lastChar);
+ _appendToPat(patBuf, lastChar, false);
+ }
+ lastItem = 0;
+ if (buf == null) {
+ buf = new StringBuffer();
+ } else {
+ buf.setLength(0);
+ }
+ boolean ok = false;
+ while (!chars.atEnd()) {
+ c = chars.next(opts);
+ literal = chars.isEscaped();
+ if (c == '}' && !literal) {
+ ok = true;
+ break;
+ }
+ UTF16.append(buf, c);
+ }
+ if (buf.length() < 1 || !ok) {
+ syntaxError(chars, "Invalid multicharacter string");
+ }
+ // We have new string. Add it to set and continue;
+ // we don't need to drop through to the further
+ // processing
+ add(buf.toString());
+ patBuf.append('{');
+ _appendToPat(patBuf, buf.toString(), false);
+ patBuf.append('}');
+ continue;
+ case SymbolTable.SYMBOL_REF:
+ // symbols nosymbols
+ // [a-$] error error (ambiguous)
+ // [a$] anchor anchor
+ // [a-$x] var "x"* literal '$'
+ // [a-$.] error literal '$'
+ // *We won't get here in the case of var "x"
+ backup = chars.getPos(backup);
+ c = chars.next(opts);
+ literal = chars.isEscaped();
+ boolean anchor = (c == ']' && !literal);
+ if (symbols == null && !anchor) {
+ c = SymbolTable.SYMBOL_REF;
+ chars.setPos(backup);
+ break; // literal '$'
+ }
+ if (anchor && op == 0) {
+ if (lastItem == 1) {
+ add_unchecked(lastChar, lastChar);
+ _appendToPat(patBuf, lastChar, false);
+ }
+ add_unchecked(UnicodeMatcher.ETHER);
+ usePat = true;
+ patBuf.append(SymbolTable.SYMBOL_REF).append(']');
+ mode = 2;
+ continue;
+ }
+ syntaxError(chars, "Unquoted '$'"); // always throws
+ default:
+ break;
+ }
+ }
+
+ // -------- Parse literal characters. This includes both
+ // escaped chars ("\u4E01") and non-syntax characters
+ // ("a").
+
+ switch (lastItem) {
+ case 0:
+ lastItem = 1;
+ lastChar = c;
+ break;
+ case 1:
+ if (op == '-') {
+ if (lastChar >= c) {
+ // Don't allow redundant (a-a) or empty (b-a) ranges;
+ // these are most likely typos.
+ syntaxError(chars, "Invalid range");
+ }
+ add_unchecked(lastChar, c);
+ _appendToPat(patBuf, lastChar, false);
+ patBuf.append(op);
+ _appendToPat(patBuf, c, false);
+ lastItem = op = 0;
+ } else {
+ add_unchecked(lastChar, lastChar);
+ _appendToPat(patBuf, lastChar, false);
+ lastChar = c;
+ }
+ break;
+ case 2:
+ if (op != 0) {
+ syntaxError(chars, "Set expected after operator");
+ }
+ lastChar = c;
+ lastItem = 1;
+ break;
+ }
+ }
+
+ if (mode != 2) { // the loop ran out of input before the closing ']' was seen
+ syntaxError(chars, "Missing ']'");
+ }
+
+ chars.skipIgnored(opts);
+
+ /**
+ * Handle global flags (invert, case insensitivity). If this
+ * pattern should be compiled case-insensitive, then we need
+ * to close over case BEFORE COMPLEMENTING. This makes
+ * patterns like /[^abc]/i work.
+ */
+ if ((options & CASE) != 0) {
+ closeOver(CASE);
+ }
+ if (invert) {
+ complement();
+ }
+
+ // Use the rebuilt pattern (pat) only if necessary. Prefer the
+ // generated pattern.
+ if (usePat) {
+ rebuiltPat.append(patBuf.toString());
+ } else {
+ _generatePattern(rebuiltPat, false, true);
+ }
+ }
+
+ private static void syntaxError(RuleCharacterIterator chars, String msg) {
+ throw new IllegalArgumentException("Error: " + msg + " at \"" +
+ Utility.escape(chars.toString()) +
+ '"');
+ }
+
+ /**
+  * Adds the contents of this UnicodeSet (as strings) to a collection.
+  * Each element visited by a UnicodeSetIterator over this set is added
+  * through its getString() value.
+  * @param target collection to add into
+  * @stable ICU 2.8
+  */
+ public void addAllTo(Collection target) {
+     for (UnicodeSetIterator it = new UnicodeSetIterator(this); it.next();) {
+         target.add(it.getString());
+     }
+ }
+
+ /**
+  * Adds the contents of the collection (as strings) to this UnicodeSet.
+  * Every element is converted with toString() and handed to add().
+  * @param source the collection to add
+  * @stable ICU 2.8
+  */
+ public void addAll(Collection source) {
+     checkFrozen();
+     for (Iterator it = source.iterator(); it.hasNext();) {
+         add(it.next().toString());
+     }
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Utility methods
+ //----------------------------------------------------------------
+
+ private void ensureCapacity(int newLen) {
+ if (newLen <= list.length) return;
+ int[] temp = new int[newLen + GROW_EXTRA];
+ System.arraycopy(list, 0, temp, 0, len);
+ list = temp;
+ }
+
+ /**
+  * Makes sure buffer[] exists and can hold at least newLen entries. A
+  * replacement array has newLen + GROW_EXTRA entries; the old contents
+  * are not copied.
+  */
+ private void ensureBufferCapacity(int newLen) {
+     if (buffer == null || buffer.length < newLen) {
+         buffer = new int[newLen + GROW_EXTRA];
+     }
+ }
+
+ /**
+  * Returns the shared rangeList array filled in as the inversion list
+  * { start, end+1, HIGH }. The array is created on first use and
+  * overwritten by every later call.
+  * Assumes start <= end.
+  */
+ private int[] range(int start, int end) {
+     if (rangeList != null) {
+         // Reuse the existing array; slot 2 already holds HIGH.
+         rangeList[0] = start;
+         rangeList[1] = end + 1;
+         return rangeList;
+     }
+     rangeList = new int[] { start, end + 1, HIGH };
+     return rangeList;
+ }
+
+ //----------------------------------------------------------------
+ // Implementation: Fundamental operations
+ //----------------------------------------------------------------
+
+ // Merges this set's list[] with other[] into buffer[], dropping boundaries present in both. polarity = 0, 3 is normal: x xor y
+ // polarity = 1, 2: x xor ~y == x === y (other[] is read as if a LOW entry were toggled at its front)
+
+ private UnicodeSet xor(int[] other, int otherLen, int polarity) {
+ ensureBufferCapacity(len + otherLen); // the merged list is assembled in buffer[]
+ int i = 0, j = 0, k = 0;
+ int a = list[i++];
+ int b;
+ if (polarity == 1 || polarity == 2) {
+ b = LOW;
+ if (other[j] == LOW) { // skip base if already LOW
+ ++j;
+ b = other[j];
+ }
+ } else {
+ b = other[j++];
+ }
+ // simplest of all the routines
+ // sort the values, discarding identicals!
+ while (true) {
+ if (a < b) {
+ buffer[k++] = a;
+ a = list[i++];
+ } else if (b < a) {
+ buffer[k++] = b;
+ b = other[j++];
+ } else if (a != HIGH) { // at this point, a == b
+ // discard both values!
+ a = list[i++];
+ b = other[j++];
+ } else { // DONE!
+ buffer[k++] = HIGH;
+ len = k; // k entries were written, including the HIGH terminator
+ break;
+ }
+ }
+ // swap list and buffer
+ int[] temp = list;
+ list = buffer;
+ buffer = temp;
+ pat = null; // contents changed, so the stored pattern string is dropped
+ return this;
+ }
+
+ // Merges this set's list[] with other[] into buffer[] as a union. polarity = 0 is normal: x union y
+ // polarity = 2: x union ~y
+ // polarity = 1: ~x union y
+ // polarity = 3: ~x union ~y
+
+ private UnicodeSet add(int[] other, int otherLen, int polarity) {
+ ensureBufferCapacity(len + otherLen); // the merged list is assembled in buffer[]
+ int i = 0, j = 0, k = 0;
+ int a = list[i++];
+ int b = other[j++];
+ // change from xor is that we have to check overlapping pairs
+ // polarity bit 1 means a is second, bit 2 means b is.
+ main:
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; take lower if unequal
+ if (a < b) { // take a
+ // Back up over overlapping ranges in buffer[]
+ if (k > 0 && a <= buffer[k-1]) {
+ // Pick latter end value in buffer[] vs. list[]
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++; // Common if/else code factored out
+ polarity ^= 1;
+ } else if (b < a) { // take b
+ if (k > 0 && b <= buffer[k-1]) {
+ b = max(other[j], buffer[--k]);
+ } else {
+ buffer[k++] = b;
+ b = other[j];
+ }
+ j++;
+ polarity ^= 2;
+ } else { // a == b, take a, drop b
+ if (a == HIGH) break main;
+ // This is symmetrical; it doesn't matter if
+ // we backtrack with a or b. - liu
+ if (k > 0 && a <= buffer[k-1]) {
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++;
+ polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 3: // both second; take higher if unequal, and drop other
+ if (b <= a) { // take a
+ if (a == HIGH) break main;
+ buffer[k++] = a;
+ } else { // take b
+ if (b == HIGH) break main;
+ buffer[k++] = b;
+ }
+ a = list[i++]; polarity ^= 1; // factored common code
+ b = other[j++]; polarity ^= 2;
+ break;
+ case 1: // a second, b first; if b < a, overlap
+ if (a < b) { // no overlap, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ }
+ }
+ buffer[k++] = HIGH; // terminate
+ len = k; // k entries were written, including the HIGH terminator
+ // swap list and buffer
+ int[] temp = list;
+ list = buffer;
+ buffer = temp;
+ pat = null; // contents changed, so the stored pattern string is dropped
+ return this;
+ }
+
+ // Merges this set's list[] with other[] into buffer[] as an intersection. polarity = 0 is normal: x intersect y
+ // polarity = 2: x intersect ~y == set-minus
+ // polarity = 1: ~x intersect y
+ // polarity = 3: ~x intersect ~y
+
+ private UnicodeSet retain(int[] other, int otherLen, int polarity) {
+ ensureBufferCapacity(len + otherLen); // the merged list is assembled in buffer[]
+ int i = 0, j = 0, k = 0;
+ int a = list[i++];
+ int b = other[j++];
+ // change from xor is that we have to check overlapping pairs
+ // polarity bit 1 means a is second, bit 2 means b is.
+ main:
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; drop the smaller
+ if (a < b) { // drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 3: // both second; take lower if unequal
+ if (a < b) { // take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 1: // a second, b first;
+ if (a < b) { // NO OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, drop b
+ b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ }
+ }
+ buffer[k++] = HIGH; // terminate
+ len = k; // k entries were written, including the HIGH terminator
+ // swap list and buffer
+ int[] temp = list;
+ list = buffer;
+ buffer = temp;
+ pat = null; // contents changed, so the stored pattern string is dropped
+ return this;
+ }
+
+ /**
+  * Returns the larger of the two arguments; when they are equal, b is
+  * returned (which is the same value).
+  */
+ private static final int max(int a, int b) {
+     if (a > b) {
+         return a;
+     }
+     return b;
+ }
+
+ //----------------------------------------------------------------
+ // Generic filter-based scanning code
+ //----------------------------------------------------------------
+
+ /**
+  * Membership test over code points, implemented by the property
+  * filters declared after it.
+  */
+ private interface Filter {
+     /** Returns true if the given code point is accepted by this filter. */
+     boolean contains(int codePoint);
+ }
+
+ private static class NumericValueFilter implements Filter {
+ double value;
+ NumericValueFilter(double value) { this.value = value; }
+ public boolean contains(int ch) {
+ return UCharacter.getUnicodeNumericValue(ch) == value;
+ }
+ }
+
+ private static class GeneralCategoryMaskFilter implements Filter {
+ int mask;
+ GeneralCategoryMaskFilter(int mask) { this.mask = mask; }
+ public boolean contains(int ch) {
+ return ((1 << UCharacter.getType(ch)) & mask) != 0;
+ }
+ }
+
+ private static class IntPropertyFilter implements Filter {
+ int prop;
+ int value;
+ IntPropertyFilter(int prop, int value) {
+ this.prop = prop;
+ this.value = value;
+ }
+ public boolean contains(int ch) {
+ return UCharacter.getIntPropertyValue(ch, prop) == value;
+ }
+ }
+
+ // VersionInfo 0.0.0.0, used for unassigned characters; VersionFilter rejects an age that is this same instance.
+ static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+
+ /**
+  * Filter that accepts a code point when its age, as reported by
+  * UCharacter.getAge, is not NO_VERSION and compares less than or equal
+  * to the stored version.
+  */
+ private static class VersionFilter implements Filter {
+     VersionInfo version;
+
+     VersionFilter(VersionInfo version) {
+         this.version = version;
+     }
+
+     public boolean contains(int ch) {
+         VersionInfo age = UCharacter.getAge(ch);
+         // Reference comparison ok; VersionInfo caches and reuses
+         // unique objects.
+         if (age == NO_VERSION) {
+             return false;
+         }
+         return age.compareTo(version) <= 0;
+     }
+ }
+
+ /**
+  * Returns the set stored in INCLUSIONS[src], building and caching it on
+  * the first request for that source. The INCLUSIONS array itself is
+  * allocated on first use.
+  *
+  * @param src one of the UCharacterProperty.SRC_* values handled below
+  * @return the cached set for src
+  * @throws MissingResourceException if loading the case or bidi
+  *         properties fails with an IOException
+  * @throws IllegalStateException if src is not one of the handled sources
+  */
+ private static synchronized UnicodeSet getInclusions(int src) {
+     if (INCLUSIONS == null) {
+         INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
+     }
+
+     // Fast path: already built for this source.
+     UnicodeSet cached = INCLUSIONS[src];
+     if (cached != null) {
+         return cached;
+     }
+
+     UnicodeSet starts = new UnicodeSet();
+     switch (src) {
+     case UCharacterProperty.SRC_CHAR:
+         UCharacterProperty.getInstance().addPropertyStarts(starts);
+         break;
+     case UCharacterProperty.SRC_PROPSVEC:
+         UCharacterProperty.getInstance().upropsvec_addPropertyStarts(starts);
+         break;
+     case UCharacterProperty.SRC_CHAR_AND_PROPSVEC:
+         UCharacterProperty.getInstance().addPropertyStarts(starts);
+         UCharacterProperty.getInstance().upropsvec_addPropertyStarts(starts);
+         break;
+     case UCharacterProperty.SRC_HST:
+         UCharacterProperty.getInstance().uhst_addPropertyStarts(starts);
+         break;
+     case UCharacterProperty.SRC_NORM:
+         NormalizerImpl.addPropertyStarts(starts);
+         break;
+     case UCharacterProperty.SRC_CASE:
+         try {
+             UCaseProps.getSingleton().addPropertyStarts(starts);
+         } catch (IOException e) {
+             throw new MissingResourceException(e.getMessage(),"","");
+         }
+         break;
+     case UCharacterProperty.SRC_BIDI:
+         try {
+             UBiDiProps.getSingleton().addPropertyStarts(starts);
+         } catch (IOException e) {
+             throw new MissingResourceException(e.getMessage(),"","");
+         }
+         break;
+     default:
+         // Nothing is cached for an unknown source.
+         throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
+     }
+
+     INCLUSIONS[src] = starts;
+     return starts;
+ }
+
+ /**
+ * Generic filter-based scanning code for UCD property UnicodeSets.
+ */
+ private UnicodeSet applyFilter(Filter filter, int src) {
+ // Walk through all Unicode characters, noting the start
+ // and end of each range for which filter.contains(c) is
+ // true. Add each range to a set.
+ //
+ // To improve performance, use the INCLUSIONS set, which
+ // encodes information about character ranges that are known
+ // to have identical properties, such as the CJK Ideographs
+ // from U+4E00 to U+9FA5. INCLUSIONS contains only the first
+ // characters of such ranges.
+ //
+ // TODO Where possible, instead of scanning over code points,
+ // use internal property data to initialize UnicodeSets for
+ // those properties. Scanning code points is slow.
+
+ clear();
+
+ int startHasProperty = -1;
+ UnicodeSet inclusions = getInclusions(src);
+ int limitRange = inclusions.getRangeCount();
+
+ for (int j=0; j