/** ******************************************************************************* * Copyright (C) 1996-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.impl; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.util.MissingResourceException; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacterCategory; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.RangeValueIterator; import com.ibm.icu.util.VersionInfo; /** *

Internal class used for Unicode character property database.

This class stores binary data read from uprops.icu. * It does not have the capability to parse the data into higher-level * information. It only returns bytes of information when required.

Due to the form most commonly used for retrieval, array of char is used * to store the binary data.

UCharacterProperty also contains information on accessing indexes to * significant points in the binary data.

Responsibility for molding the binary data into a more meaningful form lies with * UCharacter.

* @author Syn Wee Quek * @since release 2.1, february 1st 2002 */ public final class UCharacterProperty { // public data members ----------------------------------------------- /* * public singleton instance */ public static final UCharacterProperty INSTANCE; static { try { INSTANCE = new UCharacterProperty(); } catch (IOException e) { throw new MissingResourceException(e.getMessage(),"",""); } } /** * Trie data */ public CharTrie m_trie_; /** * Optimization * CharTrie index array */ public char[] m_trieIndex_; /** * Optimization * CharTrie data array */ public char[] m_trieData_; /** * Optimization * CharTrie data offset */ public int m_trieInitialValue_; /** * Unicode version */ public VersionInfo m_unicodeVersion_; /** * Latin capital letter i with dot above */ public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; /** * Latin small letter i with dot above */ public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; /** * Latin lowercase i */ public static final char LATIN_SMALL_LETTER_I_ = 0x69; /** * Character type mask */ public static final int TYPE_MASK = 0x1F; // uprops.h enum UPropertySource --------------------------------------- *** /** No source, not a supported property. 
*/ public static final int SRC_NONE=0; /** From uchar.c/uprops.icu main trie */ public static final int SRC_CHAR=1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; /** From unames.c/unames.icu */ public static final int SRC_NAMES=3; /** From ucase.c/ucase.icu */ public static final int SRC_CASE=4; /** From ubidi_props.c/ubidi.icu */ public static final int SRC_BIDI=5; /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ public static final int SRC_CHAR_AND_PROPSVEC=6; /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ public static final int SRC_CASE_AND_NORM=7; /** From normalizer2impl.cpp/nfc.nrm */ public static final int SRC_NFC=8; /** From normalizer2impl.cpp/nfkc.nrm */ public static final int SRC_NFKC=9; /** From normalizer2impl.cpp/nfkc_cf.nrm */ public static final int SRC_NFKC_CF=10; /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ public static final int SRC_NFC_CANON_ITER=11; /** One more than the highest UPropertySource (SRC_) constant. */ public static final int SRC_COUNT=12; // public methods ---------------------------------------------------- /** * Java friends implementation */ public void setIndexData(CharTrie.FriendAgent friendagent) { m_trieIndex_ = friendagent.getPrivateIndex(); m_trieData_ = friendagent.getPrivateData(); m_trieInitialValue_ = friendagent.getPrivateInitialValue(); } /** * Gets the property value at the index. * This is optimized. * Note this is a little different from CharTrie the index m_trieData_ * is never negative. 
* @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { // BMP codepoint 0000..D7FF or DC00..FFFF // optimized try { // using try for ch < 0 is faster than using an if statement return m_trieData_[ (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]; } catch (ArrayIndexOutOfBoundsException e) { return m_trieInitialValue_; } } if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { // lead surrogate D800..DBFF return m_trieData_[ (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]; } if (ch <= UTF16.CODEPOINT_MAX_VALUE) { // supplementary code point 10000..10FFFF // look at the construction of supplementary characters // trail forms the ends of it. return m_trie_.getSurrogateValue( UTF16.getLeadSurrogate(ch), (char)(ch & Trie.SURROGATE_MASK_)); } // ch is out of bounds // return m_dataOffset_ if there is an error, in this case we return // the default value: m_initialValue_ // we cannot assume that m_initialValue_ is at offset 0 // this is for optimization. return m_trieInitialValue_; // this all is an inlined form of return m_trie_.getCodePointValue(ch); } /** * Gets the unicode additional properties. * C version getUnicodeProperties. * @param codepoint codepoint whose additional properties is to be * retrieved * @param column The column index. 
* @return unicode properties */ public int getAdditional(int codepoint, int column) { if (column == -1) { return getProperty(codepoint); } if (column < 0 || column >= m_additionalColumnsCount_) { return 0; } return m_additionalVectors_[ m_additionalTrie_.getCodePointValue(codepoint) + column]; } static final int MY_MASK = UCharacterProperty.TYPE_MASK & ((1<Get the "age" of the code point.

The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character.

This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters.

The data is from the UCD file DerivedAge.txt.

This API does not check the validity of the codepoint.

* @param codepoint The code point. * @return the Unicode version number */ public VersionInfo getAge(int codepoint) { int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; return VersionInfo.getInstance( (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, version & LAST_NIBBLE_MASK_, 0, 0); } private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED); private static final int GC_CC_MASK = getMask(UCharacter.CONTROL); private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE); private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR); private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR); private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR); /** Mask constant for multiple UCharCategory bits (Z Separators). */ private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK; /** * Checks if c is in * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] * with space=\p{Whitespace} and Control=Cc. * Implements UCHAR_POSIX_GRAPH. * @internal */ private static final boolean isgraphPOSIX(int c) { /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ /* comparing ==0 returns FALSE for the categories mentioned */ return (getMask(UCharacter.getType(c))& (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK)) ==0; } private static final class BinaryProperties{ int column; int mask; public BinaryProperties(int column, int mask) { this.column = column; this.mask = mask; } } BinaryProperties[] binProps={ /* * column and mask values for binary properties from u_getUnicodeProperties(). * Must be in order of corresponding UProperty, * and there must be exactly one entry per binary UProperty. 
*/ new BinaryProperties( 1, ( 1 << ALPHABETIC_PROPERTY_) ), new BinaryProperties( 1, ( 1 << ASCII_HEX_DIGIT_PROPERTY_) ), new BinaryProperties( SRC_BIDI, 0 ), /* UCHAR_BIDI_CONTROL */ new BinaryProperties( SRC_BIDI, 0 ), /* UCHAR_BIDI_MIRRORED */ new BinaryProperties( 1, ( 1 << DASH_PROPERTY_) ), new BinaryProperties( 1, ( 1 << DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_) ), new BinaryProperties( 1, ( 1 << DEPRECATED_PROPERTY_) ), new BinaryProperties( 1, ( 1 << DIACRITIC_PROPERTY_) ), new BinaryProperties( 1, ( 1 << EXTENDER_PROPERTY_) ), new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_FULL_COMPOSITION_EXCLUSION */ new BinaryProperties( 1, ( 1 << GRAPHEME_BASE_PROPERTY_) ), new BinaryProperties( 1, ( 1 << GRAPHEME_EXTEND_PROPERTY_) ), new BinaryProperties( 1, ( 1 << GRAPHEME_LINK_PROPERTY_) ), new BinaryProperties( 1, ( 1 << HEX_DIGIT_PROPERTY_) ), new BinaryProperties( 1, ( 1 << HYPHEN_PROPERTY_) ), new BinaryProperties( 1, ( 1 << ID_CONTINUE_PROPERTY_) ), new BinaryProperties( 1, ( 1 << ID_START_PROPERTY_) ), new BinaryProperties( 1, ( 1 << IDEOGRAPHIC_PROPERTY_) ), new BinaryProperties( 1, ( 1 << IDS_BINARY_OPERATOR_PROPERTY_) ), new BinaryProperties( 1, ( 1 << IDS_TRINARY_OPERATOR_PROPERTY_) ), new BinaryProperties( SRC_BIDI, 0 ), /* UCHAR_JOIN_CONTROL */ new BinaryProperties( 1, ( 1 << LOGICAL_ORDER_EXCEPTION_PROPERTY_) ), new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_LOWERCASE */ new BinaryProperties( 1, ( 1 << MATH_PROPERTY_) ), new BinaryProperties( 1, ( 1 << NONCHARACTER_CODE_POINT_PROPERTY_) ), new BinaryProperties( 1, ( 1 << QUOTATION_MARK_PROPERTY_) ), new BinaryProperties( 1, ( 1 << RADICAL_PROPERTY_) ), new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_SOFT_DOTTED */ new BinaryProperties( 1, ( 1 << TERMINAL_PUNCTUATION_PROPERTY_) ), new BinaryProperties( 1, ( 1 << UNIFIED_IDEOGRAPH_PROPERTY_) ), new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_UPPERCASE */ new BinaryProperties( 1, ( 1 << WHITE_SPACE_PROPERTY_) ), new BinaryProperties( 1, ( 1 << 
XID_CONTINUE_PROPERTY_) ), new BinaryProperties( 1, ( 1 << XID_START_PROPERTY_) ), new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CASE_SENSITIVE */ new BinaryProperties( 1, ( 1 << S_TERM_PROPERTY_) ), new BinaryProperties( 1, ( 1 << VARIATION_SELECTOR_PROPERTY_) ), new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_NFD_INERT */ new BinaryProperties( SRC_NFKC, 0 ), /* UCHAR_NFKD_INERT */ new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_NFC_INERT */ new BinaryProperties( SRC_NFKC, 0 ), /* UCHAR_NFKC_INERT */ new BinaryProperties( SRC_NFC_CANON_ITER, 0 ), /* UCHAR_SEGMENT_STARTER */ new BinaryProperties( 1, ( 1 << PATTERN_SYNTAX) ), new BinaryProperties( 1, ( 1 << PATTERN_WHITE_SPACE) ), new BinaryProperties( SRC_CHAR_AND_PROPSVEC, 0 ), /* UCHAR_POSIX_ALNUM */ new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_BLANK */ new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_GRAPH */ new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_PRINT */ new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_XDIGIT */ new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CASED */ new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CASE_IGNORABLE */ new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CHANGES_WHEN_LOWERCASED */ new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CHANGES_WHEN_UPPERCASED */ new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CHANGES_WHEN_TITLECASED */ new BinaryProperties( SRC_CASE_AND_NORM, 0 ), /* UCHAR_CHANGES_WHEN_CASEFOLDED */ new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CHANGES_WHEN_CASEMAPPED */ new BinaryProperties( SRC_NFKC_CF, 0 ), /* UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED */ }; /** *

Check a binary Unicode property for a code point.

Unicode, especially in version 3.2, defines many more properties * than the original set in UnicodeData.txt.

This API is intended to reflect Unicode properties as defined in * the Unicode Character Database (UCD) and Unicode Technical Reports * (UTR).

For details about the properties see * http://www.unicode.org/.

For names of Unicode properties see the UCD file * PropertyAliases.txt.

This API does not check the validity of the codepoint.

Important: If ICU is built with UCD files from Unicode versions * below 3.2, then properties marked with "new" are not or * not fully available.

* @param c Code point to test. * @param which selector constant from com.ibm.icu.lang.UProperty, * identifies which binary property to check. * @return true or false according to the binary Unicode property value * for ch. Also false if property is out of bounds or if the * Unicode version does not have data for the property at all, or * not for this code point. * @see com.ibm.icu.lang.UProperty */ public boolean hasBinaryProperty(int c, int which) { if(which=0x41 && (c<=0x46 || c>=0x61)) || (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) ) { return true; } return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER; default: break; } } else if(column==SRC_CHAR_AND_PROPSVEC) { switch(which) { case UProperty.POSIX_ALNUM: return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c); default: break; } } else if(column==SRC_CASE_AND_NORM) { String nfd; switch(which) { case UProperty.CHANGES_WHEN_CASEFOLDED: nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c); if(nfd!=null) { /* c has a decomposition */ c=nfd.codePointAt(0); if(Character.charCount(c)!=nfd.length()) { /* multiple code points */ c=-1; } } else if(c<0) { return false; /* protect against bad input */ } if(c>=0) { /* single code point */ try { UCaseProps csp=UCaseProps.getSingleton(); UCaseProps.dummyStringBuffer.setLength(0); return csp.toFullFolding(c, UCaseProps.dummyStringBuffer, UCharacter.FOLD_CASE_DEFAULT)>=0; } catch (IOException e) { return false; } } else { String folded=UCharacter.foldCase(nfd, true); return !folded.equals(nfd); } default: break; } } } } return false; } public final int getSource(int which) { if(which * Note this is for internal use hence no checks for the validity of the * surrogate characters are done * @param lead lead surrogate character * @param trail trailing surrogate character * @return code point of the supplementary character */ public static int getRawSupplementary(char lead, char trail) { return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; 
} /** *

* Unicode property names and property value names are compared * "loosely". Property[Value]Aliases.txt say: * * "With loose matching of property names, the case distinctions, * whitespace, and '_' are ignored." * *

* This function does just that, for ASCII (char *) name strings. * It is almost identical to ucnv_compareNames() but also ignores * ASCII White_Space characters (U+0009..U+000d). *

* @param name1 name to compare * @param name2 name to compare * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0 * if name1 is greater than name2. */ /* to be implemented in 2.4 * public static int comparePropertyNames(String name1, String name2) { int result = 0; int i1 = 0; int i2 = 0; while (true) { char ch1 = 0; char ch2 = 0; // Ignore delimiters '-', '_', and ASCII White_Space if (i1 < name1.length()) { ch1 = name1.charAt(i1 ++); } while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t' || ch1 == '\n' // synwee what is || ch1 == '\v' || ch1 == '\f' || ch1=='\r') { if (i1 < name1.length()) { ch1 = name1.charAt(i1 ++); } else { ch1 = 0; } } if (i2 < name2.length()) { ch2 = name2.charAt(i2 ++); } while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t' || ch2 == '\n' // synwee what is || ch1 == '\v' || ch2 == '\f' || ch2=='\r') { if (i2 < name2.length()) { ch2 = name2.charAt(i2 ++); } else { ch2 = 0; } } // If we reach the ends of both strings then they match if (ch1 == 0 && ch2 == 0) { return 0; } // Case-insensitive comparison if (ch1 != ch2) { result = Character.toLowerCase(ch1) - Character.toLowerCase(ch2); if (result != 0) { return result; } } } } */ /** * Checks if the argument c is to be treated as a white space in ICU * rules. Usually ICU rule white spaces are ignored unless quoted. * Equivalent to test for Pattern_White_Space Unicode property. * Stable set of characters, won't change. * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ * @param c codepoint to check * @return true if c is a ICU white space */ public static boolean isRuleWhiteSpace(int c) { /* "white space" in the sense of ICU rule parsers This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 Equivalent to test for Pattern_White_Space Unicode property. 
*/ return (c >= 0x0009 && c <= 0x2029 && (c <= 0x000D || c == 0x0020 || c == 0x0085 || c == 0x200E || c == 0x200F || c >= 0x2028)); } /** * Get the the maximum values for some enum/int properties. * @return maximum values for the integer properties. */ public int getMaxValues(int column) { // return m_maxBlockScriptValue_; switch(column) { case 0: return m_maxBlockScriptValue_; case 2: return m_maxJTGValue_; default: return 0; } } /** * Gets the type mask * @param type character type * @return mask */ public static final int getMask(int type) { return 1 << type; } // protected variables ----------------------------------------------- /** * Extra property trie */ CharTrie m_additionalTrie_; /** * Extra property vectors, 1st column for age and second for binary * properties. */ int m_additionalVectors_[]; /** * Number of additional columns */ int m_additionalColumnsCount_; /** * Maximum values for block, bits used as in vector word * 0 */ int m_maxBlockScriptValue_; /** * Maximum values for script, bits used as in vector word * 0 */ int m_maxJTGValue_; // private variables ------------------------------------------------- /** * Default name of the datafile */ private static final String DATA_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/uprops.icu"; /** * Default buffer size of datafile */ private static final int DATA_BUFFER_SIZE_ = 25000; /** * Shift value for lead surrogate to form a supplementary character. */ private static final int LEAD_SURROGATE_SHIFT_ = 10; /** * Offset to add to combined surrogate pair to avoid msking. */ private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE - (UTF16.SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_) - UTF16.TRAIL_SURROGATE_MIN_VALUE; // additional properties ---------------------------------------------- /** * Additional properties used in internal trie data */ /* * Properties in vector word 1 * Each bit encodes one binary property. 
* The following constants represent the bit number, use 1<0) { /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_); RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element(); while(propsVectorsIter.next(propsVectorsResult)){ set.add(propsVectorsResult.start); } } } /*---------------------------------------------------------------- * Inclusions list *----------------------------------------------------------------*/ /* * Return a set of characters for property enumeration. * The set implicitly contains 0x110000 as well, which is one more than the highest * Unicode code point. * * This set is used as an ordered list - its code points are ordered, and * consecutive code points (in Unicode code point order) in the set define a range. * For each two consecutive characters (start, limit) in the set, * all of the UCD/normalization and related properties for * all code points start..limit-1 are all the same, * except for character names and ISO comments. * * All Unicode code points U+0000..U+10ffff are covered by these ranges. * The ranges define a partition of the Unicode code space. * ICU uses the inclusions set to enumerate properties for generating * UnicodeSets containing all code points that have a certain property value. * * The Inclusion List is generated from the UCD. It is generated * by enumerating the data tries, and code points for hardcoded properties * are added as well. * * -------------------------------------------------------------------------- * * The following are ideas for getting properties-unique code point ranges, * with possible optimizations beyond the current implementation. * These optimizations would require more code and be more fragile. * The current implementation generates one single list (set) for all properties. 
* * To enumerate properties efficiently, one needs to know ranges of * repetitive values, so that the value of only each start code point * can be applied to the whole range. * This information is in principle available in the uprops.icu/unorm.icu data. * * There are two obstacles: * * 1. Some properties are computed from multiple data structures, * making it necessary to get repetitive ranges by intersecting * ranges from multiple tries. * * 2. It is not economical to write code for getting repetitive ranges * that are precise for each of some 50 properties. * * Compromise ideas: * * - Get ranges per trie, not per individual property. * Each range contains the same values for a whole group of properties. * This would generate currently five range sets, two for uprops.icu tries * and three for unorm.icu tries. * * - Combine sets of ranges for multiple tries to get sufficient sets * for properties, e.g., the uprops.icu main and auxiliary tries * for all non-normalization properties. * * Ideas for representing ranges and combining them: * * - A UnicodeSet could hold just the start code points of ranges. * Multiple sets are easily combined by or-ing them together. * * - Alternatively, a UnicodeSet could hold each even-numbered range. * All ranges could be enumerated by using each start code point * (for the even-numbered ranges) as well as each limit (end+1) code point * (for the odd-numbered ranges). * It should be possible to combine two such sets by xor-ing them, * but no more than two. * * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, * but the first one is certainly simpler and applicable for combining more than * two range sets. * * It is possible to combine all range sets for all uprops/unorm tries into one * set that can be used for all properties. * As an optimization, there could be less-combined range sets for certain * groups of properties. 
* The relationship of which less-combined range set to use for which property * depends on the implementation of the properties and must be hardcoded * - somewhat error-prone and higher maintenance but can be tested easily * by building property sets "the simple way" in test code. * * --- * * Do not use a UnicodeSet pattern because that causes infinite recursion; * UnicodeSet depends on the inclusions set. * * --- * * getInclusions() is commented out starting 2005-feb-12 because * UnicodeSet now calls the uxyz_addPropertyStarts() directly, * and only for the relevant property source. */ /* public UnicodeSet getInclusions() { UnicodeSet set = new UnicodeSet(); NormalizerImpl.addPropertyStarts(set); addPropertyStarts(set); return set; } */ }