2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 package com.ibm.icu.impl;
10 import java.io.BufferedInputStream;
11 import java.io.DataInputStream;
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.util.Iterator;
15 import java.util.MissingResourceException;
17 import com.ibm.icu.lang.UCharacter;
18 import com.ibm.icu.lang.UCharacter.HangulSyllableType;
19 import com.ibm.icu.lang.UCharacter.NumericType;
20 import com.ibm.icu.lang.UCharacterCategory;
21 import com.ibm.icu.lang.UProperty;
22 import com.ibm.icu.lang.UScript;
23 import com.ibm.icu.text.UTF16;
24 import com.ibm.icu.text.UnicodeSet;
25 import com.ibm.icu.util.VersionInfo;
28 * <p>Internal class used for Unicode character property database.</p>
29 * <p>This class stores binary data read from uprops.icu.
30 * It does not have the capability to parse the data into more high-level
31 * information. It only returns bytes of information when required.</p>
32 * <p>Due to the form most commonly used for retrieval, array of char is used
33 * to store the binary data.</p>
34 * <p>UCharacterPropertyDB also contains information on accessing indexes to
35 * significant points in the binary data.</p>
36 * <p>Responsibility for molding the binary data into a more meaningful form lies with
37 * <a href=UCharacter.html>UCharacter</a>.</p>
38 * @author Syn Wee Quek
39 * @since release 2.1, February 1st 2002
42 public final class UCharacterProperty
44 // public data members -----------------------------------------------
47 * public singleton instance
49 public static final UCharacterProperty INSTANCE;
54 public Trie2_16 m_trie_;
58 public VersionInfo m_unicodeVersion_;
60 * Latin capital letter i with dot above
62 public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
64 * Latin small letter dotless i
66 public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
70 public static final char LATIN_SMALL_LETTER_I_ = 0x69;
74 public static final int TYPE_MASK = 0x1F;
76 // uprops.h enum UPropertySource --------------------------------------- ***
78 /** No source, not a supported property. */
79 public static final int SRC_NONE=0;
80 /** From uchar.c/uprops.icu main trie */
81 public static final int SRC_CHAR=1;
82 /** From uchar.c/uprops.icu properties vectors trie */
83 public static final int SRC_PROPSVEC=2;
84 /** From unames.c/unames.icu */
85 public static final int SRC_NAMES=3;
86 /** From ucase.c/ucase.icu */
87 public static final int SRC_CASE=4;
88 /** From ubidi_props.c/ubidi.icu */
89 public static final int SRC_BIDI=5;
90 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
91 public static final int SRC_CHAR_AND_PROPSVEC=6;
92 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
93 public static final int SRC_CASE_AND_NORM=7;
94 /** From normalizer2impl.cpp/nfc.nrm */
95 public static final int SRC_NFC=8;
96 /** From normalizer2impl.cpp/nfkc.nrm */
97 public static final int SRC_NFKC=9;
98 /** From normalizer2impl.cpp/nfkc_cf.nrm */
99 public static final int SRC_NFKC_CF=10;
100 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
101 public static final int SRC_NFC_CANON_ITER=11;
102 /** One more than the highest UPropertySource (SRC_) constant. */
103 public static final int SRC_COUNT=12;
105 // public methods ----------------------------------------------------
108 * Gets the main property value for code point ch.
109 * @param ch code point whose property value is to be retrieved
110 * @return property value of code point
112 public final int getProperty(int ch)
114 return m_trie_.get(ch);
118 * Gets the unicode additional properties.
119 * Java version of C u_getUnicodeProperties().
120 * @param codepoint codepoint whose additional properties is to be
122 * @param column The column index.
123 * @return unicode properties
125 public int getAdditional(int codepoint, int column) {
127 if (column >= m_additionalColumnsCount_) {
130 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
133 static final int MY_MASK = UCharacterProperty.TYPE_MASK
134 & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
135 (1<<UCharacterCategory.LOWERCASE_LETTER) |
136 (1<<UCharacterCategory.TITLECASE_LETTER) |
137 (1<<UCharacterCategory.MODIFIER_LETTER) |
138 (1<<UCharacterCategory.OTHER_LETTER));
142 * <p>Get the "age" of the code point.</p>
143 * <p>The "age" is the Unicode version when the code point was first
144 * designated (as a non-character or for Private Use) or assigned a
146 * <p>This can be useful to avoid emitting code points to receiving
147 * processes that do not accept newer characters.</p>
148 * <p>The data is from the UCD file DerivedAge.txt.</p>
149 * <p>This API does not check the validity of the codepoint.</p>
150 * @param codepoint The code point.
151 * @return the Unicode version number
153 public VersionInfo getAge(int codepoint)
155 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
156 return VersionInfo.getInstance(
157 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
158 version & LAST_NIBBLE_MASK_, 0, 0);
161 private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
162 private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
163 private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
164 private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
165 private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
166 private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
167 /** Mask constant for multiple UCharCategory bits (Z Separators). */
168 private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
172 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
173 * with space=\p{Whitespace} and Control=Cc.
174 * Implements UCHAR_POSIX_GRAPH.
177 private static final boolean isgraphPOSIX(int c) {
178 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
179 /* comparing ==0 returns FALSE for the categories mentioned */
180 return (getMask(UCharacter.getType(c))&
181 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
185 // binary properties --------------------------------------------------- ***
// Base implementation of one binary property. Instances are stored in
// binProps[]; subclasses and anonymous classes override contains().
// NOTE(review): the declaration of the `mask` field and both constructor
// bodies are not visible in this excerpt -- confirm against the full file.
187 private class BinaryProperty {
188 int column; // SRC_PROPSVEC column, or "source" if mask==0
// Constructor for properties stored as a bit (mask) in a properties-vector column.
190 BinaryProperty(int column, int mask) {
// Constructor taking only a source (one of the SRC_ constants); presumably
// leaves mask==0 so that getSource() returns the source -- TODO confirm.
194 BinaryProperty(int source) {
// Returns the data source: `column` doubles as the source when mask==0,
// otherwise the property lives in the properties vectors.
198 final int getSource() {
199 return mask==0 ? column : SRC_PROPSVEC;
// Default membership test: checks the mask bits in the vector word of
// `column` for code point c.
201 boolean contains(int c) {
202 // systematic, directly stored properties
203 return (getAdditional(c, column)&mask)!=0;
// Binary property whose answer is delegated to UCaseProps.
// NOTE(review): the `which` field and the constructor body are not visible
// in this excerpt.
207 private class CaseBinaryProperty extends BinaryProperty { // case mapping properties
209 CaseBinaryProperty(int which) {
// Asks the case-properties singleton whether c has the property `which`.
213 boolean contains(int c) {
214 return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
// Binary property answered by a normalizer's isInert() test.
// NOTE(review): the `which` field and the constructor body are not visible
// in this excerpt.
218 private class NormInertBinaryProperty extends BinaryProperty { // UCHAR_NF*_INERT properties
220 NormInertBinaryProperty(int source, int which) {
// The normalizer is selected by the offset of `which` from UProperty.NFD_INERT.
224 boolean contains(int c) {
225 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
// Table of binary-property implementations, indexed directly by the
// UProperty binary property constant (see hasBinaryProperty/getSource).
// Entries of the form BinaryProperty(1, 1<<X) test bit X of vector column 1;
// the other entries override contains() with custom logic.
// NOTE(review): closing braces and several short lines of the anonymous
// classes are not visible in this excerpt.
229 BinaryProperty[] binProps={
231 * Binary-property implementations must be in order of corresponding UProperty,
232 * and there must be exactly one entry per binary UProperty.
234 new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
235 new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
236 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_CONTROL
237 boolean contains(int c) {
238 return UBiDiProps.INSTANCE.isBidiControl(c);
241 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_MIRRORED
242 boolean contains(int c) {
243 return UBiDiProps.INSTANCE.isMirrored(c);
246 new BinaryProperty(1, (1<<DASH_PROPERTY_)),
247 new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
248 new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
249 new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
250 new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
251 new BinaryProperty(SRC_NFC) { // UCHAR_FULL_COMPOSITION_EXCLUSION
252 boolean contains(int c) {
253 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
254 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
255 return impl.isCompNo(impl.getNorm16(c));
258 new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
259 new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
260 new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
261 new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
262 new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
263 new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
264 new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
265 new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
266 new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
267 new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
268 new BinaryProperty(SRC_BIDI) { // UCHAR_JOIN_CONTROL
269 boolean contains(int c) {
270 return UBiDiProps.INSTANCE.isJoinControl(c);
273 new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
274 new CaseBinaryProperty(UProperty.LOWERCASE),
275 new BinaryProperty(1, (1<<MATH_PROPERTY_)),
276 new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
277 new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
278 new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
279 new CaseBinaryProperty(UProperty.SOFT_DOTTED),
280 new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
281 new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
282 new CaseBinaryProperty(UProperty.UPPERCASE),
283 new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
284 new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
285 new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
286 new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
287 new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
288 new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
289 new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
290 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
291 new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
292 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
293 new BinaryProperty(SRC_NFC_CANON_ITER) { // UCHAR_SEGMENT_STARTER
294 boolean contains(int c) {
295 return Norm2AllModes.getNFCInstance().impl.
296 ensureCanonIterData().isCanonSegmentStarter(c);
299 new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
300 new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
301 new BinaryProperty(SRC_CHAR_AND_PROPSVEC) { // UCHAR_POSIX_ALNUM
302 boolean contains(int c) {
303 return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
306 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_BLANK
307 boolean contains(int c) {
308 // "horizontal space"
// NOTE(review): the condition selecting between the two returns below is
// not visible in this excerpt.
310 return c==9 || c==0x20; /* TAB or SPACE */
313 return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
317 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_GRAPH
318 boolean contains(int c) {
319 return isgraphPOSIX(c);
322 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_PRINT
323 boolean contains(int c) {
325 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
327 * The only cntrl character in graph+blank is TAB (in blank).
328 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
330 return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
333 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_XDIGIT
334 boolean contains(int c) {
335 /* check ASCII and Fullwidth ASCII a-fA-F */
// The two range tests below accept A-F / a-f and their fullwidth forms;
// the statement that uses them is not visible in this excerpt.
337 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
338 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
342 return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
345 new CaseBinaryProperty(UProperty.CASED),
346 new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
347 new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
348 new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
349 new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
350 new BinaryProperty(SRC_CASE_AND_NORM) { // UCHAR_CHANGES_WHEN_CASEFOLDED
351 boolean contains(int c) {
352 String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
354 /* c has a decomposition */
355 c=nfd.codePointAt(0);
356 if(Character.charCount(c)!=nfd.length()) {
357 /* multiple code points */
361 return false; /* protect against bad input */
364 /* single code point */
365 UCaseProps csp=UCaseProps.INSTANCE;
// NOTE(review): dummyStringBuilder is a shared static buffer that is reset
// and reused here; confirm thread-safety expectations with UCaseProps.
366 UCaseProps.dummyStringBuilder.setLength(0);
367 return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
368 UCharacter.FOLD_CASE_DEFAULT)>=0;
// Multi-code-point case: the property holds when case folding changes the
// decomposition string.
370 String folded=UCharacter.foldCase(nfd, true);
371 return !folded.equals(nfd);
375 new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
376 new BinaryProperty(SRC_NFKC_CF) { // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
377 boolean contains(int c) {
378 Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
379 String src=UTF16.valueOf(c);
380 StringBuilder dest=new StringBuilder();
381 // Small destCapacity for NFKC_CF(c).
382 Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
383 kcf.compose(src, 0, src.length(), false, true, buffer);
// The property holds when the composed output differs from the input.
384 return !Normalizer2Impl.UTF16Plus.equal(dest, src);
389 public boolean hasBinaryProperty(int c, int which) {
390 if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
391 // not a known binary property
394 return binProps[which].contains(c);
398 // int-value and enumerated properties --------------------------------- ***
400 public int getType(int c) {
401 return getProperty(c)&TYPE_MASK;
405 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
406 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
408 private static final int /* UHangulSyllableType */ gcbToHst[]={
409 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
410 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
411 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
412 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
413 HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
414 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
415 HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
416 HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
417 HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
418 HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
420 * Omit GCB values beyond what we need for hst.
421 * The code below checks for the array length.
// Base implementation of one int-valued/enumerated property. Instances are
// stored in intProps[]; subclasses override getValue()/getMaxValue().
// NOTE(review): the `mask` and `shift` field declarations and both
// constructor bodies are not visible in this excerpt.
425 private class IntProperty {
426 int column; // SRC_PROPSVEC column, or "source" if mask==0
// Constructor for values stored as a bit field (mask, shift) in a vector column.
429 IntProperty(int column, int mask, int shift) {
// Constructor taking only a source (one of the SRC_ constants); presumably
// leaves mask==0 -- TODO confirm.
434 IntProperty(int source) {
// Returns the data source: `column` doubles as the source when mask==0.
438 final int getSource() {
439 return mask==0 ? column : SRC_PROPSVEC;
// Default value lookup: extracts the masked bit field from the vector word.
441 int getValue(int c) {
442 // systematic, directly stored properties
443 return (getAdditional(c, column)&mask)>>>shift;
// Default maximum: extracts the same bit field from the stored max-values word.
445 int getMaxValue(int which) {
446 return (getMaxValues(column)&mask)>>>shift;
// Int property whose maximum value is supplied by UBiDiProps.
// NOTE(review): the constructor is not visible in this excerpt.
450 private class BiDiIntProperty extends IntProperty {
454 int getMaxValue(int which) {
455 return UBiDiProps.INSTANCE.getMaxValue(which);
// Int property base for the combining-class properties in intProps[].
// NOTE(review): the constructor body and the body of getMaxValue() are not
// visible in this excerpt.
459 private class CombiningClassIntProperty extends IntProperty {
460 CombiningClassIntProperty(int source) {
463 int getMaxValue(int which) {
// Int property answered by a normalizer's quick-check value.
// NOTE(review): the `which`/`max` fields, the constructor body and the body
// of getMaxValue() are not visible in this excerpt.
468 private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
471 NormQuickCheckIntProperty(int source, int which, int max) {
// The normalizer is selected by the offset of `which` from UProperty.NFD_QUICK_CHECK.
476 int getValue(int c) {
477 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
479 int getMaxValue(int which) {
// Table of int-property implementations, indexed by
// (property constant - UProperty.INT_START); see getIntPropertyValue().
// Entries of the form IntProperty(column, mask, shift) read a bit field from
// a properties-vector column; the others override getValue()/getMaxValue().
// NOTE(review): closing braces and several short lines of the anonymous
// classes are not visible in this excerpt.
484 IntProperty intProps[]={
485 new BiDiIntProperty() { // BIDI_CLASS
486 int getValue(int c) {
487 return UBiDiProps.INSTANCE.getClass(c);
490 new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
491 new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS
492 int getValue(int c) {
493 return Norm2AllModes.getNFCInstance().decomp.getCombiningClass(c);
496 new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
497 new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
498 new IntProperty(SRC_CHAR) { // GENERAL_CATEGORY
499 int getValue(int c) {
502 int getMaxValue(int which) {
503 return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
506 new BiDiIntProperty() { // JOINING_GROUP
507 int getValue(int c) {
508 return UBiDiProps.INSTANCE.getJoiningGroup(c);
511 new BiDiIntProperty() { // JOINING_TYPE
512 int getValue(int c) {
513 return UBiDiProps.INSTANCE.getJoiningType(c);
516 new IntProperty(2, LB_MASK, LB_SHIFT), // LINE_BREAK
517 new IntProperty(SRC_CHAR) { // NUMERIC_TYPE
518 int getValue(int c) {
519 return ntvGetType(getNumericTypeValue(getProperty(c)));
521 int getMaxValue(int which) {
522 return NumericType.COUNT-1;
// Script entry: the value comes from UScript.getScript(); the mask passed to
// the constructor is still used by the inherited getMaxValue().
525 new IntProperty(0, SCRIPT_MASK_, 0) {
526 int getValue(int c) {
527 return UScript.getScript(c);
530 new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE
531 int getValue(int c) {
532 /* see comments on gcbToHst[] above */
533 int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
// gcbToHst[] is deliberately shorter than the full GCB value range, hence
// the bounds check before indexing.
534 if(gcb<gcbToHst.length) {
535 return gcbToHst[gcb];
537 return HangulSyllableType.NOT_APPLICABLE;
540 int getMaxValue(int which) {
541 return HangulSyllableType.COUNT-1;
544 // max=1=YES -- these are never "maybe", only "no" or "yes"
545 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
546 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
548 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
549 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
550 new CombiningClassIntProperty(SRC_NFC) { // LEAD_CANONICAL_COMBINING_CLASS
551 int getValue(int c) {
// Upper byte of the 16-bit FCD value.
552 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
555 new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS
556 int getValue(int c) {
// Lower byte of the 16-bit FCD value.
557 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
560 new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK
561 new IntProperty(2, SB_MASK, SB_SHIFT), // SENTENCE_BREAK
562 new IntProperty(2, WB_MASK, WB_SHIFT), // WORD_BREAK
563 new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
564 int getValue(int c) {
565 return UBiDiProps.INSTANCE.getPairedBracketType(c);
570 public int getIntPropertyValue(int c, int which) {
571 if(which<UProperty.INT_START) {
572 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
573 return binProps[which].contains(c) ? 1 : 0;
575 } else if(which<UProperty.INT_LIMIT) {
576 return intProps[which-UProperty.INT_START].getValue(c);
577 } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
578 return getMask(getType(c));
580 return 0; // undefined
583 public int getIntPropertyMaxValue(int which) {
584 if(which<UProperty.INT_START) {
585 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
586 return 1; // maximum TRUE for all binary properties
588 } else if(which<UProperty.INT_LIMIT) {
589 return intProps[which-UProperty.INT_START].getMaxValue(which);
591 return -1; // undefined
// Returns which data source (one of the SRC_ constants) backs property
// `which`. Binary and int properties delegate to their table entries; the
// remaining ranges are resolved case by case.
// NOTE(review): the switch headers, some case labels and most of the return
// statements of the later branches are not visible in this excerpt.
594 public final int getSource(int which) {
595 if(which<UProperty.BINARY_START) {
596 return SRC_NONE; /* undefined */
597 } else if(which<UProperty.BINARY_LIMIT) {
598 return binProps[which].getSource();
599 } else if(which<UProperty.INT_START) {
600 return SRC_NONE; /* undefined */
601 } else if(which<UProperty.INT_LIMIT) {
// intProps is indexed relative to the first int property
602 return intProps[which-UProperty.INT_START].getSource();
603 } else if(which<UProperty.STRING_START) {
605 case UProperty.GENERAL_CATEGORY_MASK:
606 case UProperty.NUMERIC_VALUE:
612 } else if(which<UProperty.STRING_LIMIT) {
617 case UProperty.BIDI_MIRRORING_GLYPH:
// case-mapping and case-folding string properties share one source
620 case UProperty.CASE_FOLDING:
621 case UProperty.LOWERCASE_MAPPING:
622 case UProperty.SIMPLE_CASE_FOLDING:
623 case UProperty.SIMPLE_LOWERCASE_MAPPING:
624 case UProperty.SIMPLE_TITLECASE_MAPPING:
625 case UProperty.SIMPLE_UPPERCASE_MAPPING:
626 case UProperty.TITLECASE_MAPPING:
627 case UProperty.UPPERCASE_MAPPING:
630 case UProperty.ISO_COMMENT:
632 case UProperty.UNICODE_1_NAME:
640 case UProperty.SCRIPT_EXTENSIONS:
643 return SRC_NONE; /* undefined */
649 * Forms a supplementary code point from the argument character<br>
650 * Note this is for internal use hence no checks for the validity of the
651 * surrogate characters are done
652 * @param lead lead surrogate character
653 * @param trail trailing surrogate character
654 * @return code point of the supplementary character
656 public static int getRawSupplementary(char lead, char trail)
658 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
663 * Unicode property names and property value names are compared
664 * "loosely". Property[Value]Aliases.txt say:
666 * "With loose matching of property names, the case distinctions,
667 * whitespace, and '_' are ignored."
671 * This function does just that, for ASCII (char *) name strings.
672 * It is almost identical to ucnv_compareNames() but also ignores
673 * ASCII White_Space characters (U+0009..U+000d).
675 * @param name1 name to compare
676 * @param name2 name to compare
677 * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
678 * if name1 is greater than name2.
680 /* to be implemented in 2.4
681 * public static int comparePropertyNames(String name1, String name2)
689 // Ignore delimiters '-', '_', and ASCII White_Space
690 if (i1 < name1.length()) {
691 ch1 = name1.charAt(i1 ++);
693 while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
694 || ch1 == '\n' // synwee what is || ch1 == '\v'
695 || ch1 == '\f' || ch1=='\r') {
696 if (i1 < name1.length()) {
697 ch1 = name1.charAt(i1 ++);
703 if (i2 < name2.length()) {
704 ch2 = name2.charAt(i2 ++);
706 while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
707 || ch2 == '\n' // synwee what is || ch1 == '\v'
708 || ch2 == '\f' || ch2=='\r') {
709 if (i2 < name2.length()) {
710 ch2 = name2.charAt(i2 ++);
717 // If we reach the ends of both strings then they match
718 if (ch1 == 0 && ch2 == 0) {
722 // Case-insensitive comparison
724 result = Character.toLowerCase(ch1)
725 - Character.toLowerCase(ch2);
735 * Get the maximum values for some enum/int properties.
736 * @return maximum values for the integer properties.
// Returns the stored max-values word for a properties-vector column; used
// by IntProperty.getMaxValue() to derive per-property maxima.
// NOTE(review): the switch on `column` and its case labels are not visible
// in this excerpt, so which column maps to which field cannot be confirmed
// from here.
738 public int getMaxValues(int column)
740 // return m_maxBlockScriptValue_;
744 return m_maxBlockScriptValue_;
746 return m_maxJTGValue_;
754 * @param type character type
757 public static final int getMask(int type)
764 * Returns the digit values of characters like 'A' - 'Z', normal,
765 * half-width and full-width. This method assumes that the other digit
766 * characters are checked by the calling method.
767 * @param ch character to test
768 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
769 * its corresponding digit will be returned.
771 public static int getEuropeanDigit(int ch) {
772 if ((ch > 0x7a && ch < 0xff21)
773 || ch < 0x41 || (ch > 0x5a && ch < 0x61)
774 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
778 // ch >= 0x41 or ch < 0x61
779 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
783 return ch + 10 - 0xff21;
785 // ch >= 0xff41 && ch <= 0xff5a
786 return ch + 10 - 0xff41;
// Computes a decimal-digit value for c from the numeric type/value field of
// its main property word, expressed relative to NTV_DECIMAL_START_.
// NOTE(review): the handling of `value` (range check and return statements)
// is not visible in this excerpt.
789 public int digit(int c) {
790 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
// Returns an int numeric value for c, decoded from the numeric type/value
// (ntv) field of the main property word. The ntv ranges are delimited by
// the NTV_*_START_ constants and tested in ascending order.
// NOTE(review): the first condition and the bodies of several branches
// (fractions, large values, base-60 exponent handling, final return) are not
// visible in this excerpt.
798 public int getNumericValue(int c) {
799 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
800 int ntv = getNumericTypeValue(getProperty(c));
// letters such as 'A'-'Z' are handled through getEuropeanDigit()
803 return getEuropeanDigit(c);
804 } else if(ntv<NTV_DIGIT_START_) {
// decimal digit
806 return ntv-NTV_DECIMAL_START_;
807 } else if(ntv<NTV_NUMERIC_START_) {
// other digit
809 return ntv-NTV_DIGIT_START_;
810 } else if(ntv<NTV_FRACTION_START_) {
// small integer
812 return ntv-NTV_NUMERIC_START_;
813 } else if(ntv<NTV_LARGE_START_) {
816 } else if(ntv<NTV_BASE60_START_) {
817 /* large, single-significant-digit integer */
// mantissa and decimal exponent are packed as (ntv>>5)-14 and (ntv&0x1f)+2
818 int mant=(ntv>>5)-14;
819 int exp=(ntv&0x1f)+2;
// only values with exp<9, or exp==9 with mant<=2, enter this branch --
// presumably those that fit in an int; TODO confirm
820 if(exp<9 || (exp==9 && mant<=2)) {
829 } else if(ntv<NTV_RESERVED_START_) {
830 /* sexagesimal (base 60) integer */
831 int numValue=(ntv>>2)-0xbf;
// multiplication by 60^4 for the largest exponent
836 numValue*=60*60*60*60;
// Returns the numeric value of c as a double, decoded from the numeric
// type/value (ntv) field of the main property word; returns
// UCharacter.NO_NUMERIC_VALUE where no value applies.
// NOTE(review): the first condition and parts of the large-value and
// base-60 branches are not visible in this excerpt.
859 public double getUnicodeNumericValue(int c) {
860 // equivalent to c version double u_getNumericValue(UChar32 c)
861 int ntv = getNumericTypeValue(getProperty(c));
864 return UCharacter.NO_NUMERIC_VALUE;
865 } else if(ntv<NTV_DIGIT_START_) {
// decimal digit
867 return ntv-NTV_DECIMAL_START_;
868 } else if(ntv<NTV_NUMERIC_START_) {
// other digit
870 return ntv-NTV_DIGIT_START_;
871 } else if(ntv<NTV_FRACTION_START_) {
// small integer
873 return ntv-NTV_NUMERIC_START_;
874 } else if(ntv<NTV_LARGE_START_) {
// fraction: numerator and denominator are packed into ntv
876 int numerator=(ntv>>4)-12;
877 int denominator=(ntv&0xf)+1;
// cast before dividing so that the division is done in floating point
878 return (double)numerator/denominator;
879 } else if(ntv<NTV_BASE60_START_) {
880 /* large, single-significant-digit integer */
882 int mant=(ntv>>5)-14;
883 int exp=(ntv&0x1f)+2;
886 /* multiply by 10^exp without math.h */
907 } else if(ntv<NTV_RESERVED_START_) {
908 /* sexagesimal (base 60) integer */
909 int numValue=(ntv>>2)-0xbf;
// multiplication by 60^4 for the largest exponent
914 numValue*=60*60*60*60;
933 return UCharacter.NO_NUMERIC_VALUE;
937 // protected variables -----------------------------------------------
940 * Extra property trie
942 Trie2_16 m_additionalTrie_;
944 * Extra property vectors, 1st column for age and second for binary
947 int m_additionalVectors_[];
949 * Number of additional columns
951 int m_additionalColumnsCount_;
953 * Maximum values for block, bits used as in vector word
956 int m_maxBlockScriptValue_;
958 * Maximum values for script, bits used as in vector word
964 * Script_Extensions data
966 public char[] m_scriptExtensions_;
968 // private variables -------------------------------------------------
971 * Default name of the datafile
973 private static final String DATA_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/uprops.icu";
976 * Default buffer size of datafile
978 private static final int DATA_BUFFER_SIZE_ = 25000;
981 * Shift value for lead surrogate to form a supplementary character.
983 private static final int LEAD_SURROGATE_SHIFT_ = 10;
985 * Offset to add to combined surrogate pair to avoid masking.
987 private static final int SURROGATE_OFFSET_ =
988 UTF16.SUPPLEMENTARY_MIN_VALUE -
989 (UTF16.SURROGATE_MIN_VALUE <<
990 LEAD_SURROGATE_SHIFT_) -
991 UTF16.TRAIL_SURROGATE_MIN_VALUE;
994 // property data constants -------------------------------------------------
997 * Numeric types and values in the main properties words.
999 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
1000 private static final int getNumericTypeValue(int props) {
1001 return props >> NUMERIC_TYPE_VALUE_SHIFT_;
1003 /* constants for the storage form of numeric types and values */
1004 /** No numeric value. */
1005 private static final int NTV_NONE_ = 0;
1006 /** Decimal digits: nv=0..9 */
1007 private static final int NTV_DECIMAL_START_ = 1;
1008 /** Other digits: nv=0..9 */
1009 private static final int NTV_DIGIT_START_ = 11;
1010 /** Small integers: nv=0..154 */
1011 private static final int NTV_NUMERIC_START_ = 21;
1012 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
1013 private static final int NTV_FRACTION_START_ = 0xb0;
1016 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
1017 * (only one significant decimal digit)
1019 private static final int NTV_LARGE_START_ = 0x1e0;
1021 * Sexagesimal numbers:
1022 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
1024 private static final int NTV_BASE60_START_=0x300;
1025 /** No numeric value (yet). */
1026 private static final int NTV_RESERVED_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324
1028 private static final int ntvGetType(int ntv) {
1030 (ntv==NTV_NONE_) ? NumericType.NONE :
1031 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL :
1032 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
1033 NumericType.NUMERIC;
1037 * Properties in vector word 0
1039 * 31..24 DerivedAge version major/minor one nibble each
1040 * 23..22 3..1: Bits 7..0 = Script_Extensions index
1041 * 3: Script value from Script_Extensions
1042 * 2: Script=Inherited
1044 * 0: Script=bits 7..0
1046 * 19..17 East Asian Width
1052 * Script_Extensions: mask includes Script
1054 public static final int SCRIPT_X_MASK = 0x00c000ff;
1055 //private static final int SCRIPT_X_SHIFT = 22;
1057 * Integer properties mask and shift values for East Asian cell width.
1058 * Equivalent to icu4c UPROPS_EA_MASK
1060 private static final int EAST_ASIAN_MASK_ = 0x000e0000;
1062 * Integer properties mask and shift values for East Asian cell width.
1063 * Equivalent to icu4c UPROPS_EA_SHIFT
1065 private static final int EAST_ASIAN_SHIFT_ = 17;
1067 * Integer properties mask and shift values for blocks.
1068 * Equivalent to icu4c UPROPS_BLOCK_MASK
1070 private static final int BLOCK_MASK_ = 0x0001ff00;
1072 * Integer properties mask and shift values for blocks.
1073 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
1075 private static final int BLOCK_SHIFT_ = 8;
1077 * Integer properties mask and shift values for scripts.
1078 * Equivalent to icu4c UPROPS_SCRIPT_MASK
1080 public static final int SCRIPT_MASK_ = 0x000000ff;
1082 /* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
1083 public static final int SCRIPT_X_WITH_COMMON = 0x400000;
1084 public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
1085 public static final int SCRIPT_X_WITH_OTHER = 0xc00000;
1088 * Additional properties used in internal trie data
1091 * Properties in vector word 1
1092 * Each bit encodes one binary property.
1093 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
1094 * UPROPS_BINARY_1_TOP<=32!
1096 * Keep this list of property enums in sync with
1097 * propListNames[] in icu/source/tools/genprops/props2.c!
1099 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
1101 private static final int WHITE_SPACE_PROPERTY_ = 0;
1102 private static final int DASH_PROPERTY_ = 1;
1103 private static final int HYPHEN_PROPERTY_ = 2;
1104 private static final int QUOTATION_MARK_PROPERTY_ = 3;
1105 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
1106 private static final int MATH_PROPERTY_ = 5;
1107 private static final int HEX_DIGIT_PROPERTY_ = 6;
1108 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
1109 private static final int ALPHABETIC_PROPERTY_ = 8;
1110 private static final int IDEOGRAPHIC_PROPERTY_ = 9;
1111 private static final int DIACRITIC_PROPERTY_ = 10;
1112 private static final int EXTENDER_PROPERTY_ = 11;
1113 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
1114 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
1115 private static final int GRAPHEME_LINK_PROPERTY_ = 14;
1116 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
1117 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
1118 private static final int RADICAL_PROPERTY_ = 17;
1119 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
1120 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
1121 private static final int DEPRECATED_PROPERTY_ = 20;
1122 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
1123 private static final int XID_START_PROPERTY_ = 22;
1124 private static final int XID_CONTINUE_PROPERTY_ = 23;
1125 private static final int ID_START_PROPERTY_ = 24;
1126 private static final int ID_CONTINUE_PROPERTY_ = 25;
1127 private static final int GRAPHEME_BASE_PROPERTY_ = 26;
1128 private static final int S_TERM_PROPERTY_ = 27;
1129 private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
1130 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
1131 private static final int PATTERN_WHITE_SPACE = 30;
1134 * Properties in vector word 2
1138 * 19..15 Sentence Break
1140 * 9.. 5 Grapheme Cluster Break
1141 * 4.. 0 Decomposition Type
1143 private static final int LB_MASK = 0x03f00000;
1144 private static final int LB_SHIFT = 20;
1146 private static final int SB_MASK = 0x000f8000;
1147 private static final int SB_SHIFT = 15;
1149 private static final int WB_MASK = 0x00007c00;
1150 private static final int WB_SHIFT = 10;
1152 private static final int GCB_MASK = 0x000003e0;
1153 private static final int GCB_SHIFT = 5;
1156 * Integer properties mask for decomposition type.
1157 * Equivalent to icu4c UPROPS_DT_MASK.
// Selects the five low-order bits (4..0), which have no shift constant because
// the field already starts at bit 0.
1159 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
1162 * First nibble shift
// A shift count of 4 bits, i.e. the width of one nibble.
1164 private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
1166 * Second nibble mask
// 0xF keeps the four low-order bits of a value (the "last" nibble in the constant's name).
1168 private static final int LAST_NIBBLE_MASK_ = 0xF;
// NOTE(review): presumably the bit position of the age field inside a properties
// vector word - confirm at the use site.
1172 private static final int AGE_SHIFT_ = 24;
1175 // private constructors --------------------------------------------------
1179 * @exception IOException thrown when data reading fails or data corrupted
// Loads the property data from DATA_FILE_NAME_: header, 16 index words, the main
// trie, then (when additional columns exist) the additional-properties trie and
// vectors, and finally the Script_Extensions data.
// NOTE(review): no close() of the input stream is visible in this section - confirm
// that the stream is released, including when an exception is thrown part-way through.
// NOTE(review): the return values of ds.skipBytes(...) are ignored; DataInputStream.skipBytes
// may skip fewer bytes than requested, which would misalign every later read.
1181 private UCharacterProperty() throws IOException
1183 // consistency check
// The binProps/intProps tables must have one entry per UProperty binary/int property.
1184 if(binProps.length!=UProperty.BINARY_LIMIT) {
1185 throw new RuntimeException("binProps.length!=UProperty.BINARY_LIMIT");
1187 if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) {
1188 throw new RuntimeException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)");
1192 InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
1193 BufferedInputStream bis = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
// The header is read with the expected "UPro" format bytes and the IsAcceptable
// version check; the returned version is kept as m_unicodeVersion_.
1194 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bis, DATA_FORMAT, new IsAcceptable());
1195 DataInputStream ds = new DataInputStream(bis);
1196 // Read or skip the 16 indexes.
// Twelve ints are read here (some only to advance the stream); the offsets are in
// units of 32-bit words, as shown by the "* 4" conversions to byte counts below.
1197 int propertyOffset = ds.readInt();
1198 /* exceptionOffset = */ ds.readInt();
1199 /* caseOffset = */ ds.readInt();
1200 int additionalOffset = ds.readInt();
1201 int additionalVectorsOffset = ds.readInt();
1202 m_additionalColumnsCount_ = ds.readInt();
1203 int scriptExtensionsOffset = ds.readInt();
1204 int reservedOffset7 = ds.readInt();
1205 /* reservedOffset8 = */ ds.readInt();
1206 /* dataTopOffset = */ ds.readInt();
1207 m_maxBlockScriptValue_ = ds.readInt();
1208 m_maxJTGValue_ = ds.readInt();
// Skip the remaining 16 - 12 = 4 index words, 4 bytes each.
1209 ds.skipBytes((16 - 12) << 2);
1211 // read the main properties trie
1212 m_trie_ = Trie2_16.createFromSerialized(ds);
// The trie occupies the words between the 16 indexes and propertyOffset.
1213 int expectedTrieLength = (propertyOffset - 16) * 4;
1214 int trieLength = m_trie_.getSerializedLength();
1215 if(trieLength > expectedTrieLength) {
1216 throw new IOException("uprops.icu: not enough bytes for main trie");
1218 // skip padding after trie bytes
1219 ds.skipBytes(expectedTrieLength - trieLength);
1221 // skip unused intervening data structures
1222 ds.skipBytes((additionalOffset - propertyOffset) * 4);
1224 if(m_additionalColumnsCount_ > 0) {
1225 // reads the additional property block
1226 m_additionalTrie_ = Trie2_16.createFromSerialized(ds);
1227 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
1228 trieLength = m_additionalTrie_.getSerializedLength();
1229 if(trieLength > expectedTrieLength) {
1230 throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
1232 // skip padding after trie bytes
1233 ds.skipBytes(expectedTrieLength - trieLength);
1235 // additional properties
// One int per 32-bit word between additionalVectorsOffset and scriptExtensionsOffset.
1236 int size = scriptExtensionsOffset - additionalVectorsOffset;
1237 m_additionalVectors_ = new int[size];
1238 for (int i = 0; i < size; i ++) {
1239 m_additionalVectors_[i] = ds.readInt();
1243 // Script_Extensions
// Two 16-bit chars per 32-bit word between scriptExtensionsOffset and reservedOffset7.
1244 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
1246 m_scriptExtensions_ = new char[numChars];
1247 for(int i = 0; i < numChars; ++i) {
1248 m_scriptExtensions_[i] = ds.readChar();
// Version check handed to ICUBinary.readHeaderAndDataVersion() by the constructor:
// the data is accepted only when the first byte of its version array is 7.
1254 private static final class IsAcceptable implements ICUBinary.Authenticate {
1255 // @Override when we switch to Java 6
1256 public boolean isDataVersionAcceptable(byte version[]) {
1257 return version[0] == 7;
// Expected data format bytes; passed to ICUBinary.readHeaderAndDataVersion() together
// with an IsAcceptable instance.
1260 private static final byte DATA_FORMAT[] = { 0x55, 0x50, 0x72, 0x6F }; // "UPro"
1262 // private methods -------------------------------------------------------
1265 * Tests whether the bit numbered by the given type is set in the additional properties word
1266 * @param property 32 bit properties
1267 * @param type character type
1268 * @return true if property has type
1270 /*private boolean compareAdditionalType(int property, int type)
1272 return (property & (1 << type)) != 0;
1275 // property starts for UnicodeSet -------------------------------------- ***
// Code points named in the hardcoded-property ranges of addPropertyStarts().
// The commented-out entries are kept as they were in the original list.
1277 private static final int TAB = 0x0009; // U+0009 CHARACTER TABULATION
1278 //private static final int LF = 0x000a;
1279 //private static final int FF = 0x000c;
1280 private static final int CR = 0x000d; // U+000D CARRIAGE RETURN
1281 private static final int U_A = 0x0041; // 'A'
1282 private static final int U_F = 0x0046; // 'F'
1283 private static final int U_Z = 0x005a; // 'Z'
1284 private static final int U_a = 0x0061; // 'a'
1285 private static final int U_f = 0x0066; // 'f'
1286 private static final int U_z = 0x007a; // 'z'
1287 private static final int DEL = 0x007f; // U+007F DELETE
1288 private static final int NL = 0x0085; // U+0085 NEXT LINE
1289 private static final int NBSP = 0x00a0; // U+00A0 NO-BREAK SPACE
1290 private static final int CGJ = 0x034f; // U+034F COMBINING GRAPHEME JOINER
1291 private static final int FIGURESP= 0x2007; // U+2007 FIGURE SPACE
1292 private static final int HAIRSP = 0x200a; // U+200A HAIR SPACE
1293 //private static final int ZWNJ = 0x200c;
1294 //private static final int ZWJ = 0x200d;
1295 private static final int RLM = 0x200f; // U+200F RIGHT-TO-LEFT MARK
1296 private static final int NNBSP = 0x202f; // U+202F NARROW NO-BREAK SPACE
1297 private static final int WJ = 0x2060; // U+2060 WORD JOINER
1298 private static final int INHSWAP = 0x206a; // U+206A INHIBIT SYMMETRIC SWAPPING
1299 private static final int NOMDIG = 0x206f; // U+206F NOMINAL DIGIT SHAPES
// Fullwidth counterparts of A, F, Z, a, f, z.
1300 private static final int U_FW_A = 0xff21;
1301 private static final int U_FW_F = 0xff26;
1302 private static final int U_FW_Z = 0xff3a;
1303 private static final int U_FW_a = 0xff41;
1304 private static final int U_FW_f = 0xff46;
1305 private static final int U_FW_z = 0xff5a;
// Declared last although its value (U+FEFF) is lower than the fullwidth letters above.
1306 private static final int ZWNBSP = 0xfeff; // U+FEFF ZERO WIDTH NO-BREAK SPACE
// Adds to the given set the start code point of every same-value range of the main
// trie, plus boundary code points for properties whose values are hardcoded rather
// than stored in the trie. The set is modified in place and also returned.
// NOTE(review): only some of the hardcoded set.add(...) calls are visible in this
// section; the section comments below name the property each group serves.
1308 public UnicodeSet addPropertyStarts(UnicodeSet set) {
1309 /* add the start code point of each same-value range of the main trie */
1310 Iterator<Trie2.Range> trieIterator = m_trie_.iterator();
// Stops at the end of the iteration or at the first range flagged leadSurrogate.
1312 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
1313 set.add(range.startCodePoint);
1316 /* add code points with hardcoded properties, plus the ones following them */
1318 /* add for u_isblank() */
1322 /* add for IS_THAT_CONTROL_SPACE() */
// CR+1 is the first code point after the TAB..CR range, i.e. where the value changes back.
1323 set.add(CR+1); /* range TAB..CR */
1329 /* add for u_isIDIgnorable() what was not added above */
1330 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
1338 /* add no-break spaces for u_isWhitespace() what was not added above */
1342 set.add(FIGURESP+1);
1346 /* add for u_charDigitValue() */
1347 // TODO remove when UCharacter.getHanNumericValue() is changed to just return
1348 // Unicode numeric values
1370 /* add for u_digit() */
1380 /* add for u_isxdigit() */
1386 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
1387 set.add(WJ); /* range WJ..NOMDIG */
1393 /* add for UCHAR_GRAPHEME_BASE and others */
1397 return set; // for chaining
// Adds to the given set the start code point of every same-value range of the
// additional-properties (vectors) trie. Does nothing when m_additionalColumnsCount_
// is not positive. The set is modified in place; nothing is returned.
1400 public void upropsvec_addPropertyStarts(UnicodeSet set) {
1401 /* add the start code point of each same-value range of the properties vectors trie */
1402 if(m_additionalColumnsCount_>0) {
1403 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
1404 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
// Stops at the end of the iteration or at the first range flagged leadSurrogate.
1406 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
1407 set.add(range.startCodePoint);
// This static initializer block must be placed after
// other static member initialization.
//
// Creates the singleton. A failure to read the property data is reported as a
// MissingResourceException carrying the IOException's message, with the
// IOException attached as the cause so its stack trace is not lost.
static {
    try {
        INSTANCE = new UCharacterProperty();
    }
    catch (IOException e) {
        // MissingResourceException has no public constructor that takes a cause,
        // so the cause is chained explicitly before throwing.
        MissingResourceException missing =
                new MissingResourceException(e.getMessage(), "", "");
        missing.initCause(e);
        throw missing;
    }
}
1423 /*----------------------------------------------------------------
1425 *----------------------------------------------------------------*/
1428 * Return a set of characters for property enumeration.
1429 * The set implicitly contains 0x110000 as well, which is one more than the highest
1430 * Unicode code point.
1432 * This set is used as an ordered list - its code points are ordered, and
1433 * consecutive code points (in Unicode code point order) in the set define a range.
1434 * For each two consecutive characters (start, limit) in the set,
1435 * all of the UCD/normalization and related properties for
1436 * all code points start..limit-1 are all the same,
1437 * except for character names and ISO comments.
1439 * All Unicode code points U+0000..U+10ffff are covered by these ranges.
1440 * The ranges define a partition of the Unicode code space.
1441 * ICU uses the inclusions set to enumerate properties for generating
1442 * UnicodeSets containing all code points that have a certain property value.
1444 * The Inclusion List is generated from the UCD. It is generated
1445 * by enumerating the data tries, and code points for hardcoded properties
1446 * are added as well.
1448 * --------------------------------------------------------------------------
1450 * The following are ideas for getting properties-unique code point ranges,
1451 * with possible optimizations beyond the current implementation.
1452 * These optimizations would require more code and be more fragile.
1453 * The current implementation generates one single list (set) for all properties.
1455 * To enumerate properties efficiently, one needs to know ranges of
1456 * repetitive values, so that the value of only each start code point
1457 * can be applied to the whole range.
1458 * This information is in principle available in the uprops.icu/unorm.icu data.
1460 * There are two obstacles:
1462 * 1. Some properties are computed from multiple data structures,
1463 * making it necessary to get repetitive ranges by intersecting
1464 * ranges from multiple tries.
1466 * 2. It is not economical to write code for getting repetitive ranges
1467 * that are precise for each of some 50 properties.
1471 * - Get ranges per trie, not per individual property.
1472 * Each range contains the same values for a whole group of properties.
1473 * This would generate currently five range sets, two for uprops.icu tries
1474 * and three for unorm.icu tries.
1476 * - Combine sets of ranges for multiple tries to get sufficient sets
1477 * for properties, e.g., the uprops.icu main and auxiliary tries
1478 * for all non-normalization properties.
1480 * Ideas for representing ranges and combining them:
1482 * - A UnicodeSet could hold just the start code points of ranges.
1483 * Multiple sets are easily combined by or-ing them together.
1485 * - Alternatively, a UnicodeSet could hold each even-numbered range.
1486 * All ranges could be enumerated by using each start code point
1487 * (for the even-numbered ranges) as well as each limit (end+1) code point
1488 * (for the odd-numbered ranges).
1489 * It should be possible to combine two such sets by xor-ing them,
1490 * but no more than two.
1492 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
1493 * but the first one is certainly simpler and applicable for combining more than two range sets.
1496 * It is possible to combine all range sets for all uprops/unorm tries into one
1497 * set that can be used for all properties.
1498 * As an optimization, there could be less-combined range sets for certain
1499 * groups of properties.
1500 * The relationship of which less-combined range set to use for which property
1501 * depends on the implementation of the properties and must be hardcoded
1502 * - somewhat error-prone and higher maintenance but can be tested easily
1503 * by building property sets "the simple way" in test code.
1507 * Do not use a UnicodeSet pattern because that causes infinite recursion;
1508 * UnicodeSet depends on the inclusions set.
1512 * getInclusions() is commented out starting 2005-feb-12 because
1513 * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
1514 * and only for the relevant property source.
1517 public UnicodeSet getInclusions() {
1518 UnicodeSet set = new UnicodeSet();
1519 NormalizerImpl.addPropertyStarts(set);
1520 addPropertyStarts(set);