2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.lang;
\r
10 import java.lang.ref.SoftReference;
\r
11 import java.util.HashMap;
\r
12 import java.util.Locale;
\r
13 import java.util.Map;
\r
15 import com.ibm.icu.impl.IllegalIcuArgumentException;
\r
16 import com.ibm.icu.impl.Norm2AllModes;
\r
17 import com.ibm.icu.impl.Normalizer2Impl;
\r
18 import com.ibm.icu.impl.UBiDiProps;
\r
19 import com.ibm.icu.impl.UCaseProps;
\r
20 import com.ibm.icu.impl.UCharacterName;
\r
21 import com.ibm.icu.impl.UCharacterNameChoice;
\r
22 import com.ibm.icu.impl.UCharacterProperty;
\r
23 import com.ibm.icu.impl.UCharacterUtility;
\r
24 import com.ibm.icu.impl.UPropertyAliases;
\r
25 import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
\r
26 import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection;
\r
27 import com.ibm.icu.text.BreakIterator;
\r
28 import com.ibm.icu.text.UTF16;
\r
29 import com.ibm.icu.util.RangeValueIterator;
\r
30 import com.ibm.icu.util.ULocale;
\r
31 import com.ibm.icu.util.ValueIterator;
\r
32 import com.ibm.icu.util.VersionInfo;
\r
35 * {@icuenhanced java.lang.Character}.{@icu _usage_}
\r
37 * <p>The UCharacter class provides extensions to the
\r
38 * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
\r
39 * java.lang.Character</a> class. These extensions provide support for
\r
40 * more Unicode properties and together with the <a href="../text/UTF16.html">UTF16</a>
\r
41 * class, provide support for supplementary characters (those with code
\r
42 * points above U+FFFF).
\r
43 * Each ICU release supports the latest version of Unicode available at that time.
\r
45 * <p>Code points are represented in these API using ints. While it would be
\r
46 * more convenient in Java to have a separate primitive datatype for them,
\r
47 * ints suffice in the meantime.
\r
49 * <p>To use this class please add the jar file name icu4j.jar to the
\r
50 * class path, since it contains data files which supply the information used
\r
52 * E.g. In Windows <br>
\r
53 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br>
\r
54 * Otherwise, another method would be to copy the files uprops.dat and
\r
55 * unames.icu from the icu4j source subdirectory
\r
56 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
\r
57 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
\r
59 * <p>Aside from the additions for UTF-16 support, and the updated Unicode
\r
60 * properties, the main differences between UCharacter and Character are:
\r
62 * <li> UCharacter is not designed to be a char wrapper and does not have
\r
63 * APIs which involve management of that single char.<br>
\r
66 * <li> char charValue(),
\r
67 * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
\r
69 * <li> UCharacter does not include Character APIs that are deprecated, nor
\r
70 * does it include the Java-specific character information, such as
\r
71 * boolean isJavaIdentifierPart(char ch).
\r
72 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
\r
73 * values '10' - '35'. UCharacter also does this in digit and
\r
74 * getNumericValue, to adhere to the java semantics of these
\r
75 * methods. New methods unicodeDigit, and
\r
76 * getUnicodeNumericValue do not treat the above code points
\r
77 * as having numeric values. This is a semantic change from ICU4J 1.3.1.
\r
80 * Further detail on differences can be determined using the program
\r
82 * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
\r
83 * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
\r
86 * In addition to Java compatibility functions, which calculate derived properties,
\r
87 * this API provides low-level access to the Unicode Character Database.
\r
90 * Unicode assigns each code point (not just assigned character) values for
\r
92 * Most of them are simple boolean flags, or constants from a small enumerated list.
\r
93 * For some properties, values are strings or other relatively more complex types.
\r
96 * For more information see
\r
97 * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
\r
98 * (http://www.unicode.org/ucd/)
\r
99 * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
\r
100 * User Guide chapter on Properties</a>
\r
101 * (http://www.icu-project.org/userguide/properties.html).
\r
104 * There are also functions that provide easy migration from C/POSIX functions
\r
105 * like isblank(). Their use is generally discouraged because the C/POSIX
\r
106 * standards do not define their semantics beyond the ASCII range, which means
\r
107 * that different implementations exhibit very different behavior.
\r
108 * Instead, Unicode properties should be used directly.
\r
111 * There are also only a few, broad C/POSIX character classes, and they tend
\r
112 * to be used for conflicting purposes. For example, the "isalpha()" class
\r
113 * is sometimes used to determine word boundaries, while a more sophisticated
\r
114 * approach would at least distinguish initial letters from continuation
\r
115 * characters (the latter including combining marks).
\r
116 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
\r
117 * Another example: There is no "istitle()" class for titlecase characters.
\r
120 * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
\r
121 * ICU implements them according to the Standard Recommendations in
\r
122 * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
\r
123 * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
\r
126 * API access for C/POSIX character classes is as follows:
\r
128 * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
\r
129 * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
\r
130 * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
\r
131 * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
\r
132 * (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
\r
133 * (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
\r
134 * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
\r
135 * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
\r
136 * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
\r
137 * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
\r
138 * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
\r
139 * - cntrl: getType(c)==CONTROL
\r
140 * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
\r
141 * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)}</pre>
\r
144 * The C/POSIX character classes are also available in UnicodeSet patterns,
\r
145 * using patterns like [:graph:] or \p{graph}.
\r
148 * {@icunote} There are several ICU (and Java) whitespace functions.
\r
150 * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
\r
151 * most of general categories "Z" (separators) + most whitespace ISO controls
\r
152 * (including no-break spaces, but excluding IS1..IS4 and ZWSP)
\r
153 * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
\r
154 * <li> isSpaceChar: just Z (including no-break spaces)</ul>
\r
157 * This class is not subclassable.
\r
159 * @author Syn Wee Quek
\r
161 * @see com.ibm.icu.lang.UCharacterEnums
\r
164 public final class UCharacter implements ECharacterCategory, ECharacterDirection
\r
166 // public inner classes ----------------------------------------------
\r
169 * {@icuenhanced java.lang.Character.UnicodeBlock}.{@icu _usage_}
\r
171 * A family of character subsets representing the character blocks in the
\r
172 * Unicode specification, generated from Unicode Data file Blocks.txt.
\r
173 * Character blocks generally define characters used for a specific script
\r
174 * or purpose. A character is contained by at most one Unicode block.
\r
176 * {@icunote} All fields named XXX_ID are specific to ICU.
\r
180 public static final class UnicodeBlock extends Character.Subset
\r
182 // block id corresponding to icu4c -----------------------------------
\r
187 public static final int INVALID_CODE_ID = -1;
\r
191 public static final int BASIC_LATIN_ID = 1;
\r
195 public static final int LATIN_1_SUPPLEMENT_ID = 2;
\r
199 public static final int LATIN_EXTENDED_A_ID = 3;
\r
203 public static final int LATIN_EXTENDED_B_ID = 4;
\r
207 public static final int IPA_EXTENSIONS_ID = 5;
\r
211 public static final int SPACING_MODIFIER_LETTERS_ID = 6;
\r
215 public static final int COMBINING_DIACRITICAL_MARKS_ID = 7;
\r
217 * Unicode 3.2 renames this block to "Greek and Coptic".
\r
220 public static final int GREEK_ID = 8;
\r
224 public static final int CYRILLIC_ID = 9;
\r
228 public static final int ARMENIAN_ID = 10;
\r
232 public static final int HEBREW_ID = 11;
\r
236 public static final int ARABIC_ID = 12;
\r
240 public static final int SYRIAC_ID = 13;
\r
244 public static final int THAANA_ID = 14;
\r
248 public static final int DEVANAGARI_ID = 15;
\r
252 public static final int BENGALI_ID = 16;
\r
256 public static final int GURMUKHI_ID = 17;
\r
260 public static final int GUJARATI_ID = 18;
\r
264 public static final int ORIYA_ID = 19;
\r
268 public static final int TAMIL_ID = 20;
\r
272 public static final int TELUGU_ID = 21;
\r
276 public static final int KANNADA_ID = 22;
\r
280 public static final int MALAYALAM_ID = 23;
\r
284 public static final int SINHALA_ID = 24;
\r
288 public static final int THAI_ID = 25;
\r
292 public static final int LAO_ID = 26;
\r
296 public static final int TIBETAN_ID = 27;
\r
300 public static final int MYANMAR_ID = 28;
\r
304 public static final int GEORGIAN_ID = 29;
\r
308 public static final int HANGUL_JAMO_ID = 30;
\r
312 public static final int ETHIOPIC_ID = 31;
\r
316 public static final int CHEROKEE_ID = 32;
\r
320 public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID = 33;
\r
324 public static final int OGHAM_ID = 34;
\r
328 public static final int RUNIC_ID = 35;
\r
332 public static final int KHMER_ID = 36;
\r
336 public static final int MONGOLIAN_ID = 37;
\r
340 public static final int LATIN_EXTENDED_ADDITIONAL_ID = 38;
\r
344 public static final int GREEK_EXTENDED_ID = 39;
\r
348 public static final int GENERAL_PUNCTUATION_ID = 40;
\r
352 public static final int SUPERSCRIPTS_AND_SUBSCRIPTS_ID = 41;
\r
356 public static final int CURRENCY_SYMBOLS_ID = 42;
\r
358 * Unicode 3.2 renames this block to "Combining Diacritical Marks for
\r
362 public static final int COMBINING_MARKS_FOR_SYMBOLS_ID = 43;
\r
366 public static final int LETTERLIKE_SYMBOLS_ID = 44;
\r
370 public static final int NUMBER_FORMS_ID = 45;
\r
374 public static final int ARROWS_ID = 46;
\r
378 public static final int MATHEMATICAL_OPERATORS_ID = 47;
\r
382 public static final int MISCELLANEOUS_TECHNICAL_ID = 48;
\r
386 public static final int CONTROL_PICTURES_ID = 49;
\r
390 public static final int OPTICAL_CHARACTER_RECOGNITION_ID = 50;
\r
394 public static final int ENCLOSED_ALPHANUMERICS_ID = 51;
\r
398 public static final int BOX_DRAWING_ID = 52;
\r
402 public static final int BLOCK_ELEMENTS_ID = 53;
\r
406 public static final int GEOMETRIC_SHAPES_ID = 54;
\r
410 public static final int MISCELLANEOUS_SYMBOLS_ID = 55;
\r
414 public static final int DINGBATS_ID = 56;
\r
418 public static final int BRAILLE_PATTERNS_ID = 57;
\r
422 public static final int CJK_RADICALS_SUPPLEMENT_ID = 58;
\r
426 public static final int KANGXI_RADICALS_ID = 59;
\r
430 public static final int IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID = 60;
\r
434 public static final int CJK_SYMBOLS_AND_PUNCTUATION_ID = 61;
\r
438 public static final int HIRAGANA_ID = 62;
\r
442 public static final int KATAKANA_ID = 63;
\r
446 public static final int BOPOMOFO_ID = 64;
\r
450 public static final int HANGUL_COMPATIBILITY_JAMO_ID = 65;
\r
454 public static final int KANBUN_ID = 66;
\r
458 public static final int BOPOMOFO_EXTENDED_ID = 67;
\r
462 public static final int ENCLOSED_CJK_LETTERS_AND_MONTHS_ID = 68;
\r
466 public static final int CJK_COMPATIBILITY_ID = 69;
\r
470 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID = 70;
\r
474 public static final int CJK_UNIFIED_IDEOGRAPHS_ID = 71;
\r
478 public static final int YI_SYLLABLES_ID = 72;
\r
482 public static final int YI_RADICALS_ID = 73;
\r
486 public static final int HANGUL_SYLLABLES_ID = 74;
\r
490 public static final int HIGH_SURROGATES_ID = 75;
\r
494 public static final int HIGH_PRIVATE_USE_SURROGATES_ID = 76;
\r
498 public static final int LOW_SURROGATES_ID = 77;
\r
500 * Same as public static final int PRIVATE_USE_ID.
\r
501 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
\r
502 * and multiple code point ranges had this block.
\r
503 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
\r
504 * and adds separate blocks for the supplementary PUAs.
\r
507 public static final int PRIVATE_USE_AREA_ID = 78;
\r
509 * Same as public static final int PRIVATE_USE_AREA_ID.
\r
510 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
\r
511 * and multiple code point ranges had this block.
\r
512 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
\r
513 * and adds separate blocks for the supplementary PUAs.
\r
516 public static final int PRIVATE_USE_ID = PRIVATE_USE_AREA_ID;
\r
520 public static final int CJK_COMPATIBILITY_IDEOGRAPHS_ID = 79;
\r
524 public static final int ALPHABETIC_PRESENTATION_FORMS_ID = 80;
\r
528 public static final int ARABIC_PRESENTATION_FORMS_A_ID = 81;
\r
532 public static final int COMBINING_HALF_MARKS_ID = 82;
\r
536 public static final int CJK_COMPATIBILITY_FORMS_ID = 83;
\r
540 public static final int SMALL_FORM_VARIANTS_ID = 84;
\r
544 public static final int ARABIC_PRESENTATION_FORMS_B_ID = 85;
\r
548 public static final int SPECIALS_ID = 86;
\r
552 public static final int HALFWIDTH_AND_FULLWIDTH_FORMS_ID = 87;
\r
556 public static final int OLD_ITALIC_ID = 88;
\r
560 public static final int GOTHIC_ID = 89;
\r
564 public static final int DESERET_ID = 90;
\r
568 public static final int BYZANTINE_MUSICAL_SYMBOLS_ID = 91;
\r
572 public static final int MUSICAL_SYMBOLS_ID = 92;
\r
576 public static final int MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID = 93;
\r
580 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID = 94;
\r
584 public static final int
\r
585 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID = 95;
\r
589 public static final int TAGS_ID = 96;
\r
591 // New blocks in Unicode 3.2
\r
594 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
\r
597 public static final int CYRILLIC_SUPPLEMENTARY_ID = 97;
\r
599 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
\r
603 public static final int CYRILLIC_SUPPLEMENT_ID = 97;
\r
607 public static final int TAGALOG_ID = 98;
\r
611 public static final int HANUNOO_ID = 99;
\r
615 public static final int BUHID_ID = 100;
\r
619 public static final int TAGBANWA_ID = 101;
\r
623 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID = 102;
\r
627 public static final int SUPPLEMENTAL_ARROWS_A_ID = 103;
\r
631 public static final int SUPPLEMENTAL_ARROWS_B_ID = 104;
\r
635 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID = 105;
\r
639 public static final int SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID = 106;
\r
643 public static final int KATAKANA_PHONETIC_EXTENSIONS_ID = 107;
\r
647 public static final int VARIATION_SELECTORS_ID = 108;
\r
651 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID = 109;
\r
655 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID = 110;
\r
660 public static final int LIMBU_ID = 111; /*[1900]*/
\r
664 public static final int TAI_LE_ID = 112; /*[1950]*/
\r
668 public static final int KHMER_SYMBOLS_ID = 113; /*[19E0]*/
\r
672 public static final int PHONETIC_EXTENSIONS_ID = 114; /*[1D00]*/
\r
676 public static final int MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID = 115; /*[2B00]*/
\r
680 public static final int YIJING_HEXAGRAM_SYMBOLS_ID = 116; /*[4DC0]*/
\r
684 public static final int LINEAR_B_SYLLABARY_ID = 117; /*[10000]*/
\r
688 public static final int LINEAR_B_IDEOGRAMS_ID = 118; /*[10080]*/
\r
692 public static final int AEGEAN_NUMBERS_ID = 119; /*[10100]*/
\r
696 public static final int UGARITIC_ID = 120; /*[10380]*/
\r
700 public static final int SHAVIAN_ID = 121; /*[10450]*/
\r
704 public static final int OSMANYA_ID = 122; /*[10480]*/
\r
708 public static final int CYPRIOT_SYLLABARY_ID = 123; /*[10800]*/
\r
712 public static final int TAI_XUAN_JING_SYMBOLS_ID = 124; /*[1D300]*/
\r
716 public static final int VARIATION_SELECTORS_SUPPLEMENT_ID = 125; /*[E0100]*/
\r
718 /* New blocks in Unicode 4.1 */
\r
723 public static final int ANCIENT_GREEK_MUSICAL_NOTATION_ID = 126; /*[1D200]*/
\r
728 public static final int ANCIENT_GREEK_NUMBERS_ID = 127; /*[10140]*/
\r
733 public static final int ARABIC_SUPPLEMENT_ID = 128; /*[0750]*/
\r
738 public static final int BUGINESE_ID = 129; /*[1A00]*/
\r
743 public static final int CJK_STROKES_ID = 130; /*[31C0]*/
\r
748 public static final int COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID = 131; /*[1DC0]*/
\r
753 public static final int COPTIC_ID = 132; /*[2C80]*/
\r
758 public static final int ETHIOPIC_EXTENDED_ID = 133; /*[2D80]*/
\r
763 public static final int ETHIOPIC_SUPPLEMENT_ID = 134; /*[1380]*/
\r
768 public static final int GEORGIAN_SUPPLEMENT_ID = 135; /*[2D00]*/
\r
773 public static final int GLAGOLITIC_ID = 136; /*[2C00]*/
\r
778 public static final int KHAROSHTHI_ID = 137; /*[10A00]*/
\r
783 public static final int MODIFIER_TONE_LETTERS_ID = 138; /*[A700]*/
\r
788 public static final int NEW_TAI_LUE_ID = 139; /*[1980]*/
\r
793 public static final int OLD_PERSIAN_ID = 140; /*[103A0]*/
\r
798 public static final int PHONETIC_EXTENSIONS_SUPPLEMENT_ID = 141; /*[1D80]*/
\r
803 public static final int SUPPLEMENTAL_PUNCTUATION_ID = 142; /*[2E00]*/
\r
808 public static final int SYLOTI_NAGRI_ID = 143; /*[A800]*/
\r
813 public static final int TIFINAGH_ID = 144; /*[2D30]*/
\r
818 public static final int VERTICAL_FORMS_ID = 145; /*[FE10]*/
\r
820 /* New blocks in Unicode 5.0 */
\r
825 public static final int NKO_ID = 146; /*[07C0]*/
\r
829 public static final int BALINESE_ID = 147; /*[1B00]*/
\r
833 public static final int LATIN_EXTENDED_C_ID = 148; /*[2C60]*/
\r
837 public static final int LATIN_EXTENDED_D_ID = 149; /*[A720]*/
\r
841 public static final int PHAGS_PA_ID = 150; /*[A840]*/
\r
845 public static final int PHOENICIAN_ID = 151; /*[10900]*/
\r
849 public static final int CUNEIFORM_ID = 152; /*[12000]*/
\r
853 public static final int CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID = 153; /*[12400]*/
\r
857 public static final int COUNTING_ROD_NUMERALS_ID = 154; /*[1D360]*/
\r
862 public static final int SUNDANESE_ID = 155; /* [1B80] */
\r
867 public static final int LEPCHA_ID = 156; /* [1C00] */
\r
872 public static final int OL_CHIKI_ID = 157; /* [1C50] */
\r
877 public static final int CYRILLIC_EXTENDED_A_ID = 158; /* [2DE0] */
\r
882 public static final int VAI_ID = 159; /* [A500] */
\r
887 public static final int CYRILLIC_EXTENDED_B_ID = 160; /* [A640] */
\r
892 public static final int SAURASHTRA_ID = 161; /* [A880] */
\r
897 public static final int KAYAH_LI_ID = 162; /* [A900] */
\r
902 public static final int REJANG_ID = 163; /* [A930] */
\r
907 public static final int CHAM_ID = 164; /* [AA00] */
\r
912 public static final int ANCIENT_SYMBOLS_ID = 165; /* [10190] */
\r
917 public static final int PHAISTOS_DISC_ID = 166; /* [101D0] */
\r
922 public static final int LYCIAN_ID = 167; /* [10280] */
\r
927 public static final int CARIAN_ID = 168; /* [102A0] */
\r
932 public static final int LYDIAN_ID = 169; /* [10920] */
\r
937 public static final int MAHJONG_TILES_ID = 170; /* [1F000] */
\r
942 public static final int DOMINO_TILES_ID = 171; /* [1F030] */
\r
944 /* New blocks in Unicode 5.2 */
\r
946 /** @stable ICU 4.4 */
\r
947 public static final int SAMARITAN_ID = 172; /*[0800]*/
\r
948 /** @stable ICU 4.4 */
\r
949 public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_ID = 173; /*[18B0]*/
\r
950 /** @stable ICU 4.4 */
\r
951 public static final int TAI_THAM_ID = 174; /*[1A20]*/
\r
952 /** @stable ICU 4.4 */
\r
953 public static final int VEDIC_EXTENSIONS_ID = 175; /*[1CD0]*/
\r
954 /** @stable ICU 4.4 */
\r
955 public static final int LISU_ID = 176; /*[A4D0]*/
\r
956 /** @stable ICU 4.4 */
\r
957 public static final int BAMUM_ID = 177; /*[A6A0]*/
\r
958 /** @stable ICU 4.4 */
\r
959 public static final int COMMON_INDIC_NUMBER_FORMS_ID = 178; /*[A830]*/
\r
960 /** @stable ICU 4.4 */
\r
961 public static final int DEVANAGARI_EXTENDED_ID = 179; /*[A8E0]*/
\r
962 /** @stable ICU 4.4 */
\r
963 public static final int HANGUL_JAMO_EXTENDED_A_ID = 180; /*[A960]*/
\r
964 /** @stable ICU 4.4 */
\r
965 public static final int JAVANESE_ID = 181; /*[A980]*/
\r
966 /** @stable ICU 4.4 */
\r
967 public static final int MYANMAR_EXTENDED_A_ID = 182; /*[AA60]*/
\r
968 /** @stable ICU 4.4 */
\r
969 public static final int TAI_VIET_ID = 183; /*[AA80]*/
\r
970 /** @stable ICU 4.4 */
\r
971 public static final int MEETEI_MAYEK_ID = 184; /*[ABC0]*/
\r
972 /** @stable ICU 4.4 */
\r
973 public static final int HANGUL_JAMO_EXTENDED_B_ID = 185; /*[D7B0]*/
\r
974 /** @stable ICU 4.4 */
\r
975 public static final int IMPERIAL_ARAMAIC_ID = 186; /*[10840]*/
\r
976 /** @stable ICU 4.4 */
\r
977 public static final int OLD_SOUTH_ARABIAN_ID = 187; /*[10A60]*/
\r
978 /** @stable ICU 4.4 */
\r
979 public static final int AVESTAN_ID = 188; /*[10B00]*/
\r
980 /** @stable ICU 4.4 */
\r
981 public static final int INSCRIPTIONAL_PARTHIAN_ID = 189; /*[10B40]*/
\r
982 /** @stable ICU 4.4 */
\r
983 public static final int INSCRIPTIONAL_PAHLAVI_ID = 190; /*[10B60]*/
\r
984 /** @stable ICU 4.4 */
\r
985 public static final int OLD_TURKIC_ID = 191; /*[10C00]*/
\r
986 /** @stable ICU 4.4 */
\r
987 public static final int RUMI_NUMERAL_SYMBOLS_ID = 192; /*[10E60]*/
\r
988 /** @stable ICU 4.4 */
\r
989 public static final int KAITHI_ID = 193; /*[11080]*/
\r
990 /** @stable ICU 4.4 */
\r
991 public static final int EGYPTIAN_HIEROGLYPHS_ID = 194; /*[13000]*/
\r
992 /** @stable ICU 4.4 */
\r
993 public static final int ENCLOSED_ALPHANUMERIC_SUPPLEMENT_ID = 195; /*[1F100]*/
\r
994 /** @stable ICU 4.4 */
\r
995 public static final int ENCLOSED_IDEOGRAPHIC_SUPPLEMENT_ID = 196; /*[1F200]*/
\r
996 /** @stable ICU 4.4 */
\r
997 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_ID = 197; /*[2A700]*/
\r
1002 public static final int COUNT = 198;
\r
1004 // blocks objects ---------------------------------------------------
\r
1007 * Array of UnicodeBlocks, for easy access in getInstance(int)
\r
1009 private final static UnicodeBlock BLOCKS_[] = new UnicodeBlock[COUNT];
\r
1014 public static final UnicodeBlock NO_BLOCK
\r
1015 = new UnicodeBlock("NO_BLOCK", 0);
\r
1020 public static final UnicodeBlock BASIC_LATIN
\r
1021 = new UnicodeBlock("BASIC_LATIN", BASIC_LATIN_ID);
\r
1025 public static final UnicodeBlock LATIN_1_SUPPLEMENT
\r
1026 = new UnicodeBlock("LATIN_1_SUPPLEMENT", LATIN_1_SUPPLEMENT_ID);
\r
1030 public static final UnicodeBlock LATIN_EXTENDED_A
\r
1031 = new UnicodeBlock("LATIN_EXTENDED_A", LATIN_EXTENDED_A_ID);
\r
1035 public static final UnicodeBlock LATIN_EXTENDED_B
\r
1036 = new UnicodeBlock("LATIN_EXTENDED_B", LATIN_EXTENDED_B_ID);
\r
1040 public static final UnicodeBlock IPA_EXTENSIONS
\r
1041 = new UnicodeBlock("IPA_EXTENSIONS", IPA_EXTENSIONS_ID);
\r
1045 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
\r
1046 = new UnicodeBlock("SPACING_MODIFIER_LETTERS", SPACING_MODIFIER_LETTERS_ID);
\r
1050 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
\r
1051 = new UnicodeBlock("COMBINING_DIACRITICAL_MARKS", COMBINING_DIACRITICAL_MARKS_ID);
\r
1053 * Unicode 3.2 renames this block to "Greek and Coptic".
\r
1056 public static final UnicodeBlock GREEK
\r
1057 = new UnicodeBlock("GREEK", GREEK_ID);
\r
1061 public static final UnicodeBlock CYRILLIC
\r
1062 = new UnicodeBlock("CYRILLIC", CYRILLIC_ID);
\r
1066 public static final UnicodeBlock ARMENIAN
\r
1067 = new UnicodeBlock("ARMENIAN", ARMENIAN_ID);
\r
1071 public static final UnicodeBlock HEBREW
\r
1072 = new UnicodeBlock("HEBREW", HEBREW_ID);
\r
1076 public static final UnicodeBlock ARABIC
\r
1077 = new UnicodeBlock("ARABIC", ARABIC_ID);
\r
1081 public static final UnicodeBlock SYRIAC
\r
1082 = new UnicodeBlock("SYRIAC", SYRIAC_ID);
\r
1086 public static final UnicodeBlock THAANA
\r
1087 = new UnicodeBlock("THAANA", THAANA_ID);
\r
1091 public static final UnicodeBlock DEVANAGARI
\r
1092 = new UnicodeBlock("DEVANAGARI", DEVANAGARI_ID);
\r
1096 public static final UnicodeBlock BENGALI
\r
1097 = new UnicodeBlock("BENGALI", BENGALI_ID);
\r
1101 public static final UnicodeBlock GURMUKHI
\r
1102 = new UnicodeBlock("GURMUKHI", GURMUKHI_ID);
\r
1106 public static final UnicodeBlock GUJARATI
\r
1107 = new UnicodeBlock("GUJARATI", GUJARATI_ID);
\r
1111 public static final UnicodeBlock ORIYA
\r
1112 = new UnicodeBlock("ORIYA", ORIYA_ID);
\r
1116 public static final UnicodeBlock TAMIL
\r
1117 = new UnicodeBlock("TAMIL", TAMIL_ID);
\r
1121 public static final UnicodeBlock TELUGU
\r
1122 = new UnicodeBlock("TELUGU", TELUGU_ID);
\r
1126 public static final UnicodeBlock KANNADA
\r
1127 = new UnicodeBlock("KANNADA", KANNADA_ID);
\r
1131 public static final UnicodeBlock MALAYALAM
\r
1132 = new UnicodeBlock("MALAYALAM", MALAYALAM_ID);
\r
1136 public static final UnicodeBlock SINHALA
\r
1137 = new UnicodeBlock("SINHALA", SINHALA_ID);
\r
1141 public static final UnicodeBlock THAI
\r
1142 = new UnicodeBlock("THAI", THAI_ID);
\r
1146 public static final UnicodeBlock LAO
\r
1147 = new UnicodeBlock("LAO", LAO_ID);
\r
1151 public static final UnicodeBlock TIBETAN
\r
1152 = new UnicodeBlock("TIBETAN", TIBETAN_ID);
\r
1156 public static final UnicodeBlock MYANMAR
\r
1157 = new UnicodeBlock("MYANMAR", MYANMAR_ID);
\r
1161 public static final UnicodeBlock GEORGIAN
\r
1162 = new UnicodeBlock("GEORGIAN", GEORGIAN_ID);
\r
1166 public static final UnicodeBlock HANGUL_JAMO
\r
1167 = new UnicodeBlock("HANGUL_JAMO", HANGUL_JAMO_ID);
\r
1171 public static final UnicodeBlock ETHIOPIC
\r
1172 = new UnicodeBlock("ETHIOPIC", ETHIOPIC_ID);
\r
1176 public static final UnicodeBlock CHEROKEE
\r
1177 = new UnicodeBlock("CHEROKEE", CHEROKEE_ID);
\r
1181 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
\r
1182 = new UnicodeBlock("UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
\r
1183 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID);
\r
1187 public static final UnicodeBlock OGHAM
\r
1188 = new UnicodeBlock("OGHAM", OGHAM_ID);
\r
1192 public static final UnicodeBlock RUNIC
\r
1193 = new UnicodeBlock("RUNIC", RUNIC_ID);
\r
1197 public static final UnicodeBlock KHMER
\r
1198 = new UnicodeBlock("KHMER", KHMER_ID);
\r
1202 public static final UnicodeBlock MONGOLIAN
\r
1203 = new UnicodeBlock("MONGOLIAN", MONGOLIAN_ID);
\r
1207 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
\r
1208 = new UnicodeBlock("LATIN_EXTENDED_ADDITIONAL", LATIN_EXTENDED_ADDITIONAL_ID);
\r
1212 public static final UnicodeBlock GREEK_EXTENDED
\r
1213 = new UnicodeBlock("GREEK_EXTENDED", GREEK_EXTENDED_ID);
\r
1217 public static final UnicodeBlock GENERAL_PUNCTUATION
\r
1218 = new UnicodeBlock("GENERAL_PUNCTUATION", GENERAL_PUNCTUATION_ID);
\r
1222 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
\r
1223 = new UnicodeBlock("SUPERSCRIPTS_AND_SUBSCRIPTS", SUPERSCRIPTS_AND_SUBSCRIPTS_ID);
\r
1227 public static final UnicodeBlock CURRENCY_SYMBOLS
\r
1228 = new UnicodeBlock("CURRENCY_SYMBOLS", CURRENCY_SYMBOLS_ID);
\r
1230 * Unicode 3.2 renames this block to "Combining Diacritical Marks for
\r
1234 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
\r
1235 = new UnicodeBlock("COMBINING_MARKS_FOR_SYMBOLS", COMBINING_MARKS_FOR_SYMBOLS_ID);
\r
1239 public static final UnicodeBlock LETTERLIKE_SYMBOLS
\r
1240 = new UnicodeBlock("LETTERLIKE_SYMBOLS", LETTERLIKE_SYMBOLS_ID);
\r
1244 public static final UnicodeBlock NUMBER_FORMS
\r
1245 = new UnicodeBlock("NUMBER_FORMS", NUMBER_FORMS_ID);
\r
1249 public static final UnicodeBlock ARROWS
\r
1250 = new UnicodeBlock("ARROWS", ARROWS_ID);
\r
1254 public static final UnicodeBlock MATHEMATICAL_OPERATORS
\r
1255 = new UnicodeBlock("MATHEMATICAL_OPERATORS", MATHEMATICAL_OPERATORS_ID);
\r
1259 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
\r
1260 = new UnicodeBlock("MISCELLANEOUS_TECHNICAL", MISCELLANEOUS_TECHNICAL_ID);
\r
1264 public static final UnicodeBlock CONTROL_PICTURES
\r
1265 = new UnicodeBlock("CONTROL_PICTURES", CONTROL_PICTURES_ID);
\r
1269 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
\r
1270 = new UnicodeBlock("OPTICAL_CHARACTER_RECOGNITION", OPTICAL_CHARACTER_RECOGNITION_ID);
\r
1274 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
\r
1275 = new UnicodeBlock("ENCLOSED_ALPHANUMERICS", ENCLOSED_ALPHANUMERICS_ID);
\r
1279 public static final UnicodeBlock BOX_DRAWING
\r
1280 = new UnicodeBlock("BOX_DRAWING", BOX_DRAWING_ID);
\r
1284 public static final UnicodeBlock BLOCK_ELEMENTS
\r
1285 = new UnicodeBlock("BLOCK_ELEMENTS", BLOCK_ELEMENTS_ID);
\r
1289 public static final UnicodeBlock GEOMETRIC_SHAPES
\r
1290 = new UnicodeBlock("GEOMETRIC_SHAPES", GEOMETRIC_SHAPES_ID);
\r
1294 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
\r
1295 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS", MISCELLANEOUS_SYMBOLS_ID);
\r
1299 public static final UnicodeBlock DINGBATS
\r
1300 = new UnicodeBlock("DINGBATS", DINGBATS_ID);
\r
1304 public static final UnicodeBlock BRAILLE_PATTERNS
\r
1305 = new UnicodeBlock("BRAILLE_PATTERNS", BRAILLE_PATTERNS_ID);
\r
// Singleton UnicodeBlock constants. Each declaration pairs the block's
// Java-style name with its numeric *_ID constant; the UnicodeBlock
// constructor stores the new instance in BLOCKS_[id], so every constant
// registers itself as a side effect of static initialization.
public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
    = new UnicodeBlock("CJK_RADICALS_SUPPLEMENT", CJK_RADICALS_SUPPLEMENT_ID);
public static final UnicodeBlock KANGXI_RADICALS
    = new UnicodeBlock("KANGXI_RADICALS", KANGXI_RADICALS_ID);
public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
    = new UnicodeBlock("IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
                       IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID);
public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
    = new UnicodeBlock("CJK_SYMBOLS_AND_PUNCTUATION", CJK_SYMBOLS_AND_PUNCTUATION_ID);
public static final UnicodeBlock HIRAGANA
    = new UnicodeBlock("HIRAGANA", HIRAGANA_ID);
public static final UnicodeBlock KATAKANA
    = new UnicodeBlock("KATAKANA", KATAKANA_ID);
public static final UnicodeBlock BOPOMOFO
    = new UnicodeBlock("BOPOMOFO", BOPOMOFO_ID);
public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
    = new UnicodeBlock("HANGUL_COMPATIBILITY_JAMO", HANGUL_COMPATIBILITY_JAMO_ID);
public static final UnicodeBlock KANBUN
    = new UnicodeBlock("KANBUN", KANBUN_ID);
public static final UnicodeBlock BOPOMOFO_EXTENDED
    = new UnicodeBlock("BOPOMOFO_EXTENDED", BOPOMOFO_EXTENDED_ID);
public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
    = new UnicodeBlock("ENCLOSED_CJK_LETTERS_AND_MONTHS",
                       ENCLOSED_CJK_LETTERS_AND_MONTHS_ID);
public static final UnicodeBlock CJK_COMPATIBILITY
    = new UnicodeBlock("CJK_COMPATIBILITY", CJK_COMPATIBILITY_ID);
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
    = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
                       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID);
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
    = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS", CJK_UNIFIED_IDEOGRAPHS_ID);
public static final UnicodeBlock YI_SYLLABLES
    = new UnicodeBlock("YI_SYLLABLES", YI_SYLLABLES_ID);
public static final UnicodeBlock YI_RADICALS
    = new UnicodeBlock("YI_RADICALS", YI_RADICALS_ID);
public static final UnicodeBlock HANGUL_SYLLABLES
    = new UnicodeBlock("HANGUL_SYLLABLES", HANGUL_SYLLABLES_ID);
public static final UnicodeBlock HIGH_SURROGATES
    = new UnicodeBlock("HIGH_SURROGATES", HIGH_SURROGATES_ID);
public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
    = new UnicodeBlock("HIGH_PRIVATE_USE_SURROGATES", HIGH_PRIVATE_USE_SURROGATES_ID);
public static final UnicodeBlock LOW_SURROGATES
    = new UnicodeBlock("LOW_SURROGATES", LOW_SURROGATES_ID);
\r
/**
 * The BMP private use block; the same object as {@link #PRIVATE_USE}.
 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
 * and multiple code point ranges had this block.
 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
 * and adds separate blocks for the supplementary PUAs.
 */
// NOTE(review): this is the only block constructed with a literal id (78)
// instead of a named *_ID constant -- presumably a PRIVATE_USE_AREA_ID
// constant with that value exists; confirm and prefer the named constant.
public static final UnicodeBlock PRIVATE_USE_AREA
    = new UnicodeBlock("PRIVATE_USE_AREA", 78);
/**
 * Alias for {@link #PRIVATE_USE_AREA}: both names refer to one instance.
 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
 * and multiple code point ranges had this block.
 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
 * and adds separate blocks for the supplementary PUAs.
 */
public static final UnicodeBlock PRIVATE_USE
    = PRIVATE_USE_AREA;
\r
// More singleton UnicodeBlock constants. Each one passes its Java-style
// name and numeric *_ID constant to the UnicodeBlock constructor, which
// stores the instance in BLOCKS_[id].
public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
    = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS", CJK_COMPATIBILITY_IDEOGRAPHS_ID);
public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
    = new UnicodeBlock("ALPHABETIC_PRESENTATION_FORMS", ALPHABETIC_PRESENTATION_FORMS_ID);
public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
    = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_A", ARABIC_PRESENTATION_FORMS_A_ID);
public static final UnicodeBlock COMBINING_HALF_MARKS
    = new UnicodeBlock("COMBINING_HALF_MARKS", COMBINING_HALF_MARKS_ID);
public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
    = new UnicodeBlock("CJK_COMPATIBILITY_FORMS", CJK_COMPATIBILITY_FORMS_ID);
public static final UnicodeBlock SMALL_FORM_VARIANTS
    = new UnicodeBlock("SMALL_FORM_VARIANTS", SMALL_FORM_VARIANTS_ID);
public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
    = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_B", ARABIC_PRESENTATION_FORMS_B_ID);
public static final UnicodeBlock SPECIALS
    = new UnicodeBlock("SPECIALS", SPECIALS_ID);
public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
    = new UnicodeBlock("HALFWIDTH_AND_FULLWIDTH_FORMS", HALFWIDTH_AND_FULLWIDTH_FORMS_ID);
public static final UnicodeBlock OLD_ITALIC
    = new UnicodeBlock("OLD_ITALIC", OLD_ITALIC_ID);
public static final UnicodeBlock GOTHIC
    = new UnicodeBlock("GOTHIC", GOTHIC_ID);
public static final UnicodeBlock DESERET
    = new UnicodeBlock("DESERET", DESERET_ID);
public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
    = new UnicodeBlock("BYZANTINE_MUSICAL_SYMBOLS", BYZANTINE_MUSICAL_SYMBOLS_ID);
public static final UnicodeBlock MUSICAL_SYMBOLS
    = new UnicodeBlock("MUSICAL_SYMBOLS", MUSICAL_SYMBOLS_ID);
public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
    = new UnicodeBlock("MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
                       MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID);
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
    = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
                       CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID);
public static final UnicodeBlock
    CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
    = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
                       CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID);
public static final UnicodeBlock TAGS
    = new UnicodeBlock("TAGS", TAGS_ID);
\r
// New blocks in Unicode 3.2
// (The constants further down in this run were presumably added for later
// Unicode versions; no version marker for them is visible here -- confirm.)
// As with the earlier constants, each constructor call stores the new
// instance in BLOCKS_[id].

/**
 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
 * This constant keeps the older name and is constructed with its own
 * CYRILLIC_SUPPLEMENTARY_ID.
 */
public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
    = new UnicodeBlock("CYRILLIC_SUPPLEMENTARY", CYRILLIC_SUPPLEMENTARY_ID);
/**
 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
 * This constant carries the newer name and is constructed with
 * CYRILLIC_SUPPLEMENT_ID.
 */
// NOTE(review): if CYRILLIC_SUPPLEMENT_ID and CYRILLIC_SUPPLEMENTARY_ID are
// equal, this later declaration overwrites the BLOCKS_ slot written by
// CYRILLIC_SUPPLEMENTARY -- confirm against the *_ID definitions.
public static final UnicodeBlock CYRILLIC_SUPPLEMENT
    = new UnicodeBlock("CYRILLIC_SUPPLEMENT", CYRILLIC_SUPPLEMENT_ID);
public static final UnicodeBlock TAGALOG
    = new UnicodeBlock("TAGALOG", TAGALOG_ID);
public static final UnicodeBlock HANUNOO
    = new UnicodeBlock("HANUNOO", HANUNOO_ID);
public static final UnicodeBlock BUHID
    = new UnicodeBlock("BUHID", BUHID_ID);
public static final UnicodeBlock TAGBANWA
    = new UnicodeBlock("TAGBANWA", TAGBANWA_ID);
public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
    = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
                       MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID);
public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
    = new UnicodeBlock("SUPPLEMENTAL_ARROWS_A", SUPPLEMENTAL_ARROWS_A_ID);
public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
    = new UnicodeBlock("SUPPLEMENTAL_ARROWS_B", SUPPLEMENTAL_ARROWS_B_ID);
public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
    = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
                       MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID);
public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
    = new UnicodeBlock("SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
                       SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID);
public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
    = new UnicodeBlock("KATAKANA_PHONETIC_EXTENSIONS", KATAKANA_PHONETIC_EXTENSIONS_ID);
public static final UnicodeBlock VARIATION_SELECTORS
    = new UnicodeBlock("VARIATION_SELECTORS", VARIATION_SELECTORS_ID);
public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
    = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_A",
                       SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID);
public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
    = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_B",
                       SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID);
public static final UnicodeBlock LIMBU
    = new UnicodeBlock("LIMBU", LIMBU_ID);
public static final UnicodeBlock TAI_LE
    = new UnicodeBlock("TAI_LE", TAI_LE_ID);
public static final UnicodeBlock KHMER_SYMBOLS
    = new UnicodeBlock("KHMER_SYMBOLS", KHMER_SYMBOLS_ID);
public static final UnicodeBlock PHONETIC_EXTENSIONS
    = new UnicodeBlock("PHONETIC_EXTENSIONS", PHONETIC_EXTENSIONS_ID);
public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
    = new UnicodeBlock("MISCELLANEOUS_SYMBOLS_AND_ARROWS",
                       MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID);
public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
    = new UnicodeBlock("YIJING_HEXAGRAM_SYMBOLS", YIJING_HEXAGRAM_SYMBOLS_ID);
public static final UnicodeBlock LINEAR_B_SYLLABARY
    = new UnicodeBlock("LINEAR_B_SYLLABARY", LINEAR_B_SYLLABARY_ID);
public static final UnicodeBlock LINEAR_B_IDEOGRAMS
    = new UnicodeBlock("LINEAR_B_IDEOGRAMS", LINEAR_B_IDEOGRAMS_ID);
public static final UnicodeBlock AEGEAN_NUMBERS
    = new UnicodeBlock("AEGEAN_NUMBERS", AEGEAN_NUMBERS_ID);
public static final UnicodeBlock UGARITIC
    = new UnicodeBlock("UGARITIC", UGARITIC_ID);
public static final UnicodeBlock SHAVIAN
    = new UnicodeBlock("SHAVIAN", SHAVIAN_ID);
public static final UnicodeBlock OSMANYA
    = new UnicodeBlock("OSMANYA", OSMANYA_ID);
public static final UnicodeBlock CYPRIOT_SYLLABARY
    = new UnicodeBlock("CYPRIOT_SYLLABARY", CYPRIOT_SYLLABARY_ID);
public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
    = new UnicodeBlock("TAI_XUAN_JING_SYMBOLS", TAI_XUAN_JING_SYMBOLS_ID);
public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
    = new UnicodeBlock("VARIATION_SELECTORS_SUPPLEMENT", VARIATION_SELECTORS_SUPPLEMENT_ID);
\r
/* New blocks in Unicode 4.1 */
// (Constants further down in this run were presumably added for later
// Unicode versions; no version marker for them is visible here -- confirm.)
// The bracketed hexadecimal value in each trailing comment is presumably
// the first code point of the block -- confirm against the Unicode block data.
// Each constructor call stores the new instance in BLOCKS_[id].

public static final UnicodeBlock ANCIENT_GREEK_MUSICAL_NOTATION =
    new UnicodeBlock("ANCIENT_GREEK_MUSICAL_NOTATION",
                     ANCIENT_GREEK_MUSICAL_NOTATION_ID); /*[1D200]*/
public static final UnicodeBlock ANCIENT_GREEK_NUMBERS =
    new UnicodeBlock("ANCIENT_GREEK_NUMBERS", ANCIENT_GREEK_NUMBERS_ID); /*[10140]*/
public static final UnicodeBlock ARABIC_SUPPLEMENT =
    new UnicodeBlock("ARABIC_SUPPLEMENT", ARABIC_SUPPLEMENT_ID); /*[0750]*/
public static final UnicodeBlock BUGINESE =
    new UnicodeBlock("BUGINESE", BUGINESE_ID); /*[1A00]*/
public static final UnicodeBlock CJK_STROKES =
    new UnicodeBlock("CJK_STROKES", CJK_STROKES_ID); /*[31C0]*/
public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_SUPPLEMENT =
    new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_SUPPLEMENT",
                     COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID); /*[1DC0]*/
public static final UnicodeBlock COPTIC = new UnicodeBlock("COPTIC", COPTIC_ID); /*[2C80]*/
public static final UnicodeBlock ETHIOPIC_EXTENDED =
    new UnicodeBlock("ETHIOPIC_EXTENDED", ETHIOPIC_EXTENDED_ID); /*[2D80]*/
public static final UnicodeBlock ETHIOPIC_SUPPLEMENT =
    new UnicodeBlock("ETHIOPIC_SUPPLEMENT", ETHIOPIC_SUPPLEMENT_ID); /*[1380]*/
public static final UnicodeBlock GEORGIAN_SUPPLEMENT =
    new UnicodeBlock("GEORGIAN_SUPPLEMENT", GEORGIAN_SUPPLEMENT_ID); /*[2D00]*/
public static final UnicodeBlock GLAGOLITIC =
    new UnicodeBlock("GLAGOLITIC", GLAGOLITIC_ID); /*[2C00]*/
public static final UnicodeBlock KHAROSHTHI =
    new UnicodeBlock("KHAROSHTHI", KHAROSHTHI_ID); /*[10A00]*/
public static final UnicodeBlock MODIFIER_TONE_LETTERS =
    new UnicodeBlock("MODIFIER_TONE_LETTERS", MODIFIER_TONE_LETTERS_ID); /*[A700]*/
public static final UnicodeBlock NEW_TAI_LUE =
    new UnicodeBlock("NEW_TAI_LUE", NEW_TAI_LUE_ID); /*[1980]*/
public static final UnicodeBlock OLD_PERSIAN =
    new UnicodeBlock("OLD_PERSIAN", OLD_PERSIAN_ID); /*[103A0]*/
public static final UnicodeBlock PHONETIC_EXTENSIONS_SUPPLEMENT =
    new UnicodeBlock("PHONETIC_EXTENSIONS_SUPPLEMENT",
                     PHONETIC_EXTENSIONS_SUPPLEMENT_ID); /*[1D80]*/
public static final UnicodeBlock SUPPLEMENTAL_PUNCTUATION =
    new UnicodeBlock("SUPPLEMENTAL_PUNCTUATION", SUPPLEMENTAL_PUNCTUATION_ID); /*[2E00]*/
public static final UnicodeBlock SYLOTI_NAGRI =
    new UnicodeBlock("SYLOTI_NAGRI", SYLOTI_NAGRI_ID); /*[A800]*/
public static final UnicodeBlock TIFINAGH =
    new UnicodeBlock("TIFINAGH", TIFINAGH_ID); /*[2D30]*/
public static final UnicodeBlock VERTICAL_FORMS =
    new UnicodeBlock("VERTICAL_FORMS", VERTICAL_FORMS_ID); /*[FE10]*/
public static final UnicodeBlock NKO = new UnicodeBlock("NKO", NKO_ID); /*[07C0]*/
public static final UnicodeBlock BALINESE =
    new UnicodeBlock("BALINESE", BALINESE_ID); /*[1B00]*/
public static final UnicodeBlock LATIN_EXTENDED_C =
    new UnicodeBlock("LATIN_EXTENDED_C", LATIN_EXTENDED_C_ID); /*[2C60]*/
public static final UnicodeBlock LATIN_EXTENDED_D =
    new UnicodeBlock("LATIN_EXTENDED_D", LATIN_EXTENDED_D_ID); /*[A720]*/
public static final UnicodeBlock PHAGS_PA =
    new UnicodeBlock("PHAGS_PA", PHAGS_PA_ID); /*[A840]*/
public static final UnicodeBlock PHOENICIAN =
    new UnicodeBlock("PHOENICIAN", PHOENICIAN_ID); /*[10900]*/
public static final UnicodeBlock CUNEIFORM =
    new UnicodeBlock("CUNEIFORM", CUNEIFORM_ID); /*[12000]*/
public static final UnicodeBlock CUNEIFORM_NUMBERS_AND_PUNCTUATION =
    new UnicodeBlock("CUNEIFORM_NUMBERS_AND_PUNCTUATION",
                     CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID); /*[12400]*/
public static final UnicodeBlock COUNTING_ROD_NUMERALS =
    new UnicodeBlock("COUNTING_ROD_NUMERALS", COUNTING_ROD_NUMERALS_ID); /*[1D360]*/
public static final UnicodeBlock SUNDANESE =
    new UnicodeBlock("SUNDANESE", SUNDANESE_ID); /* [1B80] */
public static final UnicodeBlock LEPCHA =
    new UnicodeBlock("LEPCHA", LEPCHA_ID); /* [1C00] */
public static final UnicodeBlock OL_CHIKI =
    new UnicodeBlock("OL_CHIKI", OL_CHIKI_ID); /* [1C50] */
public static final UnicodeBlock CYRILLIC_EXTENDED_A =
    new UnicodeBlock("CYRILLIC_EXTENDED_A", CYRILLIC_EXTENDED_A_ID); /* [2DE0] */
public static final UnicodeBlock VAI = new UnicodeBlock("VAI", VAI_ID); /* [A500] */
public static final UnicodeBlock CYRILLIC_EXTENDED_B =
    new UnicodeBlock("CYRILLIC_EXTENDED_B", CYRILLIC_EXTENDED_B_ID); /* [A640] */
public static final UnicodeBlock SAURASHTRA =
    new UnicodeBlock("SAURASHTRA", SAURASHTRA_ID); /* [A880] */
public static final UnicodeBlock KAYAH_LI =
    new UnicodeBlock("KAYAH_LI", KAYAH_LI_ID); /* [A900] */
public static final UnicodeBlock REJANG =
    new UnicodeBlock("REJANG", REJANG_ID); /* [A930] */
public static final UnicodeBlock CHAM =
    new UnicodeBlock("CHAM", CHAM_ID); /* [AA00] */
public static final UnicodeBlock ANCIENT_SYMBOLS =
    new UnicodeBlock("ANCIENT_SYMBOLS", ANCIENT_SYMBOLS_ID); /* [10190] */
public static final UnicodeBlock PHAISTOS_DISC =
    new UnicodeBlock("PHAISTOS_DISC", PHAISTOS_DISC_ID); /* [101D0] */
public static final UnicodeBlock LYCIAN =
    new UnicodeBlock("LYCIAN", LYCIAN_ID); /* [10280] */
public static final UnicodeBlock CARIAN =
    new UnicodeBlock("CARIAN", CARIAN_ID); /* [102A0] */
public static final UnicodeBlock LYDIAN =
    new UnicodeBlock("LYDIAN", LYDIAN_ID); /* [10920] */
public static final UnicodeBlock MAHJONG_TILES =
    new UnicodeBlock("MAHJONG_TILES", MAHJONG_TILES_ID); /* [1F000] */
public static final UnicodeBlock DOMINO_TILES =
    new UnicodeBlock("DOMINO_TILES", DOMINO_TILES_ID); /* [1F030] */
\r
/* New blocks in Unicode 5.2 */
// The bracketed hexadecimal value in each trailing comment is presumably
// the first code point of the block -- confirm against the Unicode block data.
// Each constructor call stores the new instance in BLOCKS_[id].

/** @stable ICU 4.4 */
public static final UnicodeBlock SAMARITAN =
    new UnicodeBlock("SAMARITAN", SAMARITAN_ID); /*[0800]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED =
    new UnicodeBlock("UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED",
                     UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_ID); /*[18B0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock TAI_THAM =
    new UnicodeBlock("TAI_THAM", TAI_THAM_ID); /*[1A20]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock VEDIC_EXTENSIONS =
    new UnicodeBlock("VEDIC_EXTENSIONS", VEDIC_EXTENSIONS_ID); /*[1CD0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock LISU =
    new UnicodeBlock("LISU", LISU_ID); /*[A4D0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock BAMUM =
    new UnicodeBlock("BAMUM", BAMUM_ID); /*[A6A0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock COMMON_INDIC_NUMBER_FORMS =
    new UnicodeBlock("COMMON_INDIC_NUMBER_FORMS", COMMON_INDIC_NUMBER_FORMS_ID); /*[A830]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock DEVANAGARI_EXTENDED =
    new UnicodeBlock("DEVANAGARI_EXTENDED", DEVANAGARI_EXTENDED_ID); /*[A8E0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock HANGUL_JAMO_EXTENDED_A =
    new UnicodeBlock("HANGUL_JAMO_EXTENDED_A", HANGUL_JAMO_EXTENDED_A_ID); /*[A960]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock JAVANESE =
    new UnicodeBlock("JAVANESE", JAVANESE_ID); /*[A980]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock MYANMAR_EXTENDED_A =
    new UnicodeBlock("MYANMAR_EXTENDED_A", MYANMAR_EXTENDED_A_ID); /*[AA60]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock TAI_VIET =
    new UnicodeBlock("TAI_VIET", TAI_VIET_ID); /*[AA80]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock MEETEI_MAYEK =
    new UnicodeBlock("MEETEI_MAYEK", MEETEI_MAYEK_ID); /*[ABC0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock HANGUL_JAMO_EXTENDED_B =
    new UnicodeBlock("HANGUL_JAMO_EXTENDED_B", HANGUL_JAMO_EXTENDED_B_ID); /*[D7B0]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock IMPERIAL_ARAMAIC =
    new UnicodeBlock("IMPERIAL_ARAMAIC", IMPERIAL_ARAMAIC_ID); /*[10840]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock OLD_SOUTH_ARABIAN =
    new UnicodeBlock("OLD_SOUTH_ARABIAN", OLD_SOUTH_ARABIAN_ID); /*[10A60]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock AVESTAN =
    new UnicodeBlock("AVESTAN", AVESTAN_ID); /*[10B00]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock INSCRIPTIONAL_PARTHIAN =
    new UnicodeBlock("INSCRIPTIONAL_PARTHIAN", INSCRIPTIONAL_PARTHIAN_ID); /*[10B40]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock INSCRIPTIONAL_PAHLAVI =
    new UnicodeBlock("INSCRIPTIONAL_PAHLAVI", INSCRIPTIONAL_PAHLAVI_ID); /*[10B60]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock OLD_TURKIC =
    new UnicodeBlock("OLD_TURKIC", OLD_TURKIC_ID); /*[10C00]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock RUMI_NUMERAL_SYMBOLS =
    new UnicodeBlock("RUMI_NUMERAL_SYMBOLS", RUMI_NUMERAL_SYMBOLS_ID); /*[10E60]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock KAITHI =
    new UnicodeBlock("KAITHI", KAITHI_ID); /*[11080]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock EGYPTIAN_HIEROGLYPHS =
    new UnicodeBlock("EGYPTIAN_HIEROGLYPHS", EGYPTIAN_HIEROGLYPHS_ID); /*[13000]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock ENCLOSED_ALPHANUMERIC_SUPPLEMENT =
    new UnicodeBlock("ENCLOSED_ALPHANUMERIC_SUPPLEMENT",
                     ENCLOSED_ALPHANUMERIC_SUPPLEMENT_ID); /*[1F100]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock ENCLOSED_IDEOGRAPHIC_SUPPLEMENT =
    new UnicodeBlock("ENCLOSED_IDEOGRAPHIC_SUPPLEMENT",
                     ENCLOSED_IDEOGRAPHIC_SUPPLEMENT_ID); /*[1F200]*/
/** @stable ICU 4.4 */
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C =
    new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C",
                     CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_ID); /*[2A700]*/
// Sentinel block: getInstance(int) returns it for ids outside BLOCKS_,
// and of(int) returns it for code points above MAX_VALUE.
public static final UnicodeBlock INVALID_CODE
    = new UnicodeBlock("INVALID_CODE", INVALID_CODE_ID);
\r
// Sanity check at class-initialization time: every slot below COUNT must
// have been filled in by one of the UnicodeBlock constant declarations.
static {
    int blockId = 0;
    while (blockId < COUNT) {
        if (BLOCKS_[blockId] == null) {
            throw new java.lang.IllegalStateException(
                "UnicodeBlock.BLOCKS_[" + blockId + "] not initialized");
        }
        ++blockId;
    }
}
\r
2060 // public methods --------------------------------------------------
\r
2063 * {@icu} Returns the only instance of the UnicodeBlock with the argument ID.
\r
2064 * If no such ID exists, a INVALID_CODE UnicodeBlock will be returned.
\r
2065 * @param id UnicodeBlock ID
\r
2066 * @return the only instance of the UnicodeBlock with the argument ID
\r
2067 * if it exists, otherwise a INVALID_CODE UnicodeBlock will be
\r
2071 public static UnicodeBlock getInstance(int id)
\r
2073 if (id >= 0 && id < BLOCKS_.length) {
\r
2074 return BLOCKS_[id];
\r
2076 return INVALID_CODE;
\r
2080 * Returns the Unicode allocation block that contains the code point,
\r
2081 * or null if the code point is not a member of a defined block.
\r
2082 * @param ch code point to be tested
\r
2083 * @return the Unicode allocation block that contains the code point
\r
2086 public static UnicodeBlock of(int ch)
\r
2088 if (ch > MAX_VALUE) {
\r
2089 return INVALID_CODE;
\r
2092 return UnicodeBlock.getInstance((UCharacterProperty.INSTANCE.getAdditional(ch, 0)
\r
2093 & BLOCK_MASK_) >> BLOCK_SHIFT_);
\r
2097 * Internal function returning of(ch).getID().
\r
2100 * @return numeric block value
\r
2102 static int idOf(int ch) {
\r
2103 if (ch < 0 || ch > MAX_VALUE) {
\r
2107 return (UCharacterProperty.INSTANCE.getAdditional(ch, 0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
\r
2111 * Cover the JDK 1.5 API. Return the Unicode block with the
\r
2112 * given name. {@icunote} Unlike JDK 1.5, this only matches
\r
2113 * against the official UCD name and the Java block name
\r
2114 * (ignoring case).
\r
2115 * @param blockName the name of the block to match
\r
2116 * @return the UnicodeBlock with that name
\r
2117 * @throws IllegalArgumentException if the blockName could not be matched
\r
2120 public static final UnicodeBlock forName(String blockName) {
\r
2121 Map<String, UnicodeBlock> m = null;
\r
2122 if (mref != null) {
\r
2126 m = new HashMap<String, UnicodeBlock>(BLOCKS_.length);
\r
2127 for (int i = 0; i < BLOCKS_.length; ++i) {
\r
2128 UnicodeBlock b = BLOCKS_[i];
\r
2129 String name = trimBlockName(
\r
2130 getPropertyValueName(UProperty.BLOCK, b.getID(),
\r
2131 UProperty.NameChoice.LONG));
\r
2134 mref = new SoftReference<Map<String, UnicodeBlock>>(m);
\r
2136 UnicodeBlock b = m.get(trimBlockName(blockName));
\r
2138 throw new IllegalArgumentException();
\r
// Softly referenced cache of the normalized-name -> block map used by
// forName(String); forName creates it on demand and recreates it whenever
// the reference turns out to be empty.
private static SoftReference<Map<String, UnicodeBlock>> mref;
\r
2144 private static String trimBlockName(String name) {
\r
2145 String upper = name.toUpperCase();
\r
2146 StringBuilder result = new StringBuilder(upper.length());
\r
2147 for (int i = 0; i < upper.length(); i++) {
\r
2148 char c = upper.charAt(i);
\r
2149 if (c != ' ' && c != '_' && c != '-') {
\r
2153 return result.toString();
\r
2157 * {icu} Returns the type ID of this Unicode block
\r
2158 * @return integer type ID of this Unicode block
\r
2161 public int getID()
\r
2166 // private data members ---------------------------------------------
\r
/**
 * Identification code for this UnicodeBlock. The constructor also uses
 * this value as the index under which the instance is stored in BLOCKS_.
 */
private int m_id_;
\r
2173 // private constructor ----------------------------------------------
\r
/**
 * UnicodeBlock constructor. Besides recording the id, it registers the
 * new instance in the BLOCKS_ lookup table under that id.
 * @param name name of this UnicodeBlock
 * @param id unique id of this UnicodeBlock
 * @exception NullPointerException if name is <code>null</code>
 */
private UnicodeBlock(String name, int id) {
    super(name);
    this.m_id_ = id;
    // Negative ids have no slot in the lookup table.
    if (id < 0) {
        return;
    }
    BLOCKS_[id] = this;
}
\r
2192 * East Asian Width constants.
\r
2193 * @see UProperty#EAST_ASIAN_WIDTH
\r
2194 * @see UCharacter#getIntPropertyValue
\r
2197 public static interface EastAsianWidth
\r
2202 public static final int NEUTRAL = 0;
\r
2206 public static final int AMBIGUOUS = 1;
\r
2210 public static final int HALFWIDTH = 2;
\r
2214 public static final int FULLWIDTH = 3;
\r
2218 public static final int NARROW = 4;
\r
2222 public static final int WIDE = 5;
\r
2226 public static final int COUNT = 6;
\r
2230 * Decomposition Type constants.
\r
2231 * @see UProperty#DECOMPOSITION_TYPE
\r
2234 public static interface DecompositionType
\r
2239 public static final int NONE = 0;
\r
2243 public static final int CANONICAL = 1;
\r
2247 public static final int COMPAT = 2;
\r
2251 public static final int CIRCLE = 3;
\r
2255 public static final int FINAL = 4;
\r
2259 public static final int FONT = 5;
\r
2263 public static final int FRACTION = 6;
\r
2267 public static final int INITIAL = 7;
\r
2271 public static final int ISOLATED = 8;
\r
2275 public static final int MEDIAL = 9;
\r
2279 public static final int NARROW = 10;
\r
2283 public static final int NOBREAK = 11;
\r
2287 public static final int SMALL = 12;
\r
2291 public static final int SQUARE = 13;
\r
2295 public static final int SUB = 14;
\r
2299 public static final int SUPER = 15;
\r
2303 public static final int VERTICAL = 16;
\r
2307 public static final int WIDE = 17;
\r
2311 public static final int COUNT = 18;
\r
2315 * Joining Type constants.
\r
2316 * @see UProperty#JOINING_TYPE
\r
2319 public static interface JoiningType
\r
2324 public static final int NON_JOINING = 0;
\r
2328 public static final int JOIN_CAUSING = 1;
\r
2332 public static final int DUAL_JOINING = 2;
\r
2336 public static final int LEFT_JOINING = 3;
\r
2340 public static final int RIGHT_JOINING = 4;
\r
2344 public static final int TRANSPARENT = 5;
\r
2348 public static final int COUNT = 6;
\r
2352 * Joining Group constants.
\r
2353 * @see UProperty#JOINING_GROUP
\r
2356 public static interface JoiningGroup
\r
2361 public static final int NO_JOINING_GROUP = 0;
\r
2365 public static final int AIN = 1;
\r
2369 public static final int ALAPH = 2;
\r
2373 public static final int ALEF = 3;
\r
2377 public static final int BEH = 4;
\r
2381 public static final int BETH = 5;
\r
2385 public static final int DAL = 6;
\r
2389 public static final int DALATH_RISH = 7;
\r
2393 public static final int E = 8;
\r
2397 public static final int FEH = 9;
\r
2401 public static final int FINAL_SEMKATH = 10;
\r
2405 public static final int GAF = 11;
\r
2409 public static final int GAMAL = 12;
\r
2413 public static final int HAH = 13;
\r
2417 public static final int HAMZA_ON_HEH_GOAL = 14;
\r
2421 public static final int HE = 15;
\r
2425 public static final int HEH = 16;
\r
2429 public static final int HEH_GOAL = 17;
\r
2433 public static final int HETH = 18;
\r
2437 public static final int KAF = 19;
\r
2441 public static final int KAPH = 20;
\r
2445 public static final int KNOTTED_HEH = 21;
\r
2449 public static final int LAM = 22;
\r
2453 public static final int LAMADH = 23;
\r
2457 public static final int MEEM = 24;
\r
2461 public static final int MIM = 25;
\r
2465 public static final int NOON = 26;
\r
2469 public static final int NUN = 27;
\r
2473 public static final int PE = 28;
\r
2477 public static final int QAF = 29;
\r
2481 public static final int QAPH = 30;
\r
2485 public static final int REH = 31;
\r
2489 public static final int REVERSED_PE = 32;
\r
2493 public static final int SAD = 33;
\r
2497 public static final int SADHE = 34;
\r
2501 public static final int SEEN = 35;
\r
2505 public static final int SEMKATH = 36;
\r
2509 public static final int SHIN = 37;
\r
2513 public static final int SWASH_KAF = 38;
\r
2517 public static final int SYRIAC_WAW = 39;
\r
2521 public static final int TAH = 40;
\r
2525 public static final int TAW = 41;
\r
2529 public static final int TEH_MARBUTA = 42;
\r
2533 public static final int TETH = 43;
\r
2537 public static final int WAW = 44;
\r
2541 public static final int YEH = 45;
\r
2545 public static final int YEH_BARREE = 46;
\r
2549 public static final int YEH_WITH_TAIL = 47;
\r
2553 public static final int YUDH = 48;
\r
2557 public static final int YUDH_HE = 49;
\r
2561 public static final int ZAIN = 50;
\r
2565 public static final int FE = 51;
\r
2569 public static final int KHAPH = 52;
\r
2573 public static final int ZHAIN = 53;
\r
2577 public static final int BURUSHASKI_YEH_BARREE = 54;
\r
2578 /** @stable ICU 4.4 */
\r
2579 public static final int FARSI_YEH = 55;
\r
2580 /** @stable ICU 4.4 */
\r
2581 public static final int NYA = 56;
\r
2585 public static final int COUNT = 57;
\r
2589 * Grapheme Cluster Break constants.
\r
2590 * @see UProperty#GRAPHEME_CLUSTER_BREAK
\r
2593 public static interface GraphemeClusterBreak {
\r
2597 public static final int OTHER = 0;
\r
2601 public static final int CONTROL = 1;
\r
2605 public static final int CR = 2;
\r
2609 public static final int EXTEND = 3;
\r
2613 public static final int L = 4;
\r
2617 public static final int LF = 5;
\r
2621 public static final int LV = 6;
\r
2625 public static final int LVT = 7;
\r
2629 public static final int T = 8;
\r
2633 public static final int V = 9;
\r
2637 public static final int SPACING_MARK = 10;
\r
2641 public static final int PREPEND = 11;
\r
2645 public static final int COUNT = 12;
\r
2649 * Word Break constants.
\r
2650 * @see UProperty#WORD_BREAK
\r
2653 public static interface WordBreak {
\r
2657 public static final int OTHER = 0;
\r
2661 public static final int ALETTER = 1;
\r
2665 public static final int FORMAT = 2;
\r
2669 public static final int KATAKANA = 3;
\r
2673 public static final int MIDLETTER = 4;
\r
2677 public static final int MIDNUM = 5;
\r
2681 public static final int NUMERIC = 6;
\r
2685 public static final int EXTENDNUMLET = 7;
\r
2689 public static final int CR = 8;
\r
2693 public static final int EXTEND = 9;
\r
2697 public static final int LF = 10;
\r
2701 public static final int MIDNUMLET = 11;
\r
2705 public static final int NEWLINE = 12;
\r
2709 public static final int COUNT = 13;
\r
2713 * Sentence Break constants.
\r
2714 * @see UProperty#SENTENCE_BREAK
\r
2717 public static interface SentenceBreak {
\r
2721 public static final int OTHER = 0;
\r
2725 public static final int ATERM = 1;
\r
2729 public static final int CLOSE = 2;
\r
2733 public static final int FORMAT = 3;
\r
2737 public static final int LOWER = 4;
\r
2741 public static final int NUMERIC = 5;
\r
2745 public static final int OLETTER = 6;
\r
2749 public static final int SEP = 7;
\r
2753 public static final int SP = 8;
\r
2757 public static final int STERM = 9;
\r
2761 public static final int UPPER = 10;
\r
2765 public static final int CR = 11;
\r
2769 public static final int EXTEND = 12;
\r
2773 public static final int LF = 13;
\r
2777 public static final int SCONTINUE = 14;
\r
2781 public static final int COUNT = 15;
\r
2785 * Line Break constants.
\r
2786 * @see UProperty#LINE_BREAK
\r
// Integer constants naming line-break classes. Values run contiguously from 0 (UNKNOWN)
// to 36 (CLOSE_PARENTHESIS); COUNT is one past the highest value.
// NOTE(review): this listing has dropped brace and Javadoc-delimiter lines; confirm
// against the full file before restructuring.
2789 public static interface LineBreak
\r
2794 public static final int UNKNOWN = 0;
\r
2798 public static final int AMBIGUOUS = 1;
\r
2802 public static final int ALPHABETIC = 2;
\r
2806 public static final int BREAK_BOTH = 3;
\r
2810 public static final int BREAK_AFTER = 4;
\r
2814 public static final int BREAK_BEFORE = 5;
\r
2818 public static final int MANDATORY_BREAK = 6;
\r
2822 public static final int CONTINGENT_BREAK = 7;
\r
2826 public static final int CLOSE_PUNCTUATION = 8;
\r
2830 public static final int COMBINING_MARK = 9;
\r
2834 public static final int CARRIAGE_RETURN = 10;
\r
2838 public static final int EXCLAMATION = 11;
\r
2842 public static final int GLUE = 12;
\r
2846 public static final int HYPHEN = 13;
\r
2850 public static final int IDEOGRAPHIC = 14;
\r
2852 * @see #INSEPARABLE
\r
// INSEPERABLE (misspelled) and INSEPARABLE below share the value 15, so either
// name can be used interchangeably by callers.
2855 public static final int INSEPERABLE = 15;
\r
2857 * Renamed from the misspelled "inseperable" in Unicode 4.0.1.
\r
2860 public static final int INSEPARABLE = 15;
\r
2864 public static final int INFIX_NUMERIC = 16;
\r
2868 public static final int LINE_FEED = 17;
\r
2872 public static final int NONSTARTER = 18;
\r
2876 public static final int NUMERIC = 19;
\r
2880 public static final int OPEN_PUNCTUATION = 20;
\r
2884 public static final int POSTFIX_NUMERIC = 21;
\r
2888 public static final int PREFIX_NUMERIC = 22;
\r
2892 public static final int QUOTATION = 23;
\r
2896 public static final int COMPLEX_CONTEXT = 24;
\r
2900 public static final int SURROGATE = 25;
\r
2904 public static final int SPACE = 26;
\r
2908 public static final int BREAK_SYMBOLS = 27;
\r
2912 public static final int ZWSPACE = 28;
\r
2917 public static final int NEXT_LINE = 29; /*[NL]*/
\r
2919 /* from here on: new in Unicode 4/ICU 2.6 */
\r
2924 public static final int WORD_JOINER = 30; /*[WJ]*/
\r
2926 /* from here on: new in Unicode 4.1/ICU 3.4 */
\r
2931 public static final int H2 = 31;
\r
2935 public static final int H3 = 32;
\r
2939 public static final int JL = 33;
\r
2943 public static final int JT = 34;
\r
2947 public static final int JV = 35;
\r
2948 /** @stable ICU 4.4 */
\r
2949 public static final int CLOSE_PARENTHESIS = 36; /*[CP]*/
\r
2951 /* new in Unicode 5.2/ICU 4.4 */
\r
// COUNT must stay one greater than the largest constant above (currently 36).
2956 public static final int COUNT = 37;
\r
2960 * Numeric Type constants.
\r
2961 * @see UProperty#NUMERIC_TYPE
\r
// Integer constants for the numeric-type property: four values 0..3, with COUNT
// equal to the number of values.
2964 public static interface NumericType
\r
2969 public static final int NONE = 0;
\r
2973 public static final int DECIMAL = 1;
\r
2977 public static final int DIGIT = 2;
\r
2981 public static final int NUMERIC = 3;
\r
// One past the highest value above.
2985 public static final int COUNT = 4;
\r
2989 * Hangul Syllable Type constants.
\r
2991 * @see UProperty#HANGUL_SYLLABLE_TYPE
\r
// Integer constants for the Hangul syllable type property: six values 0..5, with
// COUNT equal to the number of values. The bracketed trailing comments give the
// short alias of each value.
2994 public static interface HangulSyllableType
\r
2999 public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
\r
3003 public static final int LEADING_JAMO = 1; /*[L]*/
\r
3007 public static final int VOWEL_JAMO = 2; /*[V]*/
\r
3011 public static final int TRAILING_JAMO = 3; /*[T]*/
\r
3015 public static final int LV_SYLLABLE = 4; /*[LV]*/
\r
3019 public static final int LVT_SYLLABLE = 5; /*[LVT]*/
\r
// One past the highest value above.
3023 public static final int COUNT = 6;
\r
3026 // public data members -----------------------------------------------
\r
3029 * The lowest Unicode code point value.
\r
// Public constants of the class. The code point bounds are re-exported from UTF16 so
// callers can reference them through UCharacter.
3032 public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
\r
3035 * The highest Unicode code point value (scalar value) according to the
\r
3036 * Unicode Standard.
\r
3037 * This is a 21-bit value (21 bits, rounded up).<br>
\r
3038 * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
\r
3041 public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
\r
3044 * The minimum value for Supplementary code points
\r
3047 public static final int SUPPLEMENTARY_MIN_VALUE =
\r
3048 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
3051 * Unicode value used when translating into Unicode encoding form and there
\r
3052 * is no existing character.
\r
// The char literal is widened to int, so the stored value is 0xFFFD.
3055 public static final int REPLACEMENT_CHAR = '\uFFFD';
\r
3058 * Special value that is returned by getUnicodeNumericValue(int) when no
\r
3059 * numeric value is defined for a code point.
\r
3061 * @see #getUnicodeNumericValue
\r
// Sentinel value; the int literal is widened to double at initialization.
3063 public static final double NO_NUMERIC_VALUE = -123456789;
\r
3066 * Compatibility constant for Java Character's MIN_RADIX.
\r
3069 public static final int MIN_RADIX = java.lang.Character.MIN_RADIX;
\r
3072 * Compatibility constant for Java Character's MAX_RADIX.
\r
3075 public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
\r
3078 * Do not lowercase non-initial parts of words when titlecasing.
\r
3079 * Option bit for titlecasing APIs that take an options bit set.
\r
3081 * By default, titlecasing will titlecase the first cased character
\r
3082 * of a word and lowercase all other characters.
\r
3083 * With this option, the other characters will not be modified.
\r
3085 * @see #toTitleCase
\r
// Single bit (0x100); distinct from TITLECASE_NO_BREAK_ADJUSTMENT so the two can be OR-ed.
3088 public static final int TITLECASE_NO_LOWERCASE = 0x100;
\r
3091 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
\r
3092 * titlecase exactly the characters at breaks from the iterator.
\r
3093 * Option bit for titlecasing APIs that take an options bit set.
\r
3095 * By default, titlecasing will take each break iterator index,
\r
3096 * adjust it by looking for the next cased character, and titlecase that one.
\r
3097 * Other characters are lowercased.
\r
3099 * This follows Unicode 4 & 5 section 3.13 Default Case Operations:
\r
3101 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
\r
3102 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
\r
3103 * cased character F. If F exists, map F to default_title(F); then map each
\r
3104 * subsequent character C to default_lower(C).
\r
3106 * @see #toTitleCase
\r
3107 * @see #TITLECASE_NO_LOWERCASE
\r
// Single bit (0x200).
3110 public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
\r
3112 // public methods ----------------------------------------------------
\r
3115 * Returns the numeric value of a decimal digit code point.
\r
3116 * <br>This method observes the semantics of
\r
3117 * <code>java.lang.Character.digit()</code>. Note that this
\r
3118 * will return positive values for code points for which isDigit
\r
3119 * returns false, just like java.lang.Character.
\r
3120 * <br><em>Semantic Change:</em> In release 1.3.1 and
\r
3121 * prior, this did not treat the European letters as having a
\r
3122 * digit value, and also treated numeric letters and other numbers as
\r
3124 * This has been changed to conform to the java semantics.
\r
3125 * <br>A code point is a valid digit if and only if:
\r
3127 * <li>ch is a decimal digit or one of the european letters, and
\r
3128 * <li>the value of ch is less than the specified radix.
\r
3130 * @param ch the code point to query
\r
3131 * @param radix the radix
\r
3132 * @return the numeric value represented by the code point in the
\r
3133 * specified radix, or -1 if the code point is not a decimal digit
\r
3134 * or if its value is too large for the radix
\r
// Radix-aware digit lookup. Only radices 2..36 are attempted; any other radix
// yields -1. Inside the valid-radix branch the value comes from digit(ch) and/or
// getEuropeanDigit(ch), and is returned only when it is smaller than the radix.
// NOTE(review): this listing has dropped lines between the statements below
// (braces and presumably a guard around the getEuropeanDigit fallback) — confirm
// against the full file before editing.
3137 public static int digit(int ch, int radix)
\r
3139 if (2 <= radix && radix <= 36) {
\r
3140 int value = digit(ch);
\r
3142 // ch is not a decimal digit, try latin letters
\r
3143 value = getEuropeanDigit(ch);
\r
3145 return (value < radix) ? value : -1;
\r
3147 return -1; // invalid radix
\r
3152 * Returns the numeric value of a decimal digit code point.
\r
3153 * <br>This is a convenience overload of <code>digit(int, int)</code>
\r
3154 * that provides a decimal radix.
\r
3155 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
\r
3156 * treated numeric letters and other numbers as digits. This has
\r
3157 * been changed to conform to the java semantics.
\r
3158 * @param ch the code point to query
\r
3159 * @return the numeric value represented by the code point,
\r
3160 * or -1 if the code point is not a decimal digit or if its
\r
3161 * value is too large for a decimal radix
\r
// Single-argument overload, also called first by digit(int, int).
// Computes a candidate value as the offset of ch's numeric-type value from
// NTV_DECIMAL_START_.
// NOTE(review): the rest of the body (range check and return) is not visible in
// this listing.
3164 public static int digit(int ch)
\r
3166 int props = getProperty(ch);
\r
3167 int value = getNumericTypeValue(props) - NTV_DECIMAL_START_;
\r
3176 * Returns the numeric value of the code point as a nonnegative
\r
3178 * <br>If the code point does not have a numeric value, then -1 is returned.
\r
3180 * If the code point has a numeric value that cannot be represented as a
\r
3181 * nonnegative integer (for example, a fractional value), then -2 is
\r
3183 * @param ch the code point to query
\r
3184 * @return the numeric value of the code point, or -1 if it has no numeric
\r
3185 * value, or -2 if it has a numeric value that cannot be represented as a
\r
3186 * nonnegative integer
\r
// Integer-valued numeric lookup. The numeric-type value (ntv) of ch is classified by
// comparing it against ascending NTV_*_START_ thresholds; each range is decoded as an
// offset from the start of its range. When ntv is NTV_NONE_ the result falls back to
// getEuropeanDigit(ch).
// NOTE(review): several lines of this method (the body of the fraction branch and
// the tail of the large-integer branch) are missing from this listing.
3189 public static int getNumericValue(int ch)
\r
3191 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
\r
3192 int props = UCharacterProperty.INSTANCE.getProperty(ch);
\r
3193 int ntv = getNumericTypeValue(props);
\r
3195 if(ntv==NTV_NONE_) {
\r
3196 return getEuropeanDigit(ch);
\r
3197 } else if(ntv<NTV_DIGIT_START_) {
\r
3198 /* decimal digit */
\r
3199 return ntv-NTV_DECIMAL_START_;
\r
3200 } else if(ntv<NTV_NUMERIC_START_) {
\r
// value is the offset from the start of the "digit" range
3202 return ntv-NTV_DIGIT_START_;
\r
3203 } else if(ntv<NTV_FRACTION_START_) {
\r
3204 /* small integer */
\r
3205 return ntv-NTV_NUMERIC_START_;
\r
3206 } else if(ntv<NTV_LARGE_START_) {
\r
3209 } else if(ntv<NTV_RESERVED_START_) {
\r
3210 /* large, single-significant-digit integer */
\r
// mantissa is stored in the bits above bit 4 (biased by 14), exponent in the low
// five bits (biased by 2)
3211 int mant=(ntv>>5)-14;
\r
3212 int exp=(ntv&0x1f)+2;
\r
// presumably restricts the result to values that fit in an int — confirm
3213 if(exp<9 || (exp==9 && mant<=2)) {
\r
3214 int numValue=mant;
\r
3229 * {@icu} Returns the numeric value for a Unicode code point as defined in the
\r
3230 * Unicode Character Database.</p>
\r
3231 * <p>A "double" return type is necessary because some numeric values are
\r
3232 * fractions, negative, or too large for int.</p>
\r
3233 * <p>For characters without any numeric values in the Unicode Character
\r
3234 * Database, this function will return NO_NUMERIC_VALUE.</p>
\r
3235 * <p><em>API Change:</em> In release 2.2 and prior, this API has a
\r
3236 * return type int and returns -1 when the argument ch does not have a
\r
3237 * corresponding numeric value. This has been changed to synch with ICU4C
\r
3239 * This corresponds to the ICU4C function u_getNumericValue.
\r
3240 * @param ch Code point to get the numeric value for.
\r
3241 * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
\r
// Double-valued numeric lookup. The numeric-type value (ntv) of ch is classified by
// comparing it against ascending NTV_*_START_ thresholds and decoded per range:
// plain offsets for the decimal/digit/small-integer ranges, numerator/denominator for
// fractions, and mantissa/exponent for large integers. NO_NUMERIC_VALUE is returned
// when ntv is NTV_NONE_ and on the final fall-through.
// NOTE(review): the lines that scale the mantissa by the exponent are missing from
// this listing.
3244 public static double getUnicodeNumericValue(int ch)
\r
3246 // equivalent to c version double u_getNumericValue(UChar32 c)
\r
3247 int props = UCharacterProperty.INSTANCE.getProperty(ch);
\r
3248 int ntv = getNumericTypeValue(props);
\r
3250 if(ntv==NTV_NONE_) {
\r
3251 return NO_NUMERIC_VALUE;
\r
3252 } else if(ntv<NTV_DIGIT_START_) {
\r
3253 /* decimal digit */
\r
3254 return ntv-NTV_DECIMAL_START_;
\r
3255 } else if(ntv<NTV_NUMERIC_START_) {
\r
// value is the offset from the start of the "digit" range
3257 return ntv-NTV_DIGIT_START_;
\r
3258 } else if(ntv<NTV_FRACTION_START_) {
\r
3259 /* small integer */
\r
3260 return ntv-NTV_NUMERIC_START_;
\r
3261 } else if(ntv<NTV_LARGE_START_) {
\r
// fraction: numerator in the bits above bit 3 (biased by 12), denominator in the
// low four bits (stored minus one)
3263 int numerator=(ntv>>4)-12;
\r
3264 int denominator=(ntv&0xf)+1;
\r
// cast before dividing so the division is done in floating point
3265 return (double)numerator/denominator;
\r
3266 } else if(ntv<NTV_RESERVED_START_) {
\r
3267 /* large, single-significant-digit integer */
\r
// mantissa in the bits above bit 4 (biased by 14), exponent in the low five bits
// (biased by 2)
3269 int mant=(ntv>>5)-14;
\r
3270 int exp=(ntv&0x1f)+2;
\r
3273 /* multiply by 10^exp without math.h */
\r
3296 return NO_NUMERIC_VALUE;
\r
3301 * Compatibility override of Java deprecated method. This
\r
3302 * method will always remain deprecated.
\r
3303 * Same as java.lang.Character.isSpace().
\r
3304 * @param ch the code point
\r
3305 * @return true if the code point is a space character as
\r
3306 * defined by java.lang.Character.isSpace.
\r
3307 * @deprecated ICU 3.4 (Java)
\r
// Returns true only for U+0020, U+0009, U+000A, U+000C and U+000D.
// The leading ch <= 0x20 test short-circuits every code point above SPACE.
3309 public static boolean isSpace(int ch) {
\r
3310 return ch <= 0x20 &&
\r
3311 (ch == 0x20 || ch == 0x09 || ch == 0x0a || ch == 0x0c || ch == 0x0d);
\r
3315 * Returns a value indicating a code point's Unicode category.
\r
3316 * Up-to-date Unicode implementation of java.lang.Character.getType()
\r
3317 * except for the above mentioned code points that had their category
\r
3319 * Return results are constants from the interface
\r
3320 * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
\r
3321 * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
\r
3322 * those returned by java.lang.Character.getType. UCharacterCategory values
\r
3323 * match the ones used in ICU4C, while java.lang.Character type
\r
3324 * values, though similar, skip the value 17.</p>
\r
3325 * @param ch code point whose type is to be determined
\r
3326 * @return category which is a value of UCharacterCategory
\r
// Extracts the category from the property word of ch by masking it with
// UCharacterProperty.TYPE_MASK. Most of the is*() predicates in this listing
// are built on this value.
3329 public static int getType(int ch)
\r
3331 return getProperty(ch) & UCharacterProperty.TYPE_MASK;
\r
3335 * Determines if a code point has a defined meaning in the up-to-date
\r
3336 * Unicode standard.
\r
3337 * E.g. supplementary code points though allocated space are not defined in
\r
3338 * Unicode yet.<br>
\r
3339 * Up-to-date Unicode implementation of java.lang.Character.isDefined()
\r
3340 * @param ch code point to be determined if it is defined in the most
\r
3341 * current version of Unicode
\r
3342 * @return true if this code point is defined in unicode
\r
// A code point counts as defined when its category value is non-zero.
// NOTE(review): presumably 0 is the "unassigned" category — confirm against
// UCharacterCategory.
3345 public static boolean isDefined(int ch)
\r
3347 return getType(ch) != 0;
\r
3351 * Determines if a code point is a Java digit.
\r
3352 * <br>This method observes the semantics of
\r
3353 * <code>java.lang.Character.isDigit()</code>. It returns true for decimal
\r
3355 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this treated
\r
3356 * numeric letters and other numbers as digits.
\r
3357 * This has been changed to conform to the java semantics.
\r
3358 * @param ch code point to query
\r
3359 * @return true if this code point is a digit
\r
// True only when the category of ch is DECIMAL_DIGIT_NUMBER; other number
// categories are not treated as digits here.
3362 public static boolean isDigit(int ch)
\r
3364 return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
\r
3368 * Determines if the specified code point is an ISO control character.
\r
3369 * A code point is considered to be an ISO control character if it is in
\r
3370 * the range \u0000 through \u001F or in the range \u007F through
\r
3372 * Up-to-date Unicode implementation of java.lang.Character.isISOControl()
\r
3373 * @param ch code point to determine if it is an ISO control character
\r
3374 * @return true if code point is an ISO control character
\r
// Range test with a hole: accepts 0..APPLICATION_PROGRAM_COMMAND_ except the values
// strictly between UNIT_SEPARATOR_ and DELETE_. Negative inputs are rejected.
// NOTE(review): the three constants are declared elsewhere; presumably U+009F,
// U+001F and U+007F — confirm against their declarations.
3377 public static boolean isISOControl(int ch)
\r
3379 return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ &&
\r
3380 ((ch <= UNIT_SEPARATOR_) || (ch >= DELETE_));
\r
3384 * Determines if the specified code point is a letter.
\r
3385 * Up-to-date Unicode implementation of java.lang.Character.isLetter()
\r
3386 * @param ch code point to determine if it is a letter
\r
3387 * @return true if code point is a letter
\r
// Category-set membership test: shifts 1 by the category of ch and checks the
// resulting bit against a mask of the five letter categories (upper, lower, title,
// modifier, other).
3390 public static boolean isLetter(int ch)
\r
3392 // if props == 0, it will just fall through and return false
\r
3393 return ((1 << getType(ch))
\r
3394 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3395 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3396 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3397 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3398 | (1 << UCharacterCategory.OTHER_LETTER))) != 0;
\r
3402 * Determines if the specified code point is a letter or digit.
\r
3403 * {@icunote} This method, unlike java.lang.Character does not regard the ascii
\r
3404 * characters 'A' - 'Z' and 'a' - 'z' as digits.
\r
3405 * @param ch code point to determine if it is a letter or a digit
\r
3406 * @return true if code point is a letter or a digit
\r
// Same bit-mask technique as isLetter(), with DECIMAL_DIGIT_NUMBER added to the
// set of accepted categories.
3409 public static boolean isLetterOrDigit(int ch)
\r
3411 return ((1 << getType(ch))
\r
3412 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3413 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3414 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3415 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3416 | (1 << UCharacterCategory.OTHER_LETTER)
\r
3417 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER))) != 0;
\r
3421 * Compatibility override of Java deprecated method. This
\r
3422 * method will always remain deprecated. Delegates to
\r
3423 * java.lang.Character.isJavaIdentifierStart.
\r
3424 * @param cp the code point
\r
3425 * @return true if the code point can start a java identifier.
\r
3426 * @deprecated ICU 3.4 (Java)
\r
// Deprecated alias: forwards unchanged to isJavaIdentifierStart(cp) in this class.
3428 public static boolean isJavaLetter(int cp) {
\r
3429 return isJavaIdentifierStart(cp);
\r
3433 * Compatibility override of Java deprecated method. This
\r
3434 * method will always remain deprecated. Delegates to
\r
3435 * java.lang.Character.isJavaIdentifierPart.
\r
3436 * @param cp the code point
\r
3437 * @return true if the code point can continue a java identifier.
\r
3438 * @deprecated ICU 3.4 (Java)
\r
// Deprecated alias: forwards unchanged to isJavaIdentifierPart(cp) in this class.
3440 public static boolean isJavaLetterOrDigit(int cp) {
\r
3441 return isJavaIdentifierPart(cp);
\r
3445 * Compatibility override of Java method, delegates to
\r
3446 * java.lang.Character.isJavaIdentifierStart.
\r
3447 * @param cp the code point
\r
3448 * @return true if the code point can start a java identifier.
\r
3451 public static boolean isJavaIdentifierStart(int cp) {
\r
3452 // note, downcast to char for jdk 1.4 compatibility
\r
3453 return java.lang.Character.isJavaIdentifierStart((char)cp);
\r
3457 * Compatibility override of Java method, delegates to
\r
3458 * java.lang.Character.isJavaIdentifierPart.
\r
3459 * @param cp the code point
\r
3460 * @return true if the code point can continue a java identifier.
\r
3463 public static boolean isJavaIdentifierPart(int cp) {
\r
3464 // note, downcast to char for jdk 1.4 compatibility
\r
3465 return java.lang.Character.isJavaIdentifierPart((char)cp);
\r
3469 * Determines if the specified code point is a lowercase character.
\r
3470 * UnicodeData only contains case mappings for code points where they are
\r
3471 * one-to-one mappings; it also omits information about context-sensitive
\r
3472 * case mappings.<br> For more information about Unicode case mapping
\r
3473 * please refer to the
\r
3474 * <a href=http://www.unicode.org/unicode/reports/tr21/>Technical report
\r
3476 * Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
\r
3477 * @param ch code point to determine if it is in lowercase
\r
3478 * @return true if code point is a lowercase character
\r
// True only when the category of ch is exactly LOWERCASE_LETTER.
3481 public static boolean isLowerCase(int ch)
\r
3483 // if props == 0, it will just fall through and return false
\r
3484 return getType(ch) == UCharacterCategory.LOWERCASE_LETTER;
\r
3488 * Determines if the specified code point is a white space character.
\r
3489 * A code point is considered to be a whitespace character if and only
\r
3490 * if it satisfies one of the following criteria:
\r
3492 * <li> It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), but is not
\r
3493 * also a non-breaking space (\u00A0 or \u2007 or \u202F).
\r
3494 * <li> It is \u0009, HORIZONTAL TABULATION.
\r
3495 * <li> It is \u000A, LINE FEED.
\r
3496 * <li> It is \u000B, VERTICAL TABULATION.
\r
3497 * <li> It is \u000C, FORM FEED.
\r
3498 * <li> It is \u000D, CARRIAGE RETURN.
\r
3499 * <li> It is \u001C, FILE SEPARATOR.
\r
3500 * <li> It is \u001D, GROUP SEPARATOR.
\r
3501 * <li> It is \u001E, RECORD SEPARATOR.
\r
3502 * <li> It is \u001F, UNIT SEPARATOR.
\r
3505 * This API tries to sync with the semantics of Java's
\r
3506 * java.lang.Character.isWhitespace(), but it may not return
\r
3507 * the exact same results because of the Unicode version
\r
3509 * <p>Note: Unicode 4.0.1 changed U+200B ZERO WIDTH SPACE from a Space Separator (Zs)
\r
3510 * to a Format Control (Cf). Since then, isWhitespace(0x200b) returns false.
\r
3511 * See http://www.unicode.org/versions/Unicode4.0.1/
\r
3512 * @param ch code point to determine if it is a white space
\r
3513 * @return true if the specified code point is a white space character
\r
// The return expression is a three-way OR (&& binds tighter than ||):
//   1. the category is one of the three separator categories AND ch is none of the
//      three no-break space constants, or
//   2. ch is in 0x09..0x0d, or
//   3. ch is in 0x1c..0x1f.
3516 public static boolean isWhitespace(int ch)
\r
3518 // exclude no-break spaces
\r
3519 // if props == 0, it will just fall through and return false
\r
3520 return ((1 << getType(ch)) &
\r
3521 ((1 << UCharacterCategory.SPACE_SEPARATOR)
\r
3522 | (1 << UCharacterCategory.LINE_SEPARATOR)
\r
3523 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR))) != 0
\r
3524 && (ch != NO_BREAK_SPACE_) && (ch != FIGURE_SPACE_) && (ch != NARROW_NO_BREAK_SPACE_)
\r
3525 // TAB VT LF FF CR FS GS RS US NL are all control characters
\r
3526 // that are white spaces.
\r
3527 || (ch >= 0x9 && ch <= 0xd) || (ch >= 0x1c && ch <= 0x1f);
\r
3531 * Determines if the specified code point is a Unicode specified space
\r
3532 * character, i.e. if code point is in the category Zs, Zl and Zp.
\r
3533 * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
\r
3534 * @param ch code point to determine if it is a space
\r
3535 * @return true if the specified code point is a space character
\r
// Bit-mask test of the category of ch against the three separator categories
// (space, line, paragraph).
// NOTE(review): the end of the return expression is missing from this listing.
3538 public static boolean isSpaceChar(int ch)
\r
3540 // if props == 0, it will just fall through and return false
\r
3541 return ((1 << getType(ch)) & ((1 << UCharacterCategory.SPACE_SEPARATOR)
\r
3542 | (1 << UCharacterCategory.LINE_SEPARATOR)
\r
3543 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR)))
\r
3548 * Determines if the specified code point is a titlecase character.
\r
3549 * UnicodeData only contains case mappings for code points where they are
\r
3550 * one-to-one mappings; it also omits information about context-sensitive
\r
3551 * case mappings.<br>
\r
3552 * For more information about Unicode case mapping please refer to the
\r
3553 * <a href=http://www.unicode.org/unicode/reports/tr21/>
\r
3554 * Technical report #21</a>.<br>
\r
3555 * Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
\r
3556 * @param ch code point to determine if it is in title case
\r
3557 * @return true if the specified code point is a titlecase character
\r
// True only when the category of ch is exactly TITLECASE_LETTER.
3560 public static boolean isTitleCase(int ch)
\r
3562 // if props == 0, it will just fall through and return false
\r
3563 return getType(ch) == UCharacterCategory.TITLECASE_LETTER;
\r
3567 * Determines if the specified code point may be any part of a Unicode
\r
3568 * identifier other than the starting character.
\r
3569 * A code point may be part of a Unicode identifier if and only if it is
\r
3570 * one of the following:
\r
3572 * <li> Lu Uppercase letter
\r
3573 * <li> Ll Lowercase letter
\r
3574 * <li> Lt Titlecase letter
\r
3575 * <li> Lm Modifier letter
\r
3576 * <li> Lo Other letter
\r
3577 * <li> Nl Letter number
\r
3578 * <li> Pc Connecting punctuation character
\r
3579 * <li> Nd decimal number
\r
3580 * <li> Mc Spacing combining mark
\r
3581 * <li> Mn Non-spacing mark
\r
3582 * <li> Cf formatting code
\r
3584 * Up-to-date Unicode implementation of
\r
3585 * java.lang.Character.isUnicodeIdentifierPart().<br>
\r
3586 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
\r
3587 * @param ch code point to determine if it can be part of a Unicode
\r
3589 * @return true if code point is any character belonging to a Unicode
\r
3590 * identifier suffix after the first character
\r
// Accepts ch when its category bit is in the mask of letter, letter-number,
// connector-punctuation, decimal-digit and combining/non-spacing mark categories,
// or when isIdentifierIgnorable(ch) is true.
3593 public static boolean isUnicodeIdentifierPart(int ch)
\r
3595 // if props == 0, it will just fall through and return false
\r
3597 return ((1 << getType(ch))
\r
3598 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3599 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3600 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3601 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3602 | (1 << UCharacterCategory.OTHER_LETTER)
\r
3603 | (1 << UCharacterCategory.LETTER_NUMBER)
\r
3604 | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
\r
3605 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
\r
3606 | (1 << UCharacterCategory.COMBINING_SPACING_MARK)
\r
3607 | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
\r
3608 || isIdentifierIgnorable(ch);
\r
3612 * Determines if the specified code point is permissible as the first
\r
3613 * character in a Unicode identifier.
\r
3614 * A code point may start a Unicode identifier if it is of type either
\r
3616 * <li> Lu Uppercase letter
\r
3617 * <li> Ll Lowercase letter
\r
3618 * <li> Lt Titlecase letter
\r
3619 * <li> Lm Modifier letter
\r
3620 * <li> Lo Other letter
\r
3621 * <li> Nl Letter number
\r
3623 * Up-to-date Unicode implementation of
\r
3624 * java.lang.Character.isUnicodeIdentifierStart().<br>
\r
3625 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
\r
3626 * @param ch code point to determine if it can start a Unicode identifier
\r
3627 * @return true if code point is the first character belonging a unicode
\r
// Bit-mask test of the category of ch against the five letter categories plus
// LETTER_NUMBER.
3631 public static boolean isUnicodeIdentifierStart(int ch)
\r
3633 /*int cat = getType(ch);*/
\r
3634 // if props == 0, it will just fall through and return false
\r
3635 return ((1 << getType(ch))
\r
3636 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3637 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3638 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3639 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3640 | (1 << UCharacterCategory.OTHER_LETTER)
\r
3641 | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
\r
3645 * Determines if the specified code point should be regarded as an
\r
3646 * ignorable character in a Java identifier.
\r
3647 * A character is Java-identifier-ignorable if it has the general category
\r
3648 * Cf Formatting Control, or it is a non-Java-whitespace ISO control:
\r
3649 * U+0000..U+0008, U+000E..U+001B, U+007F..U+009F.<br>
\r
3650 * Up-to-date Unicode implementation of
\r
3651 * java.lang.Character.isIdentifierIgnorable().<br>
\r
3652 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
\r
3653 * <p>Note that Unicode just recommends to ignore Cf (format controls).
\r
3654 * @param ch code point to be determined if it can be ignored in a Unicode
\r
3656 * @return true if the code point is ignorable
\r
// Two outcomes are visible: an ISO-control test that excludes 0x09..0x0d and
// 0x1c..0x1f, and a category test for FORMAT.
// NOTE(review): the condition that selects between the two returns is missing from
// this listing — confirm against the full file.
3659 public static boolean isIdentifierIgnorable(int ch)
\r
3661 // see java.lang.Character.isIdentifierIgnorable() on range of
\r
3662 // ignorable characters.
\r
3664 return isISOControl(ch)
\r
3665 && !((ch >= 0x9 && ch <= 0xd)
\r
3666 || (ch >= 0x1c && ch <= 0x1f));
\r
3668 return getType(ch) == UCharacterCategory.FORMAT;
\r
3672 * Determines if the specified code point is an uppercase character.
\r
3673 * UnicodeData only contains case mappings for code point where they are
\r
3674 * one-to-one mappings; it also omits information about context-sensitive
\r
3675 * case mappings.<br>
\r
3676 * For language specific case conversion behavior, use
\r
3677 * toUpperCase(locale, str). <br>
\r
3678 * For example, the case conversion for dot-less i and dotted I in Turkish,
\r
3679 * or for final sigma in Greek.
\r
3680 * For more information about Unicode case mapping please refer to the
\r
3681 * <a href=http://www.unicode.org/unicode/reports/tr21/>
\r
3682 * Technical report #21</a>.<br>
\r
3683 * Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
\r
3684 * @param ch code point to determine if it is in uppercase
\r
3685 * @return true if the code point is an uppercase character
\r
// True only when the category of ch is exactly UPPERCASE_LETTER.
3688 public static boolean isUpperCase(int ch)
\r
3690 // if props == 0, it will just fall through and return false
\r
3691 return getType(ch) == UCharacterCategory.UPPERCASE_LETTER;
\r
3695 * The given code point is mapped to its lowercase equivalent; if the code
\r
3696 * point has no lowercase equivalent, the code point itself is returned.
\r
3697 * Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
\r
3699 * <p>This function only returns the simple, single-code point case mapping.
\r
3700 * Full case mappings should be used whenever possible because they produce
\r
3701 * better results by working on whole strings.
\r
3702 * They take into account the string context and the language and can map
\r
3703 * to a result string with a different length as appropriate.
\r
3704 * Full case mappings are applied by the case mapping functions
\r
3705 * that take String parameters rather than code points (int).
\r
3706 * See also the User Guide chapter on C/POSIX migration:
\r
3707 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
3709 * @param ch code point whose lowercase equivalent is to be retrieved
\r
3710 * @return the lowercase equivalent code point
\r
// Thin wrapper: the mapping is done by UCaseProps.INSTANCE.tolower(ch).
3713 public static int toLowerCase(int ch) {
\r
3714 return UCaseProps.INSTANCE.tolower(ch);
\r
3718 * Converts argument code point and returns a String object representing
\r
3719 * the code point's value in UTF16 format.
\r
3720 * The result is a string whose length is 1 for non-supplementary code
\r
3721 * points, 2 otherwise.<br>
\r
3722 * com.ibm.icu.text.UTF16 can be used to parse Strings generated by this
\r
3724 * Up-to-date Unicode implementation of java.lang.Character.toString()
\r
3725 * @param ch code point
\r
3726 * @return string representation of the code point, null if code point is not
\r
3727 * defined in unicode
\r
// Converts a code point to its UTF-16 string form: code points below
// SUPPLEMENTARY_MIN_VALUE become a one-char string, others are assembled from the
// lead and trail surrogates supplied by UTF16.
// NOTE(review): the body of the out-of-range branch is missing from this listing.
3730 public static String toString(int ch)
\r
3732 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
3736 if (ch < SUPPLEMENTARY_MIN_VALUE) {
\r
3737 return String.valueOf((char)ch);
\r
3740 StringBuilder result = new StringBuilder();
\r
3741 result.append(UTF16.getLeadSurrogate(ch));
\r
3742 result.append(UTF16.getTrailSurrogate(ch));
\r
3743 return result.toString();
\r
3747 * Converts the code point argument to titlecase.
\r
3748 * If no titlecase is available, the uppercase is returned. If no uppercase
\r
3749 * is available, the code point itself is returned.
\r
3750 * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
\r
3752 * <p>This function only returns the simple, single-code point case mapping.
\r
3753 * Full case mappings should be used whenever possible because they produce
\r
3754 * better results by working on whole strings.
\r
3755 * They take into account the string context and the language and can map
\r
3756 * to a result string with a different length as appropriate.
\r
3757 * Full case mappings are applied by the case mapping functions
\r
3758 * that take String parameters rather than code points (int).
\r
3759 * See also the User Guide chapter on C/POSIX migration:
\r
3760 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
3762 * @param ch code point whose title case is to be retrieved
\r
3763 * @return titlecase code point
\r
// Thin wrapper: the mapping is done by UCaseProps.INSTANCE.totitle(ch).
3766 public static int toTitleCase(int ch) {
\r
3767 return UCaseProps.INSTANCE.totitle(ch);
\r
3771 * Converts the character argument to uppercase.
\r
3772 * If no uppercase is available, the character itself is returned.
\r
3773 * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
\r
3775 * <p>This function only returns the simple, single-code point case mapping.
\r
3776 * Full case mappings should be used whenever possible because they produce
\r
3777 * better results by working on whole strings.
\r
3778 * They take into account the string context and the language and can map
\r
3779 * to a result string with a different length as appropriate.
\r
3780 * Full case mappings are applied by the case mapping functions
\r
3781 * that take String parameters rather than code points (int).
\r
3782 * See also the User Guide chapter on C/POSIX migration:
\r
3783 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
3785 * @param ch code point whose uppercase is to be retrieved
\r
3786 * @return uppercase code point
\r
// Thin wrapper: the mapping is done by UCaseProps.INSTANCE.toupper(ch).
3789 public static int toUpperCase(int ch) {
\r
3790 return UCaseProps.INSTANCE.toupper(ch);
\r
3793 // extra methods not in java.lang.Character --------------------------
\r
3796 * {@icu} Determines if the code point is a supplementary character.
\r
3797 * A code point is a supplementary character if and only if it is greater
\r
3798 * than or equal to <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
\r
3799 * @param ch code point to be determined if it is in the supplementary
\r
3801 * @return true if code point is a supplementary character
\r
// Inclusive range test: SUPPLEMENTARY_MIN_VALUE <= ch <= MAX_VALUE.
3804 public static boolean isSupplementary(int ch)
\r
3806 return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE &&
\r
3807 ch <= UCharacter.MAX_VALUE;
\r
3811 * {@icu} Determines if the code point is in the BMP plane.
\r
3812 * @param ch code point to be determined if it is not a supplementary
\r
3814 * @return true if code point is not a supplementary character
\r
// Inclusive range test: 0 <= ch <= LAST_CHAR_MASK_. Negative inputs are rejected.
// NOTE(review): LAST_CHAR_MASK_ is declared elsewhere; presumably 0xFFFF — confirm.
3817 public static boolean isBMP(int ch)
\r
3819 return (ch >= 0 && ch <= LAST_CHAR_MASK_);
\r
3823 * {@icu} Determines whether the specified code point is a printable character
\r
3824 * according to the Unicode standard.
\r
3825 * @param ch code point to be determined if it is printable
\r
3826 * @return true if the code point is a printable character
\r
// Exclusion test: a code point is printable unless its category is one of the six
// listed below (unassigned, control, format, private use, surrogate, general other).
3829 public static boolean isPrintable(int ch)
\r
3831 int cat = getType(ch);
\r
3832 // if props == 0, it will just fall through and return false
\r
3833 return (cat != UCharacterCategory.UNASSIGNED &&
\r
3834 cat != UCharacterCategory.CONTROL &&
\r
3835 cat != UCharacterCategory.FORMAT &&
\r
3836 cat != UCharacterCategory.PRIVATE_USE &&
\r
3837 cat != UCharacterCategory.SURROGATE &&
\r
3838 cat != UCharacterCategory.GENERAL_OTHER_TYPES);
\r
3842 * {@icu} Determines whether the specified code point is of base form.
\r
3843 * A code point of base form does not graphically combine with preceding
\r
3844 * characters, and is neither a control nor a format character.
\r
3845 * @param ch code point to be determined if it is of base form
\r
3846 * @return true if the code point is of base form
\r
// Inclusion test: true when the category of ch is one of the number, letter or mark
// categories listed below; every other category yields false.
3849 public static boolean isBaseForm(int ch)
\r
3851 int cat = getType(ch);
\r
3852 // if props == 0, it will just fall through and return false
\r
3853 return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
\r
3854 cat == UCharacterCategory.OTHER_NUMBER ||
\r
3855 cat == UCharacterCategory.LETTER_NUMBER ||
\r
3856 cat == UCharacterCategory.UPPERCASE_LETTER ||
\r
3857 cat == UCharacterCategory.LOWERCASE_LETTER ||
\r
3858 cat == UCharacterCategory.TITLECASE_LETTER ||
\r
3859 cat == UCharacterCategory.MODIFIER_LETTER ||
\r
3860 cat == UCharacterCategory.OTHER_LETTER ||
\r
3861 cat == UCharacterCategory.NON_SPACING_MARK ||
\r
3862 cat == UCharacterCategory.ENCLOSING_MARK ||
\r
3863 cat == UCharacterCategory.COMBINING_SPACING_MARK;
\r
3867 * {@icu} Returns the Bidirection property of a code point.
\r
3868 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
\r
3870 * Result returned belongs to the interface
\r
3871 * <a href=UCharacterDirection.html>UCharacterDirection</a>
\r
3872 * @param ch the code point to be determined its direction
\r
3873 * @return direction constant from UCharacterDirection.
\r
3876 public static int getDirection(int ch)
\r
3878 return UBiDiProps.INSTANCE.getClass(ch);
\r
3882 * Determines whether the code point has the "mirrored" property.
\r
3883 * This property is set for characters that are commonly used in
\r
3884 * Right-To-Left contexts and need to be displayed with a "mirrored"
\r
3886 * @param ch code point whose mirror is to be determined
\r
3887 * @return true if the code point has the "mirrored" property
\r
3890 public static boolean isMirrored(int ch)
\r
3892 return UBiDiProps.INSTANCE.isMirrored(ch);
\r
3896 * {@icu} Maps the specified code point to a "mirror-image" code point.
\r
3897 * For code points with the "mirrored" property, implementations sometimes
\r
3898 * need a "poor man's" mapping to another code point such that the default
\r
3899 * glyph may serve as the mirror-image of the default glyph of the
\r
3900 * specified code point.<br>
\r
3901 * This is useful for text conversion to and from codepages with visual
\r
3902 * order, and for displays without glyph selection capabilities.
\r
3903 * @param ch code point whose mirror is to be retrieved
\r
3904 * @return another code point that may serve as a mirror-image substitute,
\r
3905 * or ch itself if there is no such mapping or ch does not have the
\r
3906 * "mirrored" property
\r
3909 public static int getMirror(int ch)
\r
3911 return UBiDiProps.INSTANCE.getMirror(ch);
\r
3915 * {@icu} Returns the combining class of the argument codepoint
\r
3916 * @param ch code point whose combining is to be retrieved
\r
3917 * @return the combining class of the codepoint
\r
3920 public static int getCombiningClass(int ch)
\r
3922 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
3923 throw new IllegalArgumentException("Codepoint out of bounds");
\r
3925 Normalizer2Impl impl = Norm2AllModes.getNFCInstance().impl;
\r
3926 return impl.getCC(impl.getNorm16(ch));
\r
3930 * {@icu} A code point is illegal if and only if
\r
3932 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
\r
3933 * <li> A surrogate value, 0xD800 to 0xDFFF
\r
3934 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
\r
3936 * Note: legal does not mean that it is assigned in this version of Unicode.
\r
3937 * @param ch code point to determine if it is a legal code point by itself
\r
3938 * @return true if and only if legal.
\r
3941 public static boolean isLegal(int ch)
\r
3943 if (ch < MIN_VALUE) {
\r
3946 if (ch < UTF16.SURROGATE_MIN_VALUE) {
\r
3949 if (ch <= UTF16.SURROGATE_MAX_VALUE) {
\r
3952 if (UCharacterUtility.isNonCharacter(ch)) {
\r
3955 return (ch <= MAX_VALUE);
\r
3959 * {@icu} A string is legal iff all its code points are legal.
\r
3960 * A code point is illegal if and only if
\r
3962 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
\r
3963 * <li> A surrogate value, 0xD800 to 0xDFFF
\r
3964 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
\r
3966 * Note: legal does not mean that it is assigned in this version of Unicode.
\r
3967 * @param str containing code points to examin
\r
3968 * @return true if and only if legal.
\r
3971 public static boolean isLegal(String str)
\r
3973 int size = str.length();
\r
3975 for (int i = 0; i < size; i ++)
\r
3977 codepoint = UTF16.charAt(str, i);
\r
3978 if (!isLegal(codepoint)) {
\r
3981 if (isSupplementary(codepoint)) {
\r
3989 * {@icu} Returns the version of Unicode data used.
\r
3990 * @return the unicode version number used
\r
3993 public static VersionInfo getUnicodeVersion()
\r
3995 return UCharacterProperty.INSTANCE.m_unicodeVersion_;
\r
3999 * {@icu} Returns the most current Unicode name of the argument code point, or
\r
4000 * null if the character is unassigned or outside the range
\r
4001 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
\r
4003 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4004 * incurs a one-time initialisation cost to construct the name tables.
\r
4005 * @param ch the code point for which to get the name
\r
4006 * @return most current Unicode name
\r
4009 public static String getName(int ch)
\r
4011 return UCharacterName.INSTANCE.getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
4015 * {@icu} Returns the names for each of the characters in a string
\r
4016 * @param s string to format
\r
4017 * @param separator string to go between names
\r
4018 * @return string of names
\r
4021 public static String getName(String s, String separator) {
\r
4022 if (s.length() == 1) { // handle common case
\r
4023 return getName(s.charAt(0));
\r
4026 StringBuilder sb = new StringBuilder();
\r
4027 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
\r
4028 cp = UTF16.charAt(s,i);
\r
4029 if (i != 0) sb.append(separator);
\r
4030 sb.append(UCharacter.getName(cp));
\r
4032 return sb.toString();
\r
4036 * {@icu} Returns the earlier version 1.0 Unicode name of the argument code
\r
4037 * point, or null if the character is unassigned or outside the range
\r
4038 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
\r
4040 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4041 * incurs a one-time initialisation cost to construct the name tables.
\r
4042 * @param ch the code point for which to get the name
\r
4043 * @return version 1.0 Unicode name
\r
4046 public static String getName1_0(int ch)
\r
4048 return UCharacterName.INSTANCE.getName(ch,
\r
4049 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
4053 * {@icu} Returns a name for a valid codepoint. Unlike, getName(int) and
\r
4054 * getName1_0(int), this method will return a name even for codepoints that
\r
4055 * are not assigned a name in UnicodeData.txt.
\r
4057 * The names are returned in the following order.
\r
4059 * <li> Most current Unicode name if there is any
\r
4060 * <li> Unicode 1.0 name if there is any
\r
4061 * <li> Extended name in the form of
\r
4062 * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-fffe>
\r
4064 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4065 * incurs a one-time initialisation cost to construct the name tables.
\r
4066 * @param ch the code point for which to get the name
\r
4067 * @return a name for the argument codepoint
\r
4070 public static String getExtendedName(int ch) {
\r
4071 return UCharacterName.INSTANCE.getName(ch, UCharacterNameChoice.EXTENDED_CHAR_NAME);
\r
4075 * {@icu} Returns the corrected name from NameAliases.txt if there is one.
\r
4076 * Returns null if the character is unassigned or outside the range
\r
4077 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
\r
4079 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4080 * incurs a one-time initialisation cost to construct the name tables.
\r
4081 * @param ch the code point for which to get the name alias
\r
4082 * @return Unicode name alias, or null
\r
4084 * @provisional This API might change or be removed in a future release.
\r
4086 public static String getNameAlias(int ch)
\r
4088 return UCharacterName.INSTANCE.getName(ch, UCharacterNameChoice.CHAR_NAME_ALIAS);
\r
4092 * {@icu} Returns the ISO 10646 comment for a character.
\r
4093 * The ISO 10646 comment is an informative field in the Unicode Character
\r
4094 * Database (UnicodeData.txt field 11) and is from the ISO 10646 names list.
\r
4096 * Note: Unicode 5.2 removes all ISO comment data, resulting in empty strings
\r
4097 * returned for all characters.
\r
4099 * @param ch The code point for which to get the ISO comment.
\r
4100 * It must be the case that {@code 0 <= ch <= 0x10ffff}.
\r
4101 * @return The ISO comment, or null if there is no comment for this
\r
4105 public static String getISOComment(int ch)
\r
4107 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE) {
\r
4111 String result = UCharacterName.INSTANCE.getGroupName(ch,
\r
4112 UCharacterNameChoice.ISO_COMMENT_);
\r
4117 * {@icu} <p>Finds a Unicode code point by its most current Unicode name and
\r
4118 * return its code point value. All Unicode names are in uppercase.</p>
\r
4119 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4120 * incurs a one-time initialisation cost to construct the name tables.
\r
4121 * @param name most current Unicode character name whose code point is to
\r
4123 * @return code point or -1 if name is not found
\r
4126 public static int getCharFromName(String name){
\r
4127 return UCharacterName.INSTANCE.getCharFromName(
\r
4128 UCharacterNameChoice.UNICODE_CHAR_NAME, name);
\r
4132 * {@icu} <p>Find a Unicode character by its version 1.0 Unicode name and return
\r
4133 * its code point value. All Unicode names are in uppercase.</p>
\r
4134 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4135 * incurs a one-time initialisation cost to construct the name tables.
\r
4136 * @param name Unicode 1.0 code point name whose code point is to
\r
4138 * @return code point or -1 if name is not found
\r
4141 public static int getCharFromName1_0(String name){
\r
4142 return UCharacterName.INSTANCE.getCharFromName(
\r
4143 UCharacterNameChoice.UNICODE_10_CHAR_NAME, name);
\r
4147 * {@icu} <p>Find a Unicode character by either its name and return its code
\r
4148 * point value. All Unicode names are in uppercase.
\r
4149 * Extended names are all lowercase except for numbers and are contained
\r
4150 * within angle brackets.</p>
\r
4151 * The names are searched in the following order
\r
4153 * <li> Most current Unicode name if there is any
\r
4154 * <li> Unicode 1.0 name if there is any
\r
4155 * <li> Extended name in the form of
\r
4156 * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-FFFE>
\r
4158 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4159 * incurs a one-time initialisation cost to construct the name tables.
\r
4160 * @param name codepoint name
\r
4161 * @return code point associated with the name or -1 if the name is not
\r
4165 public static int getCharFromExtendedName(String name){
\r
4166 return UCharacterName.INSTANCE.getCharFromName(
\r
4167 UCharacterNameChoice.EXTENDED_CHAR_NAME, name);
\r
4171 * {@icu} <p>Find a Unicode character by its corrected name alias and return
\r
4172 * its code point value. All Unicode names are in uppercase.</p>
\r
4173 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4174 * incurs a one-time initialisation cost to construct the name tables.
\r
4175 * @param name Unicode name alias whose code point is to be returned
\r
4176 * @return code point or -1 if name is not found
\r
4178 * @provisional This API might change or be removed in a future release.
\r
4180 public static int getCharFromNameAlias(String name){
\r
4181 return UCharacterName.INSTANCE.getCharFromName(UCharacterNameChoice.CHAR_NAME_ALIAS, name);
\r
4185 * {@icu} Return the Unicode name for a given property, as given in the
\r
4186 * Unicode database file PropertyAliases.txt. Most properties
\r
4187 * have more than one name. The nameChoice determines which one
\r
4190 * In addition, this function maps the property
\r
4191 * UProperty.GENERAL_CATEGORY_MASK to the synthetic names "gcm" /
\r
4192 * "General_Category_Mask". These names are not in
\r
4193 * PropertyAliases.txt.
\r
4195 * @param property UProperty selector.
\r
4197 * @param nameChoice UProperty.NameChoice selector for which name
\r
4198 * to get. All properties have a long name. Most have a short
\r
4199 * name, but some do not. Unicode allows for additional names; if
\r
4200 * present these will be returned by UProperty.NameChoice.LONG + i,
\r
4201 * where i=1, 2,...
\r
4203 * @return a name, or null if Unicode explicitly defines no name
\r
4204 * ("n/a") for a given property/nameChoice. If a given nameChoice
\r
4205 * throws an exception, then all larger values of nameChoice will
\r
4206 * throw an exception. If null is returned for a given
\r
4207 * nameChoice, then other nameChoice values may return non-null
\r
4210 * @exception IllegalArgumentException thrown if property or
\r
4211 * nameChoice are invalid.
\r
4214 * @see UProperty.NameChoice
\r
4217 public static String getPropertyName(int property,
\r
4219 return UPropertyAliases.INSTANCE.getPropertyName(property, nameChoice);
\r
4223 * {@icu} Return the UProperty selector for a given property name, as
\r
4224 * specified in the Unicode database file PropertyAliases.txt.
\r
4225 * Short, long, and any other variants are recognized.
\r
4227 * In addition, this function maps the synthetic names "gcm" /
\r
4228 * "General_Category_Mask" to the property
\r
4229 * UProperty.GENERAL_CATEGORY_MASK. These names are not in
\r
4230 * PropertyAliases.txt.
\r
4232 * @param propertyAlias the property name to be matched. The name
\r
4233 * is compared using "loose matching" as described in
\r
4234 * PropertyAliases.txt.
\r
4236 * @return a UProperty enum.
\r
4238 * @exception IllegalArgumentException thrown if propertyAlias
\r
4239 * is not recognized.
\r
4244 public static int getPropertyEnum(String propertyAlias) {
\r
4245 int propEnum = UPropertyAliases.INSTANCE.getPropertyEnum(propertyAlias);
\r
4246 if (propEnum == UProperty.UNDEFINED) {
\r
4247 throw new IllegalIcuArgumentException("Invalid name: " + propertyAlias);
\r
4253 * {@icu} Return the Unicode name for a given property value, as given in
\r
4254 * the Unicode database file PropertyValueAliases.txt. Most
\r
4255 * values have more than one name. The nameChoice determines
\r
4256 * which one is returned.
\r
4258 * Note: Some of the names in PropertyValueAliases.txt can only be
\r
4259 * retrieved using UProperty.GENERAL_CATEGORY_MASK, not
\r
4260 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
\r
4261 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
\r
4262 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
\r
4264 * @param property UProperty selector constant.
\r
4265 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
\r
4266 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
\r
4267 * UProperty.MASK_START < = property < UProperty.MASK_LIMIT.
\r
4268 * If out of range, null is returned.
\r
4270 * @param value selector for a value for the given property. In
\r
4271 * general, valid values range from 0 up to some maximum. There
\r
4272 * are a few exceptions: (1.) UProperty.BLOCK values begin at the
\r
4273 * non-zero value BASIC_LATIN.getID(). (2.)
\r
4274 * UProperty.CANONICAL_COMBINING_CLASS values are not contiguous
\r
4275 * and range from 0..240. (3.) UProperty.GENERAL_CATEGORY_MASK values
\r
4276 * are mask values produced by left-shifting 1 by
\r
4277 * UCharacter.getType(). This allows grouped categories such as
\r
4278 * [:L:] to be represented. Mask values are non-contiguous.
\r
4280 * @param nameChoice UProperty.NameChoice selector for which name
\r
4281 * to get. All values have a long name. Most have a short name,
\r
4282 * but some do not. Unicode allows for additional names; if
\r
4283 * present these will be returned by UProperty.NameChoice.LONG + i,
\r
4284 * where i=1, 2,...
\r
4286 * @return a name, or null if Unicode explicitly defines no name
\r
4287 * ("n/a") for a given property/value/nameChoice. If a given
\r
4288 * nameChoice throws an exception, then all larger values of
\r
4289 * nameChoice will throw an exception. If null is returned for a
\r
4290 * given nameChoice, then other nameChoice values may return
\r
4291 * non-null results.
\r
4293 * @exception IllegalArgumentException thrown if property, value,
\r
4294 * or nameChoice are invalid.
\r
4297 * @see UProperty.NameChoice
\r
4300 public static String getPropertyValueName(int property,
\r
4304 if ((property == UProperty.CANONICAL_COMBINING_CLASS
\r
4305 || property == UProperty.LEAD_CANONICAL_COMBINING_CLASS
\r
4306 || property == UProperty.TRAIL_CANONICAL_COMBINING_CLASS)
\r
4307 && value >= UCharacter.getIntPropertyMinValue(
\r
4308 UProperty.CANONICAL_COMBINING_CLASS)
\r
4309 && value <= UCharacter.getIntPropertyMaxValue(
\r
4310 UProperty.CANONICAL_COMBINING_CLASS)
\r
4311 && nameChoice >= 0 && nameChoice < UProperty.NameChoice.COUNT) {
\r
4312 // this is hard coded for the valid cc
\r
4313 // because PropertyValueAliases.txt does not contain all of them
\r
4315 return UPropertyAliases.INSTANCE.getPropertyValueName(property, value,
\r
4318 catch (IllegalArgumentException e) {
\r
4322 return UPropertyAliases.INSTANCE.getPropertyValueName(property, value, nameChoice);
\r
4326 * {@icu} Return the property value integer for a given value name, as
\r
4327 * specified in the Unicode database file PropertyValueAliases.txt.
\r
4328 * Short, long, and any other variants are recognized.
\r
4330 * Note: Some of the names in PropertyValueAliases.txt will only be
\r
4331 * recognized with UProperty.GENERAL_CATEGORY_MASK, not
\r
4332 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
\r
4333 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
\r
4334 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
\r
4336 * @param property UProperty selector constant.
\r
4337 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
\r
4338 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
\r
4339 * UProperty.MASK_START < = property < UProperty.MASK_LIMIT.
\r
4340 * Only these properties can be enumerated.
\r
4342 * @param valueAlias the value name to be matched. The name is
\r
4343 * compared using "loose matching" as described in
\r
4344 * PropertyValueAliases.txt.
\r
4346 * @return a value integer. Note: UProperty.GENERAL_CATEGORY
\r
4347 * values are mask values produced by left-shifting 1 by
\r
4348 * UCharacter.getType(). This allows grouped categories such as
\r
4349 * [:L:] to be represented.
\r
4352 * @throws IllegalArgumentException if property is not a valid UProperty
\r
4356 public static int getPropertyValueEnum(int property, String valueAlias) {
\r
4357 int propEnum = UPropertyAliases.INSTANCE.getPropertyValueEnum(property, valueAlias);
\r
4358 if (propEnum == UProperty.UNDEFINED) {
\r
4359 throw new IllegalIcuArgumentException("Invalid name: " + valueAlias);
\r
4365 * {@icu} Returns a code point corresponding to the two UTF16 characters.
\r
4366 * @param lead the lead char
\r
4367 * @param trail the trail char
\r
4368 * @return code point if surrogate characters are valid.
\r
4369 * @exception IllegalArgumentException thrown when argument characters do
\r
4370 * not form a valid codepoint
\r
4373 public static int getCodePoint(char lead, char trail)
\r
4375 if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
\r
4376 return UCharacterProperty.getRawSupplementary(lead, trail);
\r
4378 throw new IllegalArgumentException("Illegal surrogate characters");
\r
4382 * {@icu} Returns the code point corresponding to the UTF16 character.
\r
4383 * @param char16 the UTF16 character
\r
4384 * @return code point if argument is a valid character.
\r
4385 * @exception IllegalArgumentException thrown when char16 is not a valid
\r
4389 public static int getCodePoint(char char16)
\r
4391 if (UCharacter.isLegal(char16)) {
\r
4394 throw new IllegalArgumentException("Illegal codepoint");
\r
4398 * Implementation of UCaseProps.ContextIterator, iterates over a String.
\r
4399 * See ustrcase.c/utf16_caseContextIterator().
\r
4401 private static class StringContextIterator implements UCaseProps.ContextIterator {
\r
4404 * @param s String to iterate over.
\r
4406 StringContextIterator(String s) {
\r
4409 cpStart=cpLimit=index=0;
\r
4414 * Set the iteration limit for nextCaseMapCP() to an index within the string.
\r
4415 * If the limit parameter is negative or past the string, then the
\r
4416 * string length is restored as the iteration limit.
\r
4418 * This limit does not affect the next() function which always
\r
4419 * iterates to the very end of the string.
\r
4421 * @param lim The iteration limit.
\r
4423 public void setLimit(int lim) {
\r
4424 if(0<=lim && lim<=s.length()) {
\r
4432 * Move to the iteration limit without fetching code points up to there.
\r
4434 public void moveToLimit() {
\r
4435 cpStart=cpLimit=limit;
\r
4439 * Iterate forward through the string to fetch the next code point
\r
4440 * to be case-mapped, and set the context indexes for it.
\r
4441 * Performance optimization, to save on function calls and redundant
\r
4442 * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
\r
4444 * When the iteration limit is reached (and -1 is returned),
\r
4445 * getCPStart() will be at the iteration limit.
\r
4447 * Iteration with next() does not affect the position for nextCaseMapCP().
\r
4449 * @return The next code point to be case-mapped, or <0 when the iteration is done.
\r
4451 public int nextCaseMapCP() {
\r
4453 if(cpLimit<limit) {
\r
4454 int c=s.charAt(cpLimit++);
\r
4455 if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
4457 if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
\r
4458 UTF16.TRAIL_SURROGATE_MIN_VALUE<=(c2=s.charAt(cpLimit)) &&
\r
4459 c2<=UTF16.TRAIL_SURROGATE_MAX_VALUE
\r
4461 // supplementary code point
\r
4463 c=UCharacterProperty.getRawSupplementary((char)c, c2);
\r
4464 // else unpaired surrogate code point
\r
4466 // else BMP code point
\r
4475 * Returns the start of the code point that was last returned
\r
4476 * by nextCaseMapCP().
\r
4478 public int getCPStart() {
\r
4483 * Returns the limit of the code point that was last returned
\r
4484 * by nextCaseMapCP().
\r
4486 public int getCPLimit() {
\r
4490 // implement UCaseProps.ContextIterator
\r
4491 // The following code is not used anywhere in this private class
\r
4492 public void reset(int direction) {
\r
4494 /* reset for forward iteration */
\r
4497 } else if(direction<0) {
\r
4498 /* reset for backward iteration */
\r
4502 // not a valid direction
\r
4508 public int next() {
\r
4511 if(dir>0 && index<s.length()) {
\r
4512 c=UTF16.charAt(s, index);
\r
4513 index+=UTF16.getCharCount(c);
\r
4515 } else if(dir<0 && index>0) {
\r
4516 c=UTF16.charAt(s, index-1);
\r
4517 index-=UTF16.getCharCount(c);
\r
4524 protected String s;
\r
4525 protected int index, limit, cpStart, cpLimit;
\r
4526 protected int dir; // 0=initial state >0=forward <0=backward
\r
4530 * Returns the uppercase version of the argument string.
\r
4531 * Casing is dependent on the default locale and context-sensitive.
\r
4532 * @param str source string to be performed on
\r
4533 * @return uppercase version of the argument string
\r
4536 public static String toUpperCase(String str)
\r
4538 return toUpperCase(ULocale.getDefault(), str);
\r
4542 * Returns the lowercase version of the argument string.
\r
4543 * Casing is dependent on the default locale and context-sensitive
\r
4544 * @param str source string to be performed on
\r
4545 * @return lowercase version of the argument string
\r
4548 public static String toLowerCase(String str)
\r
4550 return toLowerCase(ULocale.getDefault(), str);
\r
4554 * <p>Returns the titlecase version of the argument string.</p>
\r
4555 * <p>Position for titlecasing is determined by the argument break
\r
4556 * iterator, hence the user can customize his break iterator for
\r
4557 * a specialized titlecasing. In this case only the forward iteration
\r
4558 * needs to be implemented.
\r
4559 * If the break iterator passed in is null, the default Unicode algorithm
\r
4560 * will be used to determine the titlecase positions.
\r
4562 * <p>Only positions returned by the break iterator will be title cased,
\r
4563 * character in between the positions will all be in lower case.</p>
\r
4564 * <p>Casing is dependent on the default locale and context-sensitive</p>
\r
4565 * @param str source string to be performed on
\r
4566 * @param breakiter break iterator to determine the positions in which
\r
4567 * the character should be title cased.
\r
4568 * @return lowercase version of the argument string
\r
4571 public static String toTitleCase(String str, BreakIterator breakiter)
\r
4573 return toTitleCase(ULocale.getDefault(), str, breakiter);
\r
4577 * Returns the uppercase version of the argument string.
\r
4578 * Casing is dependent on the argument locale and context-sensitive.
\r
4579 * @param locale which string is to be converted in
\r
4580 * @param str source string to be performed on
\r
4581 * @return uppercase version of the argument string
\r
4584 public static String toUpperCase(Locale locale, String str)
\r
4586 return toUpperCase(ULocale.forLocale(locale), str);
\r
4590 * Returns the uppercase version of the argument string.
\r
4591 * Casing is dependent on the argument locale and context-sensitive.
\r
4592 * @param locale which string is to be converted in
\r
4593 * @param str source string to be performed on
\r
4594 * @return uppercase version of the argument string
\r
4597 public static String toUpperCase(ULocale locale, String str) {
\r
4598 StringContextIterator iter = new StringContextIterator(str);
\r
4599 StringBuffer result = new StringBuffer(str.length());
\r
4600 int[] locCache = new int[1];
\r
4603 if (locale == null) {
\r
4604 locale = ULocale.getDefault();
\r
4608 while((c=iter.nextCaseMapCP())>=0) {
\r
4609 c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache);
\r
4611 /* decode the result */
\r
4613 /* (not) original code point */
\r
4615 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
4616 /* mapping already appended to result */
\r
4618 /* } else { append single-code point mapping */
\r
4621 result.append((char)c);
\r
4623 UTF16.append(result, c);
\r
4626 return result.toString();
\r
4630 * Returns the lowercase version of the argument string.
\r
4631 * Casing is dependent on the argument locale and context-sensitive
\r
4632 * @param locale which string is to be converted in
\r
4633 * @param str source string to be performed on
\r
4634 * @return lowercase version of the argument string
\r
4637 public static String toLowerCase(Locale locale, String str)
\r
4639 return toLowerCase(ULocale.forLocale(locale), str);
\r
4643 * Returns the lowercase version of the argument string.
\r
4644 * Casing is dependent on the argument locale and context-sensitive
\r
4645 * @param locale which string is to be converted in
\r
4646 * @param str source string to be performed on
\r
4647 * @return lowercase version of the argument string
\r
4650 public static String toLowerCase(ULocale locale, String str) {
\r
4651 StringContextIterator iter = new StringContextIterator(str);
\r
4652 StringBuffer result = new StringBuffer(str.length());
\r
4653 int[] locCache = new int[1];
\r
4656 if (locale == null) {
\r
4657 locale = ULocale.getDefault();
\r
4661 while((c=iter.nextCaseMapCP())>=0) {
\r
4662 c = UCaseProps.INSTANCE.toFullLower(c, iter, result, locale, locCache);
\r
4664 /* decode the result */
\r
4666 /* (not) original code point */
\r
4668 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
4669 /* mapping already appended to result */
\r
4671 /* } else { append single-code point mapping */
\r
4674 result.append((char)c);
\r
4676 UTF16.append(result, c);
\r
4679 return result.toString();
\r
4683 * <p>Returns the titlecase version of the argument string.</p>
\r
4684 * <p>Position for titlecasing is determined by the argument break
\r
4685 * iterator, hence the user can customize his break iterator for
\r
4686 * a specialized titlecasing. In this case only the forward iteration
\r
4687 * needs to be implemented.
\r
4688 * If the break iterator passed in is null, the default Unicode algorithm
\r
4689 * will be used to determine the titlecase positions.
\r
4691 * <p>Only positions returned by the break iterator will be title cased,
\r
4692 * character in between the positions will all be in lower case.</p>
\r
4693 * <p>Casing is dependent on the argument locale and context-sensitive</p>
\r
4694 * @param locale which string is to be converted in
\r
4695 * @param str source string to be performed on
\r
4696 * @param breakiter break iterator to determine the positions in which
\r
4697 * the character should be title cased.
\r
4698 * @return lowercase version of the argument string
\r
4701 public static String toTitleCase(Locale locale, String str,
\r
4702 BreakIterator breakiter)
\r
4704 return toTitleCase(ULocale.forLocale(locale), str, breakiter);
\r
4708 * <p>Returns the titlecase version of the argument string.</p>
\r
4709 * <p>Position for titlecasing is determined by the argument break
\r
4710 * iterator, hence the user can customize his break iterator for
\r
4711 * a specialized titlecasing. In this case only the forward iteration
\r
4712 * needs to be implemented.
\r
4713 * If the break iterator passed in is null, the default Unicode algorithm
\r
4714 * will be used to determine the titlecase positions.
\r
4716 * <p>Only positions returned by the break iterator will be title cased,
\r
4717 * character in between the positions will all be in lower case.</p>
\r
4718 * <p>Casing is dependent on the argument locale and context-sensitive</p>
\r
4719 * @param locale which string is to be converted in
\r
4720 * @param str source string to be performed on
\r
4721 * @param titleIter break iterator to determine the positions in which
\r
4722 * the character should be title cased.
\r
4723 * @return lowercase version of the argument string
\r
4726 public static String toTitleCase(ULocale locale, String str,
\r
4727 BreakIterator titleIter) {
\r
4728 return toTitleCase(locale, str, titleIter, 0);
\r
4732 * <p>Returns the titlecase version of the argument string.</p>
\r
4733 * <p>Position for titlecasing is determined by the argument break
\r
4734 * iterator, hence the user can customize his break iterator for
\r
4735 * a specialized titlecasing. In this case only the forward iteration
\r
4736 * needs to be implemented.
\r
4737 * If the break iterator passed in is null, the default Unicode algorithm
\r
4738 * will be used to determine the titlecase positions.
\r
4740 * <p>Only positions returned by the break iterator will be title cased,
\r
4741 * character in between the positions will all be in lower case.</p>
\r
4742 * <p>Casing is dependent on the argument locale and context-sensitive</p>
\r
4743 * @param locale which string is to be converted in
\r
4744 * @param str source string to be performed on
\r
4745 * @param titleIter break iterator to determine the positions in which
\r
4746 * the character should be title cased.
\r
4747 * @param options bit set to modify the titlecasing operation
\r
4748 * @return titlecase version of the argument string
\r
4750 * @see #TITLECASE_NO_LOWERCASE
\r
4751 * @see #TITLECASE_NO_BREAK_ADJUSTMENT
\r
public static String toTitleCase(ULocale locale, String str,
                                 BreakIterator titleIter,
                                 int options) {
    // Iterator that hands out the code points of str to the case-mapping calls;
    // its limit is moved to the end of each segment inside the loop.
    StringContextIterator iter = new StringContextIterator(str);
    StringBuffer result = new StringBuffer(str.length());
    // One-element scratch array passed to the UCaseProps full-mapping calls.
    int[] locCache = new int[1];
    int c, nc, srcLength = str.length();

    // A null locale falls back to the default ULocale.
    if (locale == null) {
        locale = ULocale.getDefault();
    }
    locCache[0]=0;

    // A null break iterator falls back to a word break iterator for the locale.
    if(titleIter == null) {
        titleIter = BreakIterator.getWordInstance(locale);
    }
    titleIter.setText(str);

    int prev, titleStart, index;
    boolean isFirstIndex;
    // Dutch gets special treatment: a J directly after a titlecased I is uppercased too.
    boolean isDutch = locale.getLanguage().equals("nl");
    boolean FirstIJ = true;

    /* set up local variables */
    prev=0;
    isFirstIndex=true;

    /* titlecasing loop: one pass per segment [prev..index[ reported by titleIter */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=false;
            index=titleIter.first();
        } else {
            index=titleIter.next();
        }
        // Clamp an exhausted or out-of-range iterator result to the end of the string.
        if(index==BreakIterator.DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)   [titleStart..titleLimit[
         * c) subsequent characters (lowercase) [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy uncased characters [prev..titleStart[ */
            iter.setLimit(index);
            c=iter.nextCaseMapCP();
            // Unless TITLECASE_NO_BREAK_ADJUSTMENT is set, skip leading uncased
            // code points so that the first cased one is the one titlecased.
            if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0
                    && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
                while((c=iter.nextCaseMapCP())>=0
                        && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
                titleStart=iter.getCPStart();
                if(prev<titleStart) {
                    result.append(str, prev, titleStart);
                }
            } else {
                titleStart=prev;
            }

            if(titleStart<index) {
                // Re-arm the Dutch IJ handling for this segment.
                FirstIJ = true;
                /* titlecase c which is from titleStart */
                c = UCaseProps.INSTANCE.toFullTitle(c, iter, result, locale, locCache);

                /* decode the result and lowercase up to index */
                for(;;) {
                    if(c<0) {
                        /* (not) original code point: the bitwise complement is appended */
                        c=~c;
                        if(c<=0xffff) {
                            result.append((char)c);
                        } else {
                            UTF16.append(result, c);
                        }
                    } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
                        /* mapping already appended to result */
                    } else {
                        /* append single-code point mapping */
                        if(c<=0xffff) {
                            result.append((char)c);
                        } else {
                            UTF16.append(result, c);
                        }
                    }

                    if((options&TITLECASE_NO_LOWERCASE)!=0) {
                        /* Optionally just copy the rest of the word unchanged. */

                        int titleLimit=iter.getCPLimit();
                        if(titleLimit<index) {
                            // TODO: With Java 5, this would want to be
                            // result.append(str, titleLimit, index);
                            String appendStr = str.substring(titleLimit,index);
                            /* Special Case - Dutch IJ Titlecasing */
                            if ( isDutch && c == 0x0049 && appendStr.startsWith("j")) {
                                appendStr = "J" + appendStr.substring(1);
                            }
                            result.append(appendStr);
                        }
                        iter.moveToLimit();
                        break;
                    } else if((nc=iter.nextCaseMapCP())>=0) {
                        // Dutch: the first J/j right after a titlecased I becomes 'J';
                        // it is emitted by the decode step at the top of the next pass.
                        if (isDutch && (nc == 0x004A ||  nc == 0x006A)
                            && (c == 0x0049) && (FirstIJ == true)) {
                            c = 0x004A; /* J */
                            FirstIJ = false;
                        } else {
                            /* Normal operation: Lowercase the rest of the word. */
                            c = UCaseProps.INSTANCE.toFullLower(nc, iter, result, locale,
                                                                locCache);
                        }
                    } else {
                        // No more code points in this segment.
                        break;
                    }
                }
            }
        }

        // The next segment starts where this one ended.
        prev=index;
    }
    return result.toString();
}
\r
4887 * {@icu} The given character is mapped to its case folding equivalent according
\r
4888 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
\r
4889 * folding equivalent, the character itself is returned.
\r
4891 * <p>This function only returns the simple, single-code point case mapping.
\r
4892 * Full case mappings should be used whenever possible because they produce
\r
4893 * better results by working on whole strings.
\r
4894 * They can map to a result string with a different length as appropriate.
\r
4895 * Full case mappings are applied by the case mapping functions
\r
4896 * that take String parameters rather than code points (int).
\r
4897 * See also the User Guide chapter on C/POSIX migration:
\r
4898 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
4900 * @param ch the character to be converted
\r
4901 * @param defaultmapping Indicates if all mappings defined in
\r
4902 * CaseFolding.txt is to be used, otherwise the
\r
4903 * mappings for dotted I and dotless i marked with
\r
4904 * 'I' in CaseFolding.txt will be skipped.
\r
4905 * @return the case folding equivalent of the character, if
\r
4906 * any; otherwise the character itself.
\r
4907 * @see #foldCase(String, boolean)
\r
4910 public static int foldCase(int ch, boolean defaultmapping) {
\r
4911 return foldCase(ch, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
4915 * {@icu} The given string is mapped to its case folding equivalent according to
\r
4916 * UnicodeData.txt and CaseFolding.txt; if any character has no case
\r
4917 * folding equivalent, the character itself is returned.
\r
4918 * "Full", multiple-code point case folding mappings are returned here.
\r
4919 * For "simple" single-code point mappings use the API
\r
4920 * foldCase(int ch, boolean defaultmapping).
\r
4921 * @param str the String to be converted
\r
4922 * @param defaultmapping Indicates if all mappings defined in
\r
4923 * CaseFolding.txt is to be used, otherwise the
\r
4924 * mappings for dotted I and dotless i marked with
\r
4925 * 'I' in CaseFolding.txt will be skipped.
\r
4926 * @return the case folding equivalent of the character, if
\r
4927 * any; otherwise the character itself.
\r
4928 * @see #foldCase(int, boolean)
\r
4931 public static String foldCase(String str, boolean defaultmapping) {
\r
4932 return foldCase(str, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
/**
 * {@icu} Option value for case folding: use the default mappings defined in
 * CaseFolding.txt. Accepted as the <code>options</code> argument of
 * foldCase(int, int) and foldCase(String, int).
 */
public static final int FOLD_CASE_DEFAULT = 0x0000;
\r
/**
 * {@icu} Option value for case folding: exclude the mappings for dotted I
 * and dotless i marked with 'I' in CaseFolding.txt. Accepted as the
 * <code>options</code> argument of foldCase(int, int) and
 * foldCase(String, int).
 */
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0x0001;
\r
4949 * {@icu} The given character is mapped to its case folding equivalent according
\r
4950 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
\r
4951 * folding equivalent, the character itself is returned.
\r
4953 * <p>This function only returns the simple, single-code point case mapping.
\r
4954 * Full case mappings should be used whenever possible because they produce
\r
4955 * better results by working on whole strings.
\r
4956 * They can map to a result string with a different length as appropriate.
\r
4957 * Full case mappings are applied by the case mapping functions
\r
4958 * that take String parameters rather than code points (int).
\r
4959 * See also the User Guide chapter on C/POSIX migration:
\r
4960 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
4962 * @param ch the character to be converted
\r
4963 * @param options A bit set for special processing. Currently the recognised options
\r
4964 * are FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
\r
4965 * @return the case folding equivalent of the character, if any; otherwise the
\r
4966 * character itself.
\r
4967 * @see #foldCase(String, boolean)
\r
4970 public static int foldCase(int ch, int options) {
\r
4971 return UCaseProps.INSTANCE.fold(ch, options);
\r
4975 * {@icu} The given string is mapped to its case folding equivalent according to
\r
4976 * UnicodeData.txt and CaseFolding.txt; if any character has no case
\r
4977 * folding equivalent, the character itself is returned.
\r
4978 * "Full", multiple-code point case folding mappings are returned here.
\r
4979 * For "simple" single-code point mappings use the API
\r
4980 * foldCase(int ch, boolean defaultmapping).
\r
4981 * @param str the String to be converted
\r
4982 * @param options A bit set for special processing. Currently the recognised options
\r
4983 * are FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
\r
4984 * @return the case folding equivalent of the character, if any; otherwise the
\r
4985 * character itself.
\r
4986 * @see #foldCase(int, boolean)
\r
4989 public static final String foldCase(String str, int options) {
\r
4990 StringBuffer result = new StringBuffer(str.length());
\r
4993 length = str.length();
\r
4994 for(i=0; i<length;) {
\r
4995 c=UTF16.charAt(str, i);
\r
4996 i+=UTF16.getCharCount(c);
\r
4997 c = UCaseProps.INSTANCE.toFullFolding(c, result, options);
\r
4999 /* decode the result */
\r
5001 /* (not) original code point */
\r
5003 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
5004 /* mapping already appended to result */
\r
5006 /* } else { append single-code point mapping */
\r
5009 result.append((char)c);
\r
5011 UTF16.append(result, c);
\r
5014 return result.toString();
\r
5018 * {@icu} Return numeric value of Han code points.
\r
5019 * <br> This returns the value of Han 'numeric' code points,
\r
5020 * including those for zero, ten, hundred, thousand, ten thousand,
\r
5021 * and hundred million.
\r
5022 * This includes both the standard and 'checkwriting'
\r
5023 * characters, the 'big circle' zero character, and the standard
\r
5025 * @param ch code point to query
\r
5026 * @return value if it is a Han 'numeric character,' otherwise return -1.
\r
5029 public static int getHanNumericValue(int ch)
\r
5031 // TODO: Are these all covered by Unicode numeric value data?
\r
5034 case IDEOGRAPHIC_NUMBER_ZERO_ :
\r
5035 case CJK_IDEOGRAPH_COMPLEX_ZERO_ :
\r
5036 return 0; // Han Zero
\r
5037 case CJK_IDEOGRAPH_FIRST_ :
\r
5038 case CJK_IDEOGRAPH_COMPLEX_ONE_ :
\r
5039 return 1; // Han One
\r
5040 case CJK_IDEOGRAPH_SECOND_ :
\r
5041 case CJK_IDEOGRAPH_COMPLEX_TWO_ :
\r
5042 return 2; // Han Two
\r
5043 case CJK_IDEOGRAPH_THIRD_ :
\r
5044 case CJK_IDEOGRAPH_COMPLEX_THREE_ :
\r
5045 return 3; // Han Three
\r
5046 case CJK_IDEOGRAPH_FOURTH_ :
\r
5047 case CJK_IDEOGRAPH_COMPLEX_FOUR_ :
\r
5048 return 4; // Han Four
\r
5049 case CJK_IDEOGRAPH_FIFTH_ :
\r
5050 case CJK_IDEOGRAPH_COMPLEX_FIVE_ :
\r
5051 return 5; // Han Five
\r
5052 case CJK_IDEOGRAPH_SIXTH_ :
\r
5053 case CJK_IDEOGRAPH_COMPLEX_SIX_ :
\r
5054 return 6; // Han Six
\r
5055 case CJK_IDEOGRAPH_SEVENTH_ :
\r
5056 case CJK_IDEOGRAPH_COMPLEX_SEVEN_ :
\r
5057 return 7; // Han Seven
\r
5058 case CJK_IDEOGRAPH_EIGHTH_ :
\r
5059 case CJK_IDEOGRAPH_COMPLEX_EIGHT_ :
\r
5060 return 8; // Han Eight
\r
5061 case CJK_IDEOGRAPH_NINETH_ :
\r
5062 case CJK_IDEOGRAPH_COMPLEX_NINE_ :
\r
5063 return 9; // Han Nine
\r
5064 case CJK_IDEOGRAPH_TEN_ :
\r
5065 case CJK_IDEOGRAPH_COMPLEX_TEN_ :
\r
5067 case CJK_IDEOGRAPH_HUNDRED_ :
\r
5068 case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ :
\r
5070 case CJK_IDEOGRAPH_THOUSAND_ :
\r
5071 case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ :
\r
5073 case CJK_IDEOGRAPH_TEN_THOUSAND_ :
\r
5075 case CJK_IDEOGRAPH_HUNDRED_MILLION_ :
\r
5078 return -1; // no value
\r
5082 * {@icu} <p>Returns an iterator for character types, iterating over codepoints.</p>
\r
5083 * Example of use:<br>
\r
5085 * RangeValueIterator iterator = UCharacter.getTypeIterator();
\r
5086 * RangeValueIterator.Element element = new RangeValueIterator.Element();
\r
5087 * while (iterator.next(element)) {
\r
5088 * System.out.println("Codepoint \\u" +
\r
5089 * Integer.toHexString(element.start) +
\r
5090 * " to codepoint \\u" +
\r
5091 * Integer.toHexString(element.limit - 1) +
\r
5092 * " has the character type " +
\r
5096 * @return an iterator
\r
5099 public static RangeValueIterator getTypeIterator()
\r
5101 return new UCharacterTypeIterator(UCharacterProperty.INSTANCE);
\r
5105 * {@icu} <p>Returns an iterator for character names, iterating over codepoints.</p>
\r
5106 * <p>This API only gets the iterator for the modern, most up-to-date
\r
5107 * Unicode names. For older 1.0 Unicode names use get1_0NameIterator() or
\r
5108 * for extended names use getExtendedNameIterator().</p>
\r
5109 * Example of use:<br>
\r
5111 * ValueIterator iterator = UCharacter.getNameIterator();
\r
5112 * ValueIterator.Element element = new ValueIterator.Element();
\r
5113 * while (iterator.next(element)) {
\r
5114 * System.out.println("Codepoint \\u" +
\r
5115 * Integer.toHexString(element.codepoint) +
\r
5116 * " has the name " + (String)element.value);
\r
5119 * <p>The maximal range which the name iterator iterates is from
\r
5120 * UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
\r
5121 * @return an iterator
\r
5124 public static ValueIterator getNameIterator(){
\r
5125 return new UCharacterNameIterator(UCharacterName.INSTANCE,
\r
5126 UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
5130 * {@icu} <p>Returns an iterator for character names, iterating over codepoints.</p>
\r
5131 * <p>This API only gets the iterator for the older 1.0 Unicode names.
\r
5132 * For modern, most up-to-date Unicode names use getNameIterator() or
\r
5133 * for extended names use getExtendedNameIterator().</p>
\r
5134 * Example of use:<br>
\r
5136 * ValueIterator iterator = UCharacter.get1_0NameIterator();
\r
5137 * ValueIterator.Element element = new ValueIterator.Element();
\r
5138 * while (iterator.next(element)) {
\r
5139 * System.out.println("Codepoint \\u" +
\r
5140 * Integer.toHexString(element.codepoint) +
\r
5141 * " has the name " + (String)element.value);
\r
5144 * <p>The maximal range which the name iterator iterates is from
\r
5145 * @return an iterator
\r
5148 public static ValueIterator getName1_0Iterator(){
\r
5149 return new UCharacterNameIterator(UCharacterName.INSTANCE,
\r
5150 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
5154 * {@icu} <p>Returns an iterator for character names, iterating over codepoints.</p>
\r
5155 * <p>This API only gets the iterator for the extended names.
\r
5156 * For modern, most up-to-date Unicode names use getNameIterator() or
\r
5157 * for older 1.0 Unicode names use get1_0NameIterator().</p>
\r
5158 * Example of use:<br>
\r
5160 * ValueIterator iterator = UCharacter.getExtendedNameIterator();
\r
5161 * ValueIterator.Element element = new ValueIterator.Element();
\r
5162 * while (iterator.next(element)) {
\r
5163 * System.out.println("Codepoint \\u" +
\r
5164 * Integer.toHexString(element.codepoint) +
\r
5165 * " has the name " + (String)element.value);
\r
5168 * <p>The maximal range which the name iterator iterates is from
\r
5169 * @return an iterator
\r
5172 public static ValueIterator getExtendedNameIterator(){
\r
5173 return new UCharacterNameIterator(UCharacterName.INSTANCE,
\r
5174 UCharacterNameChoice.EXTENDED_CHAR_NAME);
\r
5178 * {@icu} Returns the "age" of the code point.</p>
\r
5179 * <p>The "age" is the Unicode version when the code point was first
\r
5180 * designated (as a non-character or for Private Use) or assigned a
\r
5182 * <p>This can be useful to avoid emitting code points to receiving
\r
5183 * processes that do not accept newer characters.</p>
\r
5184 * <p>The data is from the UCD file DerivedAge.txt.</p>
\r
5185 * @param ch The code point.
\r
5186 * @return the Unicode version number
\r
5189 public static VersionInfo getAge(int ch)
\r
5191 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
5192 throw new IllegalArgumentException("Codepoint out of bounds");
\r
5194 return UCharacterProperty.INSTANCE.getAge(ch);
\r
5198 * {@icu} <p>Check a binary Unicode property for a code point.</p>
\r
5199 * <p>Unicode, especially in version 3.2, defines many more properties
\r
5200 * than the original set in UnicodeData.txt.</p>
\r
5201 * <p>This API is intended to reflect Unicode properties as defined in
\r
5202 * the Unicode Character Database (UCD) and Unicode Technical Reports
\r
5204 * <p>For details about the properties see
\r
5205 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
\r
5206 * <p>For names of Unicode properties see the UCD file
\r
5207 * PropertyAliases.txt.</p>
\r
5208 * <p>This API does not check the validity of the codepoint.</p>
\r
5209 * <p>Important: If ICU is built with UCD files from Unicode versions
\r
5210 * below 3.2, then properties marked with "new" are not or
\r
5211 * not fully available.</p>
\r
5212 * @param ch code point to test.
\r
5213 * @param property selector constant from com.ibm.icu.lang.UProperty,
\r
5214 * identifies which binary property to check.
\r
5215 * @return true or false according to the binary Unicode property value
\r
5216 * for ch. Also false if property is out of bounds or if the
\r
5217 * Unicode version does not have data for the property at all, or
\r
5218 * not for this code point.
\r
5219 * @see com.ibm.icu.lang.UProperty
\r
5222 public static boolean hasBinaryProperty(int ch, int property)
\r
5224 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
5225 throw new IllegalArgumentException("Codepoint out of bounds");
\r
5227 return UCharacterProperty.INSTANCE.hasBinaryProperty(ch, property);
\r
5231 * {@icu} <p>Check if a code point has the Alphabetic Unicode property.</p>
\r
5232 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).</p>
\r
5233 * <p>Different from UCharacter.isLetter(ch)!</p>
\r
5235 * @param ch codepoint to be tested
\r
5237 public static boolean isUAlphabetic(int ch)
\r
5239 return hasBinaryProperty(ch, UProperty.ALPHABETIC);
\r
5243 * {@icu} <p>Check if a code point has the Lowercase Unicode property.</p>
\r
5244 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE).</p>
\r
5245 * <p>This is different from UCharacter.isLowerCase(ch)!</p>
\r
5246 * @param ch codepoint to be tested
\r
5249 public static boolean isULowercase(int ch)
\r
5251 return hasBinaryProperty(ch, UProperty.LOWERCASE);
\r
5255 * {@icu} <p>Check if a code point has the Uppercase Unicode property.</p>
\r
5256 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE).</p>
\r
5257 * <p>This is different from UCharacter.isUpperCase(ch)!</p>
\r
5258 * @param ch codepoint to be tested
\r
5261 public static boolean isUUppercase(int ch)
\r
5263 return hasBinaryProperty(ch, UProperty.UPPERCASE);
\r
5267 * {@icu} <p>Check if a code point has the White_Space Unicode property.</p>
\r
5268 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE).</p>
\r
5269 * <p>This is different from both UCharacter.isSpace(ch) and
\r
5270 * UCharacter.isWhitespace(ch)!</p>
\r
5271 * @param ch codepoint to be tested
\r
5274 public static boolean isUWhiteSpace(int ch)
\r
5276 return hasBinaryProperty(ch, UProperty.WHITE_SPACE);
\r
/*
 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
 * The table is indexed by the Grapheme_Cluster_Break value named in the
 * comment next to each entry.
 */
private static final int /* UHangulSyllableType */ gcbToHst[]={
    HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
    HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
    HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
    HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
    HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
    HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
    HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
    HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
    HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
    HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
    /*
     * Omit GCB values beyond what we need for hst.
     * The code that reads this table checks against the array length.
     */
};
\r
5301 * {@icu} <p>Returns the property value for an Unicode property type of a code point.
\r
5302 * Also returns binary and mask property values.</p>
\r
5303 * <p>Unicode, especially in version 3.2, defines many more properties than
\r
5304 * the original set in UnicodeData.txt.</p>
\r
5305 * <p>The properties APIs are intended to reflect Unicode properties as
\r
5306 * defined in the Unicode Character Database (UCD) and Unicode Technical
\r
5307 * Reports (UTR). For details about the properties see
\r
5308 * http://www.unicode.org/.</p>
\r
5309 * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
\r
5313 * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
\r
5314 * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
\r
5315 * boolean b = (ideo == 1) ? true : false;
\r
5317 * @param ch code point to test.
\r
5318 * @param type UProperty selector constant, identifies which binary
\r
5319 * property to check. Must be
\r
5320 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
\r
5321 * UProperty.INT_START <= type < UProperty.INT_LIMIT or
\r
5322 * UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
\r
5323 * @return numeric value that is directly the property value or,
\r
5324 * for enumerated properties, corresponds to the numeric value of
\r
5325 * the enumerated constant of the respective property value
\r
5326 * enumeration type (cast to enum type if necessary).
\r
5327 * Returns 0 or 1 (for false / true) for binary Unicode properties.
\r
5328 * Returns a bit-mask for mask properties.
\r
5329 * Returns 0 if 'type' is out of bounds or if the Unicode version
\r
5330 * does not have data for the property at all, or not for this code
\r
5333 * @see #hasBinaryProperty
\r
5334 * @see #getIntPropertyMinValue
\r
5335 * @see #getIntPropertyMaxValue
\r
5336 * @see #getUnicodeVersion
\r
5339 public static int getIntPropertyValue(int ch, int type)
\r
5341 if (type < UProperty.BINARY_START) {
\r
5342 return 0; // undefined
\r
5344 else if (type < UProperty.BINARY_LIMIT) {
\r
5345 return hasBinaryProperty(ch, type) ? 1 : 0;
\r
5347 else if (type < UProperty.INT_START) {
\r
5348 return 0; // undefined
\r
5350 else if (type < UProperty.INT_LIMIT) {
\r
5352 case UProperty.BIDI_CLASS:
\r
5353 return getDirection(ch);
\r
5354 case UProperty.BLOCK:
\r
5355 return UnicodeBlock.idOf(ch);
\r
5356 case UProperty.CANONICAL_COMBINING_CLASS:
\r
5357 return getCombiningClass(ch);
\r
5358 case UProperty.DECOMPOSITION_TYPE:
\r
5359 return UCharacterProperty.INSTANCE.getAdditional(ch, 2)
\r
5360 & DECOMPOSITION_TYPE_MASK_;
\r
5361 case UProperty.EAST_ASIAN_WIDTH:
\r
5362 return (UCharacterProperty.INSTANCE.getAdditional(ch, 0)
\r
5363 & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
\r
5364 case UProperty.GENERAL_CATEGORY:
\r
5365 return getType(ch);
\r
5366 case UProperty.JOINING_GROUP:
\r
5367 return UBiDiProps.INSTANCE.getJoiningGroup(ch);
\r
5368 case UProperty.JOINING_TYPE:
\r
5369 return UBiDiProps.INSTANCE.getJoiningType(ch);
\r
5370 case UProperty.LINE_BREAK:
\r
5371 return (UCharacterProperty.INSTANCE
\r
5372 .getAdditional(ch, LB_VWORD)& LB_MASK)>>LB_SHIFT;
\r
5373 case UProperty.NUMERIC_TYPE:
\r
5374 return ntvGetType(getNumericTypeValue(UCharacterProperty
\r
5375 .INSTANCE.getProperty(ch)));
\r
5376 case UProperty.SCRIPT:
\r
5377 return UScript.getScript(ch);
\r
5378 case UProperty.HANGUL_SYLLABLE_TYPE: {
\r
5379 /* see comments on gcbToHst[] above */
\r
5380 int gcb=(UCharacterProperty.INSTANCE.getAdditional(ch, 2)&GCB_MASK)>>GCB_SHIFT;
\r
5381 if(gcb<gcbToHst.length) {
\r
5382 return gcbToHst[gcb];
\r
5384 return HangulSyllableType.NOT_APPLICABLE;
\r
5387 case UProperty.NFD_QUICK_CHECK:
\r
5388 case UProperty.NFKD_QUICK_CHECK:
\r
5389 case UProperty.NFC_QUICK_CHECK:
\r
5390 case UProperty.NFKC_QUICK_CHECK:
\r
5391 return Norm2AllModes.getN2WithImpl(type-UProperty.NFD_QUICK_CHECK).getQuickCheck(ch);
\r
5392 case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
\r
5393 return Norm2AllModes.getNFCInstance().impl.getFCDTrie().get(ch)>>8;
\r
5394 case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
\r
5395 return Norm2AllModes.getNFCInstance().impl.getFCDTrie().get(ch)&0xff;
\r
5396 case UProperty.GRAPHEME_CLUSTER_BREAK:
\r
5397 return (UCharacterProperty.INSTANCE.getAdditional(ch, 2)& GCB_MASK)>>GCB_SHIFT;
\r
5398 case UProperty.SENTENCE_BREAK:
\r
5399 return (UCharacterProperty.INSTANCE.getAdditional(ch, 2)& SB_MASK)>>SB_SHIFT;
\r
5400 case UProperty.WORD_BREAK:
\r
5401 return (UCharacterProperty.INSTANCE.getAdditional(ch, 2)& WB_MASK)>>WB_SHIFT;
\r
5402 /* Values were tested for variable type from Integer.MIN_VALUE
\r
5403 * to UProperty.INT_LIMIT and none would not reach the default case.
\r
5406 default: return 0; /* undefined */
\r
5409 } else if (type == UProperty.GENERAL_CATEGORY_MASK) {
\r
5410 return UCharacterProperty.getMask(getType(ch));
\r
5412 return 0; // undefined
\r
5415 * {@icu} Returns a string version of the property value.
\r
5416 * @param propertyEnum The property enum value.
\r
5417 * @param codepoint The codepoint value.
\r
5418 * @param nameChoice The choice of the name.
\r
5419 * @return value as string
\r
5421 * @deprecated This API is ICU internal only.
\r
5424 public static String getStringPropertyValue(int propertyEnum, int codepoint, int nameChoice) {
\r
5425 if ((propertyEnum >= UProperty.BINARY_START && propertyEnum < UProperty.BINARY_LIMIT) ||
\r
5426 (propertyEnum >= UProperty.INT_START && propertyEnum < UProperty.INT_LIMIT)) {
\r
5427 return getPropertyValueName(propertyEnum, getIntPropertyValue(codepoint, propertyEnum),
\r
5430 if (propertyEnum == UProperty.NUMERIC_VALUE) {
\r
5431 return String.valueOf(getUnicodeNumericValue(codepoint));
\r
5433 // otherwise must be string property
\r
5434 switch (propertyEnum) {
\r
5435 case UProperty.AGE: return getAge(codepoint).toString();
\r
5436 case UProperty.ISO_COMMENT: return getISOComment(codepoint);
\r
5437 case UProperty.BIDI_MIRRORING_GLYPH: return UTF16.valueOf(getMirror(codepoint));
\r
5438 case UProperty.CASE_FOLDING: return foldCase(UTF16.valueOf(codepoint), true);
\r
5439 case UProperty.LOWERCASE_MAPPING: return toLowerCase(UTF16.valueOf(codepoint));
\r
5440 case UProperty.NAME: return getName(codepoint);
\r
5441 case UProperty.SIMPLE_CASE_FOLDING: return UTF16.valueOf(foldCase(codepoint,true));
\r
5442 case UProperty.SIMPLE_LOWERCASE_MAPPING: return UTF16.valueOf(toLowerCase(codepoint));
\r
5443 case UProperty.SIMPLE_TITLECASE_MAPPING: return UTF16.valueOf(toTitleCase(codepoint));
\r
5444 case UProperty.SIMPLE_UPPERCASE_MAPPING: return UTF16.valueOf(toUpperCase(codepoint));
\r
5445 case UProperty.TITLECASE_MAPPING: return toTitleCase(UTF16.valueOf(codepoint),null);
\r
5446 case UProperty.UNICODE_1_NAME: return getName1_0(codepoint);
\r
5447 case UProperty.UPPERCASE_MAPPING: return toUpperCase(UTF16.valueOf(codepoint));
\r
5449 throw new IllegalArgumentException("Illegal Property Enum");
\r
5454 * {@icu} Returns the minimum value for an integer/binary Unicode property type.
\r
5455 * Can be used together with UCharacter.getIntPropertyMaxValue(int)
\r
5456 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
\r
5457 * @param type UProperty selector constant, identifies which binary
\r
5458 * property to check. Must be
\r
5459 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
\r
5460 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
\r
5461 * @return Minimum value returned by UCharacter.getIntPropertyValue(int)
\r
5462 * for a Unicode property. 0 if the property
\r
5463 * selector 'type' is out of range.
\r
5465 * @see #hasBinaryProperty
\r
5466 * @see #getUnicodeVersion
\r
5467 * @see #getIntPropertyMaxValue
\r
5468 * @see #getIntPropertyValue
\r
5471 public static int getIntPropertyMinValue(int type){
\r
5473 return 0; // undefined; and: all other properties have a minimum value of 0
\r
5478 * {@icu} Returns the maximum value for an integer/binary Unicode property.
\r
5479 * Can be used together with UCharacter.getIntPropertyMinValue(int)
\r
5480 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
\r
5481 * Examples for min/max values (for Unicode 3.2):
\r
5483 * <li> UProperty.BIDI_CLASS: 0/18
\r
5484 * (UCharacterDirection.LEFT_TO_RIGHT/UCharacterDirection.BOUNDARY_NEUTRAL)
\r
5485 * <li> UProperty.SCRIPT: 0/45 (UScript.COMMON/UScript.TAGBANWA)
\r
5486 * <li> UProperty.IDEOGRAPHIC: 0/1 (false/true)
\r
5488 * For undefined UProperty constant values, min/max values will be 0/-1.
\r
5489 * @param type UProperty selector constant, identifies which binary
\r
5490 * property to check. Must be
\r
5491 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
\r
5492 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
\r
5493 * @return Maximum value returned by u_getIntPropertyValue for a Unicode
\r
5494 * property. <= 0 if the property selector 'type' is out of range.
\r
5496 * @see #hasBinaryProperty
\r
5497 * @see #getUnicodeVersion
\r
5498 * @see #getIntPropertyMaxValue
\r
5499 * @see #getIntPropertyValue
\r
5502 public static int getIntPropertyMaxValue(int type)
\r
5504 if (type < UProperty.BINARY_START) {
\r
5505 return -1; // undefined
\r
5507 else if (type < UProperty.BINARY_LIMIT) {
\r
5508 return 1; // maximum TRUE for all binary properties
\r
5510 else if (type < UProperty.INT_START) {
\r
5511 return -1; // undefined
\r
5513 else if (type < UProperty.INT_LIMIT) {
\r
5515 case UProperty.BIDI_CLASS:
\r
5516 case UProperty.JOINING_GROUP:
\r
5517 case UProperty.JOINING_TYPE:
\r
5518 return UBiDiProps.INSTANCE.getMaxValue(type);
\r
5519 case UProperty.BLOCK:
\r
5520 return (UCharacterProperty.INSTANCE.getMaxValues(0) & BLOCK_MASK_)
\r
5522 case UProperty.CANONICAL_COMBINING_CLASS:
\r
5523 case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
\r
5524 case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
\r
5525 return 0xff; // TODO do we need to be more precise,
\r
5526 // getting the actual maximum?
\r
5527 case UProperty.DECOMPOSITION_TYPE:
\r
5528 return UCharacterProperty.INSTANCE.getMaxValues(2) & DECOMPOSITION_TYPE_MASK_;
\r
5529 case UProperty.EAST_ASIAN_WIDTH:
\r
5530 return (UCharacterProperty.INSTANCE.getMaxValues(0) & EAST_ASIAN_MASK_)
\r
5531 >> EAST_ASIAN_SHIFT_;
\r
5532 case UProperty.GENERAL_CATEGORY:
\r
5533 return UCharacterCategory.CHAR_CATEGORY_COUNT - 1;
\r
5534 case UProperty.LINE_BREAK:
\r
5535 return (UCharacterProperty.INSTANCE.getMaxValues(LB_VWORD) & LB_MASK)
\r
5537 case UProperty.NUMERIC_TYPE:
\r
5538 return NumericType.COUNT - 1;
\r
5539 case UProperty.SCRIPT:
\r
5540 return UCharacterProperty.INSTANCE.getMaxValues(0) & SCRIPT_MASK_;
\r
5541 case UProperty.HANGUL_SYLLABLE_TYPE:
\r
5542 return HangulSyllableType.COUNT-1;
\r
5543 case UProperty.NFD_QUICK_CHECK:
\r
5544 case UProperty.NFKD_QUICK_CHECK:
\r
5545 return 1; // YES -- these are never "maybe", only "no" or "yes"
\r
5546 case UProperty.NFC_QUICK_CHECK:
\r
5547 case UProperty.NFKC_QUICK_CHECK:
\r
5548 return 2; // MAYBE
\r
5549 case UProperty.GRAPHEME_CLUSTER_BREAK:
\r
5550 return (UCharacterProperty.INSTANCE.getMaxValues(2) & GCB_MASK) >> GCB_SHIFT;
\r
5551 case UProperty.SENTENCE_BREAK:
\r
5552 return (UCharacterProperty.INSTANCE.getMaxValues(2) & SB_MASK) >> SB_SHIFT;
\r
5553 case UProperty.WORD_BREAK:
\r
5554 return (UCharacterProperty.INSTANCE.getMaxValues(2) & WB_MASK) >> WB_SHIFT;
\r
5555 /* Values were tested for variable type from Integer.MIN_VALUE
\r
5556 * to UProperty.INT_LIMIT and none would not reach the default case.
\r
5559 default: return -1; // undefined
\r
5563 return -1; // undefined
\r
5567 * Provide the java.lang.Character forDigit API, for convenience.
\r
5570 public static char forDigit(int digit, int radix) {
\r
5571 return java.lang.Character.forDigit(digit, radix);
\r
5574 // JDK 1.5 API coverage
\r
/**
 * Cover the JDK 1.5 API, for convenience.
 * The smallest high (lead) surrogate code unit; same value as
 * UTF16.LEAD_SURROGATE_MIN_VALUE.
 * @see UTF16#LEAD_SURROGATE_MIN_VALUE
 */
public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The largest high (lead) surrogate code unit; same value as
 * UTF16.LEAD_SURROGATE_MAX_VALUE.
 * @see UTF16#LEAD_SURROGATE_MAX_VALUE
 */
public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The smallest low (trail) surrogate code unit; same value as
 * UTF16.TRAIL_SURROGATE_MIN_VALUE.
 * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
 */
public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The largest low (trail) surrogate code unit; same value as
 * UTF16.TRAIL_SURROGATE_MAX_VALUE.
 * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
 */
public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The smallest surrogate code unit; same value as UTF16.SURROGATE_MIN_VALUE.
 * @see UTF16#SURROGATE_MIN_VALUE
 */
public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The largest surrogate code unit; same value as UTF16.SURROGATE_MAX_VALUE.
 * @see UTF16#SURROGATE_MAX_VALUE
 */
public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The smallest supplementary code point; same value as
 * UTF16.SUPPLEMENTARY_MIN_VALUE.
 * @see UTF16#SUPPLEMENTARY_MIN_VALUE
 */
public static final int  MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The largest code point; same value as UTF16.CODEPOINT_MAX_VALUE.
 * @see UTF16#CODEPOINT_MAX_VALUE
 */
public static final int  MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * The smallest code point; same value as UTF16.CODEPOINT_MIN_VALUE.
 * @see UTF16#CODEPOINT_MIN_VALUE
 */
public static final int  MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
\r
5640 * Cover the JDK 1.5 API, for convenience.
\r
5641 * @param cp the code point to check
\r
5642 * @return true if cp is a valid code point
\r
5645 public static final boolean isValidCodePoint(int cp) {
\r
5646 return cp >= 0 && cp <= MAX_CODE_POINT;
\r
5650 * Cover the JDK 1.5 API, for convenience.
\r
5651 * @param cp the code point to check
\r
5652 * @return true if cp is a supplementary code point
\r
5655 public static final boolean isSupplementaryCodePoint(int cp) {
\r
5656 return cp >= UTF16.SUPPLEMENTARY_MIN_VALUE
\r
5657 && cp <= UTF16.CODEPOINT_MAX_VALUE;
\r
5661 * Cover the JDK 1.5 API, for convenience.
\r
5662 * @param ch the char to check
\r
5663 * @return true if ch is a high (lead) surrogate
\r
5666 public static boolean isHighSurrogate(char ch) {
\r
5667 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
\r
5671 * Cover the JDK 1.5 API, for convenience.
\r
5672 * @param ch the char to check
\r
5673 * @return true if ch is a low (trail) surrogate
\r
5676 public static boolean isLowSurrogate(char ch) {
\r
5677 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
\r
5681 * Cover the JDK 1.5 API, for convenience. Return true if the chars
\r
5682 * form a valid surrogate pair.
\r
5683 * @param high the high (lead) char
\r
5684 * @param low the low (trail) char
\r
5685 * @return true if high, low form a surrogate pair
\r
5688 public static final boolean isSurrogatePair(char high, char low) {
\r
5689 return isHighSurrogate(high) && isLowSurrogate(low);
\r
5693 * Cover the JDK 1.5 API, for convenience. Return the number of chars needed
\r
5694 * to represent the code point. This does not check the
\r
5695 * code point for validity.
\r
5696 * @param cp the code point to check
\r
5697 * @return the number of chars needed to represent the code point
\r
5698 * @see UTF16#getCharCount
\r
5701 public static int charCount(int cp) {
\r
5702 return UTF16.getCharCount(cp);
\r
5706 * Cover the JDK 1.5 API, for convenience. Return the code point represented by
\r
5707 * the characters. This does not check the surrogate pair for validity.
\r
5708 * @param high the high (lead) surrogate
\r
5709 * @param low the low (trail) surrogate
\r
5710 * @return the code point formed by the surrogate pair
\r
5713 public static final int toCodePoint(char high, char low) {
\r
5714 return UCharacterProperty.getRawSupplementary(high, low);
\r
5718 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
\r
5719 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5720 * API. This examines only the characters at index and index+1.
\r
5721 * @param seq the characters to check
\r
5722 * @param index the index of the first or only char forming the code point
\r
5723 * @return the code point at the index
\r
5726 public static final int codePointAt(CharSequence seq, int index) {
\r
5727 char c1 = seq.charAt(index++);
\r
5728 if (isHighSurrogate(c1)) {
\r
5729 if (index < seq.length()) {
\r
5730 char c2 = seq.charAt(index);
\r
5731 if (isLowSurrogate(c2)) {
\r
5732 return toCodePoint(c1, c2);
\r
5740 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
\r
5741 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5742 * API. This examines only the characters at index and index+1.
\r
5743 * @param text the characters to check
\r
5744 * @param index the index of the first or only char forming the code point
\r
5745 * @return the code point at the index
\r
5748 public static final int codePointAt(char[] text, int index) {
\r
5749 char c1 = text[index++];
\r
5750 if (isHighSurrogate(c1)) {
\r
5751 if (index < text.length) {
\r
5752 char c2 = text[index];
\r
5753 if (isLowSurrogate(c2)) {
\r
5754 return toCodePoint(c1, c2);
\r
5762 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
\r
5763 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5764 * API. This examines only the characters at index and index+1.
\r
5765 * @param text the characters to check
\r
5766 * @param index the index of the first or only char forming the code point
\r
5767 * @param limit the limit of the valid text
\r
5768 * @return the code point at the index
\r
5771 public static final int codePointAt(char[] text, int index, int limit) {
\r
5772 if (index >= limit || limit > text.length) {
\r
5773 throw new IndexOutOfBoundsException();
\r
5775 char c1 = text[index++];
\r
5776 if (isHighSurrogate(c1)) {
\r
5777 if (index < limit) {
\r
5778 char c2 = text[index];
\r
5779 if (isLowSurrogate(c2)) {
\r
5780 return toCodePoint(c1, c2);
\r
5788 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
\r
5789 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5790 * API. This examines only the characters at index-1 and index-2.
\r
5791 * @param seq the characters to check
\r
5792 * @param index the index after the last or only char forming the code point
\r
5793 * @return the code point before the index
\r
5796 public static final int codePointBefore(CharSequence seq, int index) {
\r
5797 char c2 = seq.charAt(--index);
\r
5798 if (isLowSurrogate(c2)) {
\r
5800 char c1 = seq.charAt(--index);
\r
5801 if (isHighSurrogate(c1)) {
\r
5802 return toCodePoint(c1, c2);
\r
5810 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
\r
5811 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5812 * API. This examines only the characters at index-1 and index-2.
\r
5813 * @param text the characters to check
\r
5814 * @param index the index after the last or only char forming the code point
\r
5815 * @return the code point before the index
\r
5818 public static final int codePointBefore(char[] text, int index) {
\r
5819 char c2 = text[--index];
\r
5820 if (isLowSurrogate(c2)) {
\r
5822 char c1 = text[--index];
\r
5823 if (isHighSurrogate(c1)) {
\r
5824 return toCodePoint(c1, c2);
\r
5832 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
\r
5833 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5834 * API. This examines only the characters at index-1 and index-2.
\r
5835 * @param text the characters to check
\r
5836 * @param index the index after the last or only char forming the code point
\r
5837 * @param limit the start of the valid text
\r
5838 * @return the code point before the index
\r
5841 public static final int codePointBefore(char[] text, int index, int limit) {
\r
5842 if (index <= limit || limit < 0) {
\r
5843 throw new IndexOutOfBoundsException();
\r
5845 char c2 = text[--index];
\r
5846 if (isLowSurrogate(c2)) {
\r
5847 if (index > limit) {
\r
5848 char c1 = text[--index];
\r
5849 if (isHighSurrogate(c1)) {
\r
5850 return toCodePoint(c1, c2);
\r
5858 * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
\r
5859 * code point into the destination at the given index.
\r
5860 * @param cp the code point to convert
\r
5861 * @param dst the destination array into which to put the char(s) representing the code point
\r
5862 * @param dstIndex the index at which to put the first (or only) char
\r
5863 * @return the count of the number of chars written (1 or 2)
\r
5864 * @throws IllegalArgumentException if cp is not a valid code point
\r
5867 public static final int toChars(int cp, char[] dst, int dstIndex) {
\r
5869 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
\r
5870 dst[dstIndex] = (char)cp;
\r
5873 if (cp <= MAX_CODE_POINT) {
\r
5874 dst[dstIndex] = UTF16.getLeadSurrogate(cp);
\r
5875 dst[dstIndex+1] = UTF16.getTrailSurrogate(cp);
\r
5879 throw new IllegalArgumentException();
\r
5883 * Cover the JDK 1.5 API, for convenience. Returns a char array
\r
5884 * representing the code point.
\r
5885 * @param cp the code point to convert
\r
5886 * @return an array containing the char(s) representing the code point
\r
5887 * @throws IllegalArgumentException if cp is not a valid code point
\r
5890 public static final char[] toChars(int cp) {
\r
5892 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
\r
5893 return new char[] { (char)cp };
\r
5895 if (cp <= MAX_CODE_POINT) {
\r
5896 return new char[] {
\r
5897 UTF16.getLeadSurrogate(cp),
\r
5898 UTF16.getTrailSurrogate(cp)
\r
5902 throw new IllegalArgumentException();
\r
5906 * Cover the JDK API, for convenience. Return a byte representing the directionality of
\r
5909 * {@icunote} Unlike the JDK, this returns DIRECTIONALITY_LEFT_TO_RIGHT for undefined
\r
5910 * or out-of-bounds characters.
\r
5912 * {@icunote} The return value must be tested using the constants defined in {@link
\r
5913 * UCharacterEnums.ECharacterDirection} since the values are different from the ones
\r
5914 * defined by <code>java.lang.Character</code>.
\r
5915 * @param cp the code point to check
\r
5916 * @return the directionality of the code point
\r
5917 * @see #getDirection
\r
5920 public static byte getDirectionality(int cp)
\r
5922 return (byte)getDirection(cp);
\r
5926 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
\r
5927 * @param text the characters to check
\r
5928 * @param start the start of the range
\r
5929 * @param limit the limit of the range
\r
5930 * @return the number of code points in the range
\r
5933 public static int codePointCount(CharSequence text, int start, int limit) {
\r
5934 if (start < 0 || limit < start || limit > text.length()) {
\r
5935 throw new IndexOutOfBoundsException("start (" + start +
\r
5936 ") or limit (" + limit +
\r
5937 ") invalid or out of range 0, " + text.length());
\r
5940 int len = limit - start;
\r
5941 while (limit > start) {
\r
5942 char ch = text.charAt(--limit);
\r
5943 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
\r
5944 ch = text.charAt(--limit);
\r
5945 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
\r
5955 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
\r
5956 * @param text the characters to check
\r
5957 * @param start the start of the range
\r
5958 * @param limit the limit of the range
\r
5959 * @return the number of code points in the range
\r
5962 public static int codePointCount(char[] text, int start, int limit) {
\r
5963 if (start < 0 || limit < start || limit > text.length) {
\r
5964 throw new IndexOutOfBoundsException("start (" + start +
\r
5965 ") or limit (" + limit +
\r
5966 ") invalid or out of range 0, " + text.length);
\r
5969 int len = limit - start;
\r
5970 while (limit > start) {
\r
5971 char ch = text[--limit];
\r
5972 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
\r
5973 ch = text[--limit];
\r
5974 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
\r
5984 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
\r
5985 * @param text the characters to check
\r
5986 * @param index the index to adjust
\r
5987 * @param codePointOffset the number of code points by which to offset the index
\r
5988 * @return the adjusted index
\r
5991 public static int offsetByCodePoints(CharSequence text, int index, int codePointOffset) {
\r
5992 if (index < 0 || index > text.length()) {
\r
5993 throw new IndexOutOfBoundsException("index ( " + index +
\r
5994 ") out of range 0, " + text.length());
\r
5997 if (codePointOffset < 0) {
\r
5998 while (++codePointOffset <= 0) {
\r
5999 char ch = text.charAt(--index);
\r
6000 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
\r
6001 ch = text.charAt(--index);
\r
6002 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
\r
6003 if (++codePointOffset > 0) {
\r
6010 int limit = text.length();
\r
6011 while (--codePointOffset >= 0) {
\r
6012 char ch = text.charAt(index++);
\r
6013 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
\r
6014 ch = text.charAt(index++);
\r
6015 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
\r
6016 if (--codePointOffset < 0) {
\r
6028 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
\r
6029 * @param text the characters to check
\r
6030 * @param start the start of the range to check
\r
6031 * @param count the length of the range to check
\r
6032 * @param index the index to adjust
\r
6033 * @param codePointOffset the number of code points by which to offset the index
\r
6034 * @return the adjusted index
\r
6037 public static int offsetByCodePoints(char[] text, int start, int count, int index,
\r
6038 int codePointOffset) {
\r
6039 int limit = start + count;
\r
6040 if (start < 0 || limit < start || limit > text.length || index < start || index > limit) {
\r
6041 throw new IndexOutOfBoundsException("index ( " + index +
\r
6042 ") out of range " + start +
\r
6044 " in array 0, " + text.length);
\r
6047 if (codePointOffset < 0) {
\r
6048 while (++codePointOffset <= 0) {
\r
6049 char ch = text[--index];
\r
6050 if (index < start) {
\r
6051 throw new IndexOutOfBoundsException("index ( " + index +
\r
6052 ") < start (" + start +
\r
6055 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > start) {
\r
6056 ch = text[--index];
\r
6057 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
\r
6058 if (++codePointOffset > 0) {
\r
6065 while (--codePointOffset >= 0) {
\r
6066 char ch = text[index++];
\r
6067 if (index > limit) {
\r
6068 throw new IndexOutOfBoundsException("index ( " + index +
\r
6069 ") > limit (" + limit +
\r
6072 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
\r
6073 ch = text[index++];
\r
6074 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
\r
6075 if (--codePointOffset < 0) {
\r
6086 // private variables -------------------------------------------------
\r
6089 * To get the last character out from a data type
\r
6091 private static final int LAST_CHAR_MASK_ = 0xFFFF;
\r
6094 // * To get the last byte out from a data type
\r
6096 // private static final int LAST_BYTE_MASK_ = 0xFF;
\r
6099 // * Shift 16 bits
\r
6101 // private static final int SHIFT_16_ = 16;
\r
6104 // * Shift 24 bits
\r
6106 // private static final int SHIFT_24_ = 24;
\r
6109 // * Decimal radix
\r
6111 // private static final int DECIMAL_RADIX_ = 10;
\r
6114 * No break space code point
\r
6116 private static final int NO_BREAK_SPACE_ = 0xA0;
\r
6119 * Figure space code point
\r
6121 private static final int FIGURE_SPACE_ = 0x2007;
\r
6124 * Narrow no break space code point
\r
6126 private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;
\r
6129 * Ideographic number zero code point
\r
6131 private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
\r
6134 * CJK Ideograph, First code point
\r
6136 private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
\r
6139 * CJK Ideograph, Second code point
\r
6141 private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
\r
6144 * CJK Ideograph, Third code point
\r
6146 private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
\r
6149 * CJK Ideograph, Fourth code point
\r
6151 private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8;
\r
6154 * CJK Ideograph, FIFTH code point
\r
6156 private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
\r
6159 * CJK Ideograph, Sixth code point
\r
6161 private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
\r
6164 * CJK Ideograph, Seventh code point
\r
6166 private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
\r
6169 * CJK Ideograph, Eighth code point
\r
6171 private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
\r
6174 * CJK Ideograph, Nineth code point
\r
6176 private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
\r
6179 * Application Program command code point
\r
6181 private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;
\r
6184 * Unit separator code point
\r
6186 private static final int UNIT_SEPARATOR_ = 0x001F;
\r
6189 * Delete code point
\r
6191 private static final int DELETE_ = 0x007F;
\r
6193 * Numeric types and values in the main properties words.
\r
6195 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
\r
6196 private static final int getNumericTypeValue(int props) {
\r
6197 return props >> NUMERIC_TYPE_VALUE_SHIFT_;
\r
6199 /* constants for the storage form of numeric types and values */
\r
6200 private static final int NTV_NONE_ = 0;
\r
6201 private static final int NTV_DECIMAL_START_ = 1;
\r
6202 private static final int NTV_DIGIT_START_ = 11;
\r
6203 private static final int NTV_NUMERIC_START_ = 21;
\r
6204 private static final int NTV_FRACTION_START_ = 0xb0;
\r
6205 private static final int NTV_LARGE_START_ = 0x1e0;
\r
6206 private static final int NTV_RESERVED_START_ = 0x300;
\r
6208 private static final int ntvGetType(int ntv) {
\r
6210 (ntv==NTV_NONE_) ? NumericType.NONE :
\r
6211 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL :
\r
6212 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
\r
6213 NumericType.NUMERIC;
\r
6217 * Han digit characters
\r
6219 private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
\r
6220 private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
\r
6221 private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
\r
6222 private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
\r
6223 private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
\r
6224 private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
\r
6225 private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
\r
6226 private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
\r
6227 private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
\r
6228 private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
\r
6229 private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
\r
6230 private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
\r
6231 private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
\r
6232 private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
\r
6233 private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
\r
6234 private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
\r
6235 private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x824c;
\r
6236 private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
\r
6239 // * Zero Width Non Joiner.
\r
6240 // * Equivalent to icu4c ZWNJ.
\r
6242 // private static final int ZERO_WIDTH_NON_JOINER_ = 0x200c;
\r
6244 // * Zero Width Joiner
\r
6245 // * Equivalent to icu4c ZWJ.
\r
6247 // private static final int ZERO_WIDTH_JOINER_ = 0x200d;
\r
6250 * Properties in vector word 2
\r
6253 * 25..20 Line Break
\r
6254 * 19..15 Sentence Break
\r
6255 * 14..10 Word Break
\r
6256 * 9.. 5 Grapheme Cluster Break
\r
6257 * 4.. 0 Decomposition Type
\r
6259 private static final int LB_MASK = 0x03f00000;
\r
6260 private static final int LB_SHIFT = 20;
\r
6261 private static final int LB_VWORD = 2;
\r
6263 private static final int SB_MASK = 0x000f8000;
\r
6264 private static final int SB_SHIFT = 15;
\r
6266 private static final int WB_MASK = 0x00007c00;
\r
6267 private static final int WB_SHIFT = 10;
\r
6269 private static final int GCB_MASK = 0x000003e0;
\r
6270 private static final int GCB_SHIFT = 5;
\r
6273 * Integer properties mask for decomposition type.
\r
6274 * Equivalent to icu4c UPROPS_DT_MASK.
\r
6276 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
\r
6279 * Properties in vector word 0
\r
6281 * 31..24 DerivedAge version major/minor one nibble each
\r
6283 * 19..17 East Asian Width
\r
6284 * 16.. 8 UBlockCode
\r
6285 * 7.. 0 UScriptCode
\r
6289 * Integer properties mask and shift values for East Asian cell width.
\r
6290 * Equivalent to icu4c UPROPS_EA_MASK
\r
6292 private static final int EAST_ASIAN_MASK_ = 0x000e0000;
\r
6294 * Integer properties mask and shift values for East Asian cell width.
\r
6295 * Equivalent to icu4c UPROPS_EA_SHIFT
\r
6297 private static final int EAST_ASIAN_SHIFT_ = 17;
\r
6299 * Integer properties mask and shift values for blocks.
\r
6300 * Equivalent to icu4c UPROPS_BLOCK_MASK
\r
6302 private static final int BLOCK_MASK_ = 0x0001ff00;
\r
6304 * Integer properties mask and shift values for blocks.
\r
6305 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
\r
6307 private static final int BLOCK_SHIFT_ = 8;
\r
6309 * Integer properties mask and shift values for scripts.
\r
6310 * Equivalent to icu4c UPROPS_SHIFT_MASK
\r
6312 static final int SCRIPT_MASK_ = 0x000000ff;
\r
6314 // private constructor -----------------------------------------------
\r
6317 * Private constructor to prevent instantiation
\r
6319 private UCharacter()
\r
6323 // private methods ---------------------------------------------------
\r
6326 * Returns the digit values of characters like 'A' - 'Z', normal,
\r
6327 * half-width and full-width. This method assumes that the other digit
\r
6328 * characters are checked by the calling method.
\r
6329 * @param ch character to test
\r
6330 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
\r
6331 * its corresponding digit will be returned.
\r
6333 private static int getEuropeanDigit(int ch) {
\r
6334 if ((ch > 0x7a && ch < 0xff21)
\r
6335 || ch < 0x41 || (ch > 0x5a && ch < 0x61)
\r
6336 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
\r
6340 // ch >= 0x41 or ch < 0x61
\r
6341 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
\r
6344 if (ch <= 0xff3a) {
\r
6345 return ch + 10 - 0xff21;
\r
6347 // ch >= 0xff41 && ch <= 0xff5a
\r
6348 return ch + 10 - 0xff41;
\r
6352 * Returns the property value at the index.
\r
6353 * This is optimized.
\r
6354 * Note this is alittle different from CharTrie the index m_trieData_
\r
6355 * is never negative.
\r
6356 * This is a duplicate of UCharacterProperty.getProperty. For optimization
\r
6357 * purposes, this method calls the trie data directly instead of through
\r
6358 * UCharacterProperty.getProperty.
\r
6359 * @param ch code point whose property value is to be retrieved
\r
6360 * @return property value of code point
\r
6363 private static final int getProperty(int ch)
\r
6365 if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
\r
6366 || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
\r
6367 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
\r
6368 // BMP codepoint 0000..D7FF or DC00..FFFF
\r
6369 try { // using try for ch < 0 is faster than using an if statement
\r
6370 return UCharacterProperty.INSTANCE.m_trieData_[
\r
6371 (UCharacterProperty.INSTANCE.m_trieIndex_[ch >> 5] << 2)
\r
6373 } catch (ArrayIndexOutOfBoundsException e) {
\r
6374 // TODO: Tested all the values from 0 ... UTF16.LEAD_SURROGATE_MIN_VALUE
\r
6375 // and UTF16.LEAD_SURROGATE_MAX_VALUE ... UTF16.SUPPLEMENTARY_MIN_VALUE
\r
6376 // but it never results into the catch section of the try-catch
\r
6378 return UCharacterProperty.INSTANCE.m_trieInitialValue_;
\r
6382 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
6383 // lead surrogate D800..DBFF
\r
6384 return UCharacterProperty.INSTANCE.m_trieData_[
\r
6385 (UCharacterProperty.INSTANCE.m_trieIndex_[(0x2800 >> 5) +
\r
6389 // for optimization
\r
6390 if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
\r
6391 // supplementary code point 10000..10FFFF
\r
6392 // look at the construction of supplementary characters
\r
6393 // trail forms the ends of it.
\r
6394 return UCharacterProperty.INSTANCE.m_trie_.getSurrogateValue(
\r
6395 UTF16.getLeadSurrogate(ch),
\r
6396 (char)(ch & 0x3ff));
\r
6398 // return m_dataOffset_ if there is an error, in this case we return
\r
6399 // the default value: m_initialValue_
\r
6400 // we cannot assume that m_initialValue_ is at offset 0
\r
6401 // this is for optimization.
\r
6402 return UCharacterProperty.INSTANCE.m_trieInitialValue_;
\r