3 *******************************************************************************
4 * Copyright (C) 1996-2011, International Business Machines Corporation and *
5 * others. All Rights Reserved. *
6 *******************************************************************************
9 package com.ibm.icu.lang;
11 import java.lang.ref.SoftReference;
12 import java.util.HashMap;
13 import java.util.Iterator;
14 import java.util.Locale;
17 import com.ibm.icu.impl.IllegalIcuArgumentException;
18 import com.ibm.icu.impl.Norm2AllModes;
19 import com.ibm.icu.impl.Normalizer2Impl;
20 import com.ibm.icu.impl.Trie2;
21 import com.ibm.icu.impl.UBiDiProps;
22 import com.ibm.icu.impl.UCaseProps;
23 import com.ibm.icu.impl.UCharacterName;
24 import com.ibm.icu.impl.UCharacterNameChoice;
25 import com.ibm.icu.impl.UCharacterProperty;
26 import com.ibm.icu.impl.UCharacterUtility;
27 import com.ibm.icu.impl.UPropertyAliases;
28 import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
29 import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection;
30 import com.ibm.icu.text.BreakIterator;
31 import com.ibm.icu.text.UTF16;
32 import com.ibm.icu.util.RangeValueIterator;
33 import com.ibm.icu.util.ULocale;
34 import com.ibm.icu.util.ValueIterator;
35 import com.ibm.icu.util.VersionInfo;
38 * {@icuenhanced java.lang.Character}.{@icu _usage_}
40 * <p>The UCharacter class provides extensions to the
41 * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
42 * java.lang.Character</a> class. These extensions provide support for
43 * more Unicode properties and together with the <a href="../text/UTF16.html">UTF16</a>
44 * class, provide support for supplementary characters (those with code
45 * points above U+FFFF).
46 * Each ICU release supports the latest version of Unicode available at that time.
48 * <p>Code points are represented in these API using ints. While it would be
49 * more convenient in Java to have a separate primitive datatype for them,
50 * ints suffice in the meantime.
52 * <p>To use this class please add the jar file name icu4j.jar to the
53 * class path, since it contains data files which supply the information used
55 * E.g. In Windows <br>
56 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br>
57 * Otherwise, another method would be to copy the files uprops.dat and
58 * unames.icu from the icu4j source subdirectory
59 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
60 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
62 * <p>Aside from the additions for UTF-16 support, and the updated Unicode
63 * properties, the main differences between UCharacter and Character are:
65 * <li> UCharacter is not designed to be a char wrapper and does not have
66 * APIs which involve management of that single char.<br>
69 * <li> char charValue(),
70 * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
72 * <li> UCharacter does not include Character APIs that are deprecated, nor
73 * does it include the Java-specific character information, such as
74 * boolean isJavaIdentifierPart(char ch).
75 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
76 * values '10' - '35'. UCharacter also does this in digit and
77 * getNumericValue, to adhere to the java semantics of these
78 * methods. New methods unicodeDigit, and
79 * getUnicodeNumericValue do not treat the above code points
80 * as having numeric values. This is a semantic change from ICU4J 1.3.1.
83 * Further detail on differences can be determined using the program
85 * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
86 * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
89 * In addition to Java compatibility functions, which calculate derived properties,
90 * this API provides low-level access to the Unicode Character Database.
93 * Unicode assigns each code point (not just assigned character) values for
95 * Most of them are simple boolean flags, or constants from a small enumerated list.
96 * For some properties, values are strings or other relatively more complex types.
99 * For more information see
100 * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
101 * (http://www.unicode.org/ucd/)
102 * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
103 * User Guide chapter on Properties</a>
104 * (http://www.icu-project.org/userguide/properties.html).
107 * There are also functions that provide easy migration from C/POSIX functions
108 * like isblank(). Their use is generally discouraged because the C/POSIX
109 * standards do not define their semantics beyond the ASCII range, which means
110 * that different implementations exhibit very different behavior.
111 * Instead, Unicode properties should be used directly.
114 * There are also only a few, broad C/POSIX character classes, and they tend
115 * to be used for conflicting purposes. For example, the "isalpha()" class
116 * is sometimes used to determine word boundaries, while a more sophisticated
117 * approach would at least distinguish initial letters from continuation
118 * characters (the latter including combining marks).
119 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
120 * Another example: There is no "istitle()" class for titlecase characters.
123 * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
124 * ICU implements them according to the Standard Recommendations in
125 * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
126 * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
129 * API access for C/POSIX character classes is as follows:
131 * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
132 * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
133 * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
134 * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
135 * (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
136 * (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
137 * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
138 * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
139 * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
140 * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
141 * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
142 * - cntrl: getType(c)==CONTROL
143 * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
144 * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)}</pre>
147 * The C/POSIX character classes are also available in UnicodeSet patterns,
148 * using patterns like [:graph:] or \p{graph}.
151 * {@icunote} There are several ICU (and Java) whitespace functions.
153 * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
154 * most of general categories "Z" (separators) + most whitespace ISO controls
155 * (including no-break spaces, but excluding IS1..IS4 and ZWSP)
156 * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
157 * <li> isSpaceChar: just Z (including no-break spaces)</ul>
160 * This class is not subclassable.
162 * @author Syn Wee Quek
164 * @see com.ibm.icu.lang.UCharacterEnums
167 public final class UCharacter implements ECharacterCategory, ECharacterDirection
169 // public inner classes ----------------------------------------------
172 * {@icuenhanced java.lang.Character.UnicodeBlock}.{@icu _usage_}
174 * A family of character subsets representing the character blocks in the
175 * Unicode specification, generated from Unicode Data file Blocks.txt.
176 * Character blocks generally define characters used for a specific script
177 * or purpose. A character is contained by at most one Unicode block.
179 * {@icunote} All fields named XXX_ID are specific to ICU.
183 public static final class UnicodeBlock extends Character.Subset
185 // block id corresponding to icu4c -----------------------------------
190 public static final int INVALID_CODE_ID = -1;
194 public static final int BASIC_LATIN_ID = 1;
198 public static final int LATIN_1_SUPPLEMENT_ID = 2;
202 public static final int LATIN_EXTENDED_A_ID = 3;
206 public static final int LATIN_EXTENDED_B_ID = 4;
210 public static final int IPA_EXTENSIONS_ID = 5;
214 public static final int SPACING_MODIFIER_LETTERS_ID = 6;
218 public static final int COMBINING_DIACRITICAL_MARKS_ID = 7;
220 * Unicode 3.2 renames this block to "Greek and Coptic".
223 public static final int GREEK_ID = 8;
227 public static final int CYRILLIC_ID = 9;
231 public static final int ARMENIAN_ID = 10;
235 public static final int HEBREW_ID = 11;
239 public static final int ARABIC_ID = 12;
243 public static final int SYRIAC_ID = 13;
247 public static final int THAANA_ID = 14;
251 public static final int DEVANAGARI_ID = 15;
255 public static final int BENGALI_ID = 16;
259 public static final int GURMUKHI_ID = 17;
263 public static final int GUJARATI_ID = 18;
267 public static final int ORIYA_ID = 19;
271 public static final int TAMIL_ID = 20;
275 public static final int TELUGU_ID = 21;
279 public static final int KANNADA_ID = 22;
283 public static final int MALAYALAM_ID = 23;
287 public static final int SINHALA_ID = 24;
291 public static final int THAI_ID = 25;
295 public static final int LAO_ID = 26;
299 public static final int TIBETAN_ID = 27;
303 public static final int MYANMAR_ID = 28;
307 public static final int GEORGIAN_ID = 29;
311 public static final int HANGUL_JAMO_ID = 30;
315 public static final int ETHIOPIC_ID = 31;
319 public static final int CHEROKEE_ID = 32;
323 public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID = 33;
327 public static final int OGHAM_ID = 34;
331 public static final int RUNIC_ID = 35;
335 public static final int KHMER_ID = 36;
339 public static final int MONGOLIAN_ID = 37;
343 public static final int LATIN_EXTENDED_ADDITIONAL_ID = 38;
347 public static final int GREEK_EXTENDED_ID = 39;
351 public static final int GENERAL_PUNCTUATION_ID = 40;
355 public static final int SUPERSCRIPTS_AND_SUBSCRIPTS_ID = 41;
359 public static final int CURRENCY_SYMBOLS_ID = 42;
361 * Unicode 3.2 renames this block to "Combining Diacritical Marks for
365 public static final int COMBINING_MARKS_FOR_SYMBOLS_ID = 43;
369 public static final int LETTERLIKE_SYMBOLS_ID = 44;
373 public static final int NUMBER_FORMS_ID = 45;
377 public static final int ARROWS_ID = 46;
381 public static final int MATHEMATICAL_OPERATORS_ID = 47;
385 public static final int MISCELLANEOUS_TECHNICAL_ID = 48;
389 public static final int CONTROL_PICTURES_ID = 49;
393 public static final int OPTICAL_CHARACTER_RECOGNITION_ID = 50;
397 public static final int ENCLOSED_ALPHANUMERICS_ID = 51;
401 public static final int BOX_DRAWING_ID = 52;
405 public static final int BLOCK_ELEMENTS_ID = 53;
409 public static final int GEOMETRIC_SHAPES_ID = 54;
413 public static final int MISCELLANEOUS_SYMBOLS_ID = 55;
417 public static final int DINGBATS_ID = 56;
421 public static final int BRAILLE_PATTERNS_ID = 57;
425 public static final int CJK_RADICALS_SUPPLEMENT_ID = 58;
429 public static final int KANGXI_RADICALS_ID = 59;
433 public static final int IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID = 60;
437 public static final int CJK_SYMBOLS_AND_PUNCTUATION_ID = 61;
441 public static final int HIRAGANA_ID = 62;
445 public static final int KATAKANA_ID = 63;
449 public static final int BOPOMOFO_ID = 64;
453 public static final int HANGUL_COMPATIBILITY_JAMO_ID = 65;
457 public static final int KANBUN_ID = 66;
461 public static final int BOPOMOFO_EXTENDED_ID = 67;
465 public static final int ENCLOSED_CJK_LETTERS_AND_MONTHS_ID = 68;
469 public static final int CJK_COMPATIBILITY_ID = 69;
473 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID = 70;
477 public static final int CJK_UNIFIED_IDEOGRAPHS_ID = 71;
481 public static final int YI_SYLLABLES_ID = 72;
485 public static final int YI_RADICALS_ID = 73;
489 public static final int HANGUL_SYLLABLES_ID = 74;
493 public static final int HIGH_SURROGATES_ID = 75;
497 public static final int HIGH_PRIVATE_USE_SURROGATES_ID = 76;
501 public static final int LOW_SURROGATES_ID = 77;
503 * Same as public static final int PRIVATE_USE_ID.
504 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
505 * and multiple code point ranges had this block.
506 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
507 * and adds separate blocks for the supplementary PUAs.
510 public static final int PRIVATE_USE_AREA_ID = 78;
512 * Same as public static final int PRIVATE_USE_AREA_ID.
513 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
514 * and multiple code point ranges had this block.
515 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
516 * and adds separate blocks for the supplementary PUAs.
519 public static final int PRIVATE_USE_ID = PRIVATE_USE_AREA_ID;
523 public static final int CJK_COMPATIBILITY_IDEOGRAPHS_ID = 79;
527 public static final int ALPHABETIC_PRESENTATION_FORMS_ID = 80;
531 public static final int ARABIC_PRESENTATION_FORMS_A_ID = 81;
535 public static final int COMBINING_HALF_MARKS_ID = 82;
539 public static final int CJK_COMPATIBILITY_FORMS_ID = 83;
543 public static final int SMALL_FORM_VARIANTS_ID = 84;
547 public static final int ARABIC_PRESENTATION_FORMS_B_ID = 85;
551 public static final int SPECIALS_ID = 86;
555 public static final int HALFWIDTH_AND_FULLWIDTH_FORMS_ID = 87;
559 public static final int OLD_ITALIC_ID = 88;
563 public static final int GOTHIC_ID = 89;
567 public static final int DESERET_ID = 90;
571 public static final int BYZANTINE_MUSICAL_SYMBOLS_ID = 91;
575 public static final int MUSICAL_SYMBOLS_ID = 92;
579 public static final int MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID = 93;
583 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID = 94;
587 public static final int
588 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID = 95;
592 public static final int TAGS_ID = 96;
594 // New blocks in Unicode 3.2
597 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
600 public static final int CYRILLIC_SUPPLEMENTARY_ID = 97;
602 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
606 public static final int CYRILLIC_SUPPLEMENT_ID = 97;
610 public static final int TAGALOG_ID = 98;
614 public static final int HANUNOO_ID = 99;
618 public static final int BUHID_ID = 100;
622 public static final int TAGBANWA_ID = 101;
626 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID = 102;
630 public static final int SUPPLEMENTAL_ARROWS_A_ID = 103;
634 public static final int SUPPLEMENTAL_ARROWS_B_ID = 104;
638 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID = 105;
642 public static final int SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID = 106;
646 public static final int KATAKANA_PHONETIC_EXTENSIONS_ID = 107;
650 public static final int VARIATION_SELECTORS_ID = 108;
654 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID = 109;
658 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID = 110;
663 public static final int LIMBU_ID = 111; /*[1900]*/
667 public static final int TAI_LE_ID = 112; /*[1950]*/
671 public static final int KHMER_SYMBOLS_ID = 113; /*[19E0]*/
675 public static final int PHONETIC_EXTENSIONS_ID = 114; /*[1D00]*/
679 public static final int MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID = 115; /*[2B00]*/
683 public static final int YIJING_HEXAGRAM_SYMBOLS_ID = 116; /*[4DC0]*/
687 public static final int LINEAR_B_SYLLABARY_ID = 117; /*[10000]*/
691 public static final int LINEAR_B_IDEOGRAMS_ID = 118; /*[10080]*/
695 public static final int AEGEAN_NUMBERS_ID = 119; /*[10100]*/
699 public static final int UGARITIC_ID = 120; /*[10380]*/
703 public static final int SHAVIAN_ID = 121; /*[10450]*/
707 public static final int OSMANYA_ID = 122; /*[10480]*/
711 public static final int CYPRIOT_SYLLABARY_ID = 123; /*[10800]*/
715 public static final int TAI_XUAN_JING_SYMBOLS_ID = 124; /*[1D300]*/
719 public static final int VARIATION_SELECTORS_SUPPLEMENT_ID = 125; /*[E0100]*/
721 /* New blocks in Unicode 4.1 */
726 public static final int ANCIENT_GREEK_MUSICAL_NOTATION_ID = 126; /*[1D200]*/
731 public static final int ANCIENT_GREEK_NUMBERS_ID = 127; /*[10140]*/
736 public static final int ARABIC_SUPPLEMENT_ID = 128; /*[0750]*/
741 public static final int BUGINESE_ID = 129; /*[1A00]*/
746 public static final int CJK_STROKES_ID = 130; /*[31C0]*/
751 public static final int COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID = 131; /*[1DC0]*/
756 public static final int COPTIC_ID = 132; /*[2C80]*/
761 public static final int ETHIOPIC_EXTENDED_ID = 133; /*[2D80]*/
766 public static final int ETHIOPIC_SUPPLEMENT_ID = 134; /*[1380]*/
771 public static final int GEORGIAN_SUPPLEMENT_ID = 135; /*[2D00]*/
776 public static final int GLAGOLITIC_ID = 136; /*[2C00]*/
781 public static final int KHAROSHTHI_ID = 137; /*[10A00]*/
786 public static final int MODIFIER_TONE_LETTERS_ID = 138; /*[A700]*/
791 public static final int NEW_TAI_LUE_ID = 139; /*[1980]*/
796 public static final int OLD_PERSIAN_ID = 140; /*[103A0]*/
801 public static final int PHONETIC_EXTENSIONS_SUPPLEMENT_ID = 141; /*[1D80]*/
806 public static final int SUPPLEMENTAL_PUNCTUATION_ID = 142; /*[2E00]*/
811 public static final int SYLOTI_NAGRI_ID = 143; /*[A800]*/
816 public static final int TIFINAGH_ID = 144; /*[2D30]*/
821 public static final int VERTICAL_FORMS_ID = 145; /*[FE10]*/
823 /* New blocks in Unicode 5.0 */
828 public static final int NKO_ID = 146; /*[07C0]*/
832 public static final int BALINESE_ID = 147; /*[1B00]*/
836 public static final int LATIN_EXTENDED_C_ID = 148; /*[2C60]*/
840 public static final int LATIN_EXTENDED_D_ID = 149; /*[A720]*/
844 public static final int PHAGS_PA_ID = 150; /*[A840]*/
848 public static final int PHOENICIAN_ID = 151; /*[10900]*/
852 public static final int CUNEIFORM_ID = 152; /*[12000]*/
856 public static final int CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID = 153; /*[12400]*/
860 public static final int COUNTING_ROD_NUMERALS_ID = 154; /*[1D360]*/
865 public static final int SUNDANESE_ID = 155; /* [1B80] */
870 public static final int LEPCHA_ID = 156; /* [1C00] */
875 public static final int OL_CHIKI_ID = 157; /* [1C50] */
880 public static final int CYRILLIC_EXTENDED_A_ID = 158; /* [2DE0] */
885 public static final int VAI_ID = 159; /* [A500] */
890 public static final int CYRILLIC_EXTENDED_B_ID = 160; /* [A640] */
895 public static final int SAURASHTRA_ID = 161; /* [A880] */
900 public static final int KAYAH_LI_ID = 162; /* [A900] */
905 public static final int REJANG_ID = 163; /* [A930] */
910 public static final int CHAM_ID = 164; /* [AA00] */
915 public static final int ANCIENT_SYMBOLS_ID = 165; /* [10190] */
920 public static final int PHAISTOS_DISC_ID = 166; /* [101D0] */
925 public static final int LYCIAN_ID = 167; /* [10280] */
930 public static final int CARIAN_ID = 168; /* [102A0] */
935 public static final int LYDIAN_ID = 169; /* [10920] */
940 public static final int MAHJONG_TILES_ID = 170; /* [1F000] */
945 public static final int DOMINO_TILES_ID = 171; /* [1F030] */
947 /* New blocks in Unicode 5.2 */
949 /** @stable ICU 4.4 */
950 public static final int SAMARITAN_ID = 172; /*[0800]*/
951 /** @stable ICU 4.4 */
952 public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_ID = 173; /*[18B0]*/
953 /** @stable ICU 4.4 */
954 public static final int TAI_THAM_ID = 174; /*[1A20]*/
955 /** @stable ICU 4.4 */
956 public static final int VEDIC_EXTENSIONS_ID = 175; /*[1CD0]*/
957 /** @stable ICU 4.4 */
958 public static final int LISU_ID = 176; /*[A4D0]*/
959 /** @stable ICU 4.4 */
960 public static final int BAMUM_ID = 177; /*[A6A0]*/
961 /** @stable ICU 4.4 */
962 public static final int COMMON_INDIC_NUMBER_FORMS_ID = 178; /*[A830]*/
963 /** @stable ICU 4.4 */
964 public static final int DEVANAGARI_EXTENDED_ID = 179; /*[A8E0]*/
965 /** @stable ICU 4.4 */
966 public static final int HANGUL_JAMO_EXTENDED_A_ID = 180; /*[A960]*/
967 /** @stable ICU 4.4 */
968 public static final int JAVANESE_ID = 181; /*[A980]*/
969 /** @stable ICU 4.4 */
970 public static final int MYANMAR_EXTENDED_A_ID = 182; /*[AA60]*/
971 /** @stable ICU 4.4 */
972 public static final int TAI_VIET_ID = 183; /*[AA80]*/
973 /** @stable ICU 4.4 */
974 public static final int MEETEI_MAYEK_ID = 184; /*[ABC0]*/
975 /** @stable ICU 4.4 */
976 public static final int HANGUL_JAMO_EXTENDED_B_ID = 185; /*[D7B0]*/
977 /** @stable ICU 4.4 */
978 public static final int IMPERIAL_ARAMAIC_ID = 186; /*[10840]*/
979 /** @stable ICU 4.4 */
980 public static final int OLD_SOUTH_ARABIAN_ID = 187; /*[10A60]*/
981 /** @stable ICU 4.4 */
982 public static final int AVESTAN_ID = 188; /*[10B00]*/
983 /** @stable ICU 4.4 */
984 public static final int INSCRIPTIONAL_PARTHIAN_ID = 189; /*[10B40]*/
985 /** @stable ICU 4.4 */
986 public static final int INSCRIPTIONAL_PAHLAVI_ID = 190; /*[10B60]*/
987 /** @stable ICU 4.4 */
988 public static final int OLD_TURKIC_ID = 191; /*[10C00]*/
989 /** @stable ICU 4.4 */
990 public static final int RUMI_NUMERAL_SYMBOLS_ID = 192; /*[10E60]*/
991 /** @stable ICU 4.4 */
992 public static final int KAITHI_ID = 193; /*[11080]*/
993 /** @stable ICU 4.4 */
994 public static final int EGYPTIAN_HIEROGLYPHS_ID = 194; /*[13000]*/
995 /** @stable ICU 4.4 */
996 public static final int ENCLOSED_ALPHANUMERIC_SUPPLEMENT_ID = 195; /*[1F100]*/
997 /** @stable ICU 4.4 */
998 public static final int ENCLOSED_IDEOGRAPHIC_SUPPLEMENT_ID = 196; /*[1F200]*/
999 /** @stable ICU 4.4 */
1000 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_ID = 197; /*[2A700]*/
1002 /* New blocks in Unicode 6.0 */
1004 /** @stable ICU 4.6 */
1005 public static final int MANDAIC_ID = 198; /*[0840]*/
1006 /** @stable ICU 4.6 */
1007 public static final int BATAK_ID = 199; /*[1BC0]*/
1008 /** @stable ICU 4.6 */
1009 public static final int ETHIOPIC_EXTENDED_A_ID = 200; /*[AB00]*/
1010 /** @stable ICU 4.6 */
1011 public static final int BRAHMI_ID = 201; /*[11000]*/
1012 /** @stable ICU 4.6 */
1013 public static final int BAMUM_SUPPLEMENT_ID = 202; /*[16800]*/
1014 /** @stable ICU 4.6 */
1015 public static final int KANA_SUPPLEMENT_ID = 203; /*[1B000]*/
1016 /** @stable ICU 4.6 */
1017 public static final int PLAYING_CARDS_ID = 204; /*[1F0A0]*/
1018 /** @stable ICU 4.6 */
1019 public static final int MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS_ID = 205; /*[1F300]*/
1020 /** @stable ICU 4.6 */
1021 public static final int EMOTICONS_ID = 206; /*[1F600]*/
1022 /** @stable ICU 4.6 */
1023 public static final int TRANSPORT_AND_MAP_SYMBOLS_ID = 207; /*[1F680]*/
1024 /** @stable ICU 4.6 */
1025 public static final int ALCHEMICAL_SYMBOLS_ID = 208; /*[1F700]*/
1026 /** @stable ICU 4.6 */
1027 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_ID = 209; /*[2B740]*/
1032 public static final int COUNT = 210;
1034 // blocks objects ---------------------------------------------------
1037 * Array of UnicodeBlocks, for easy access in getInstance(int)
1039 private final static UnicodeBlock BLOCKS_[] = new UnicodeBlock[COUNT];
1044 public static final UnicodeBlock NO_BLOCK
1045 = new UnicodeBlock("NO_BLOCK", 0);
1050 public static final UnicodeBlock BASIC_LATIN
1051 = new UnicodeBlock("BASIC_LATIN", BASIC_LATIN_ID);
1055 public static final UnicodeBlock LATIN_1_SUPPLEMENT
1056 = new UnicodeBlock("LATIN_1_SUPPLEMENT", LATIN_1_SUPPLEMENT_ID);
1060 public static final UnicodeBlock LATIN_EXTENDED_A
1061 = new UnicodeBlock("LATIN_EXTENDED_A", LATIN_EXTENDED_A_ID);
1065 public static final UnicodeBlock LATIN_EXTENDED_B
1066 = new UnicodeBlock("LATIN_EXTENDED_B", LATIN_EXTENDED_B_ID);
1070 public static final UnicodeBlock IPA_EXTENSIONS
1071 = new UnicodeBlock("IPA_EXTENSIONS", IPA_EXTENSIONS_ID);
1075 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
1076 = new UnicodeBlock("SPACING_MODIFIER_LETTERS", SPACING_MODIFIER_LETTERS_ID);
1080 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
1081 = new UnicodeBlock("COMBINING_DIACRITICAL_MARKS", COMBINING_DIACRITICAL_MARKS_ID);
1083 * Unicode 3.2 renames this block to "Greek and Coptic".
1086 public static final UnicodeBlock GREEK
1087 = new UnicodeBlock("GREEK", GREEK_ID);
1091 public static final UnicodeBlock CYRILLIC
1092 = new UnicodeBlock("CYRILLIC", CYRILLIC_ID);
1096 public static final UnicodeBlock ARMENIAN
1097 = new UnicodeBlock("ARMENIAN", ARMENIAN_ID);
1101 public static final UnicodeBlock HEBREW
1102 = new UnicodeBlock("HEBREW", HEBREW_ID);
1106 public static final UnicodeBlock ARABIC
1107 = new UnicodeBlock("ARABIC", ARABIC_ID);
1111 public static final UnicodeBlock SYRIAC
1112 = new UnicodeBlock("SYRIAC", SYRIAC_ID);
1116 public static final UnicodeBlock THAANA
1117 = new UnicodeBlock("THAANA", THAANA_ID);
1121 public static final UnicodeBlock DEVANAGARI
1122 = new UnicodeBlock("DEVANAGARI", DEVANAGARI_ID);
1126 public static final UnicodeBlock BENGALI
1127 = new UnicodeBlock("BENGALI", BENGALI_ID);
1131 public static final UnicodeBlock GURMUKHI
1132 = new UnicodeBlock("GURMUKHI", GURMUKHI_ID);
1136 public static final UnicodeBlock GUJARATI
1137 = new UnicodeBlock("GUJARATI", GUJARATI_ID);
1141 public static final UnicodeBlock ORIYA
1142 = new UnicodeBlock("ORIYA", ORIYA_ID);
1146 public static final UnicodeBlock TAMIL
1147 = new UnicodeBlock("TAMIL", TAMIL_ID);
1151 public static final UnicodeBlock TELUGU
1152 = new UnicodeBlock("TELUGU", TELUGU_ID);
1156 public static final UnicodeBlock KANNADA
1157 = new UnicodeBlock("KANNADA", KANNADA_ID);
1161 public static final UnicodeBlock MALAYALAM
1162 = new UnicodeBlock("MALAYALAM", MALAYALAM_ID);
1166 public static final UnicodeBlock SINHALA
1167 = new UnicodeBlock("SINHALA", SINHALA_ID);
1171 public static final UnicodeBlock THAI
1172 = new UnicodeBlock("THAI", THAI_ID);
1176 public static final UnicodeBlock LAO
1177 = new UnicodeBlock("LAO", LAO_ID);
1181 public static final UnicodeBlock TIBETAN
1182 = new UnicodeBlock("TIBETAN", TIBETAN_ID);
1186 public static final UnicodeBlock MYANMAR
1187 = new UnicodeBlock("MYANMAR", MYANMAR_ID);
1191 public static final UnicodeBlock GEORGIAN
1192 = new UnicodeBlock("GEORGIAN", GEORGIAN_ID);
1196 public static final UnicodeBlock HANGUL_JAMO
1197 = new UnicodeBlock("HANGUL_JAMO", HANGUL_JAMO_ID);
1201 public static final UnicodeBlock ETHIOPIC
1202 = new UnicodeBlock("ETHIOPIC", ETHIOPIC_ID);
1206 public static final UnicodeBlock CHEROKEE
1207 = new UnicodeBlock("CHEROKEE", CHEROKEE_ID);
1211 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
1212 = new UnicodeBlock("UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS",
1213 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID);
1217 public static final UnicodeBlock OGHAM
1218 = new UnicodeBlock("OGHAM", OGHAM_ID);
1222 public static final UnicodeBlock RUNIC
1223 = new UnicodeBlock("RUNIC", RUNIC_ID);
1227 public static final UnicodeBlock KHMER
1228 = new UnicodeBlock("KHMER", KHMER_ID);
1232 public static final UnicodeBlock MONGOLIAN
1233 = new UnicodeBlock("MONGOLIAN", MONGOLIAN_ID);
1237 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
1238 = new UnicodeBlock("LATIN_EXTENDED_ADDITIONAL", LATIN_EXTENDED_ADDITIONAL_ID);
1242 public static final UnicodeBlock GREEK_EXTENDED
1243 = new UnicodeBlock("GREEK_EXTENDED", GREEK_EXTENDED_ID);
1247 public static final UnicodeBlock GENERAL_PUNCTUATION
1248 = new UnicodeBlock("GENERAL_PUNCTUATION", GENERAL_PUNCTUATION_ID);
1252 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
1253 = new UnicodeBlock("SUPERSCRIPTS_AND_SUBSCRIPTS", SUPERSCRIPTS_AND_SUBSCRIPTS_ID);
1257 public static final UnicodeBlock CURRENCY_SYMBOLS
1258 = new UnicodeBlock("CURRENCY_SYMBOLS", CURRENCY_SYMBOLS_ID);
1260 * Unicode 3.2 renames this block to "Combining Diacritical Marks for
1264 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
1265 = new UnicodeBlock("COMBINING_MARKS_FOR_SYMBOLS", COMBINING_MARKS_FOR_SYMBOLS_ID);
1269 public static final UnicodeBlock LETTERLIKE_SYMBOLS
1270 = new UnicodeBlock("LETTERLIKE_SYMBOLS", LETTERLIKE_SYMBOLS_ID);
1274 public static final UnicodeBlock NUMBER_FORMS
1275 = new UnicodeBlock("NUMBER_FORMS", NUMBER_FORMS_ID);
1279 public static final UnicodeBlock ARROWS
1280 = new UnicodeBlock("ARROWS", ARROWS_ID);
1284 public static final UnicodeBlock MATHEMATICAL_OPERATORS
1285 = new UnicodeBlock("MATHEMATICAL_OPERATORS", MATHEMATICAL_OPERATORS_ID);
1289 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
1290 = new UnicodeBlock("MISCELLANEOUS_TECHNICAL", MISCELLANEOUS_TECHNICAL_ID);
1294 public static final UnicodeBlock CONTROL_PICTURES
1295 = new UnicodeBlock("CONTROL_PICTURES", CONTROL_PICTURES_ID);
1299 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
1300 = new UnicodeBlock("OPTICAL_CHARACTER_RECOGNITION", OPTICAL_CHARACTER_RECOGNITION_ID);
1304 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
1305 = new UnicodeBlock("ENCLOSED_ALPHANUMERICS", ENCLOSED_ALPHANUMERICS_ID);
1309 public static final UnicodeBlock BOX_DRAWING
1310 = new UnicodeBlock("BOX_DRAWING", BOX_DRAWING_ID);
1314 public static final UnicodeBlock BLOCK_ELEMENTS
1315 = new UnicodeBlock("BLOCK_ELEMENTS", BLOCK_ELEMENTS_ID);
1319 public static final UnicodeBlock GEOMETRIC_SHAPES
1320 = new UnicodeBlock("GEOMETRIC_SHAPES", GEOMETRIC_SHAPES_ID);
1324 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
1325 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS", MISCELLANEOUS_SYMBOLS_ID);
1329 public static final UnicodeBlock DINGBATS
1330 = new UnicodeBlock("DINGBATS", DINGBATS_ID);
1334 public static final UnicodeBlock BRAILLE_PATTERNS
1335 = new UnicodeBlock("BRAILLE_PATTERNS", BRAILLE_PATTERNS_ID);
1339 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
1340 = new UnicodeBlock("CJK_RADICALS_SUPPLEMENT", CJK_RADICALS_SUPPLEMENT_ID);
1344 public static final UnicodeBlock KANGXI_RADICALS
1345 = new UnicodeBlock("KANGXI_RADICALS", KANGXI_RADICALS_ID);
1349 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
1350 = new UnicodeBlock("IDEOGRAPHIC_DESCRIPTION_CHARACTERS",
1351 IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID);
1355 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
1356 = new UnicodeBlock("CJK_SYMBOLS_AND_PUNCTUATION", CJK_SYMBOLS_AND_PUNCTUATION_ID);
1360 public static final UnicodeBlock HIRAGANA
1361 = new UnicodeBlock("HIRAGANA", HIRAGANA_ID);
1365 public static final UnicodeBlock KATAKANA
1366 = new UnicodeBlock("KATAKANA", KATAKANA_ID);
1370 public static final UnicodeBlock BOPOMOFO
1371 = new UnicodeBlock("BOPOMOFO", BOPOMOFO_ID);
1375 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
1376 = new UnicodeBlock("HANGUL_COMPATIBILITY_JAMO", HANGUL_COMPATIBILITY_JAMO_ID);
1380 public static final UnicodeBlock KANBUN
1381 = new UnicodeBlock("KANBUN", KANBUN_ID);
1385 public static final UnicodeBlock BOPOMOFO_EXTENDED
1386 = new UnicodeBlock("BOPOMOFO_EXTENDED", BOPOMOFO_EXTENDED_ID);
1390 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
1391 = new UnicodeBlock("ENCLOSED_CJK_LETTERS_AND_MONTHS",
1392 ENCLOSED_CJK_LETTERS_AND_MONTHS_ID);
1396 public static final UnicodeBlock CJK_COMPATIBILITY
1397 = new UnicodeBlock("CJK_COMPATIBILITY", CJK_COMPATIBILITY_ID);
1401 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1402 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A",
1403 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID);
1407 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
1408 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS", CJK_UNIFIED_IDEOGRAPHS_ID);
1412 public static final UnicodeBlock YI_SYLLABLES
1413 = new UnicodeBlock("YI_SYLLABLES", YI_SYLLABLES_ID);
1417 public static final UnicodeBlock YI_RADICALS
1418 = new UnicodeBlock("YI_RADICALS", YI_RADICALS_ID);
1422 public static final UnicodeBlock HANGUL_SYLLABLES
1423 = new UnicodeBlock("HANGUL_SYLLABLES", HANGUL_SYLLABLES_ID);
1427 public static final UnicodeBlock HIGH_SURROGATES
1428 = new UnicodeBlock("HIGH_SURROGATES", HIGH_SURROGATES_ID);
1432 public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
1433 = new UnicodeBlock("HIGH_PRIVATE_USE_SURROGATES", HIGH_PRIVATE_USE_SURROGATES_ID);
1437 public static final UnicodeBlock LOW_SURROGATES
1438 = new UnicodeBlock("LOW_SURROGATES", LOW_SURROGATES_ID);
1440 * Same as public static final int PRIVATE_USE.
1441 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
1442 * and multiple code point ranges had this block.
1443 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
1444 * and adds separate blocks for the supplementary PUAs.
1447 public static final UnicodeBlock PRIVATE_USE_AREA
1448 = new UnicodeBlock("PRIVATE_USE_AREA", 78);
1450 * Same as public static final int PRIVATE_USE_AREA.
1451 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
1452 * and multiple code point ranges had this block.
1453 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
1454 * and adds separate blocks for the supplementary PUAs.
1457 public static final UnicodeBlock PRIVATE_USE
1462 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
1463 = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS", CJK_COMPATIBILITY_IDEOGRAPHS_ID);
1467 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
1468 = new UnicodeBlock("ALPHABETIC_PRESENTATION_FORMS", ALPHABETIC_PRESENTATION_FORMS_ID);
1472 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
1473 = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_A", ARABIC_PRESENTATION_FORMS_A_ID);
1477 public static final UnicodeBlock COMBINING_HALF_MARKS
1478 = new UnicodeBlock("COMBINING_HALF_MARKS", COMBINING_HALF_MARKS_ID);
1482 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
1483 = new UnicodeBlock("CJK_COMPATIBILITY_FORMS", CJK_COMPATIBILITY_FORMS_ID);
1487 public static final UnicodeBlock SMALL_FORM_VARIANTS
1488 = new UnicodeBlock("SMALL_FORM_VARIANTS", SMALL_FORM_VARIANTS_ID);
1492 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
1493 = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_B", ARABIC_PRESENTATION_FORMS_B_ID);
1497 public static final UnicodeBlock SPECIALS
1498 = new UnicodeBlock("SPECIALS", SPECIALS_ID);
1502 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
1503 = new UnicodeBlock("HALFWIDTH_AND_FULLWIDTH_FORMS", HALFWIDTH_AND_FULLWIDTH_FORMS_ID);
1507 public static final UnicodeBlock OLD_ITALIC
1508 = new UnicodeBlock("OLD_ITALIC", OLD_ITALIC_ID);
1512 public static final UnicodeBlock GOTHIC
1513 = new UnicodeBlock("GOTHIC", GOTHIC_ID);
1517 public static final UnicodeBlock DESERET
1518 = new UnicodeBlock("DESERET", DESERET_ID);
1522 public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
1523 = new UnicodeBlock("BYZANTINE_MUSICAL_SYMBOLS", BYZANTINE_MUSICAL_SYMBOLS_ID);
1527 public static final UnicodeBlock MUSICAL_SYMBOLS
1528 = new UnicodeBlock("MUSICAL_SYMBOLS", MUSICAL_SYMBOLS_ID);
1532 public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
1533 = new UnicodeBlock("MATHEMATICAL_ALPHANUMERIC_SYMBOLS",
1534 MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID);
1538 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1539 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B",
1540 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID);
1544 public static final UnicodeBlock
1545 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
1546 = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT",
1547 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID);
1551 public static final UnicodeBlock TAGS
1552 = new UnicodeBlock("TAGS", TAGS_ID);
1554 // New blocks in Unicode 3.2
1557 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
1560 public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
1561 = new UnicodeBlock("CYRILLIC_SUPPLEMENTARY", CYRILLIC_SUPPLEMENTARY_ID);
1563 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
1566 public static final UnicodeBlock CYRILLIC_SUPPLEMENT
1567 = new UnicodeBlock("CYRILLIC_SUPPLEMENT", CYRILLIC_SUPPLEMENT_ID);
1571 public static final UnicodeBlock TAGALOG
1572 = new UnicodeBlock("TAGALOG", TAGALOG_ID);
1576 public static final UnicodeBlock HANUNOO
1577 = new UnicodeBlock("HANUNOO", HANUNOO_ID);
1581 public static final UnicodeBlock BUHID
1582 = new UnicodeBlock("BUHID", BUHID_ID);
1586 public static final UnicodeBlock TAGBANWA
1587 = new UnicodeBlock("TAGBANWA", TAGBANWA_ID);
1591 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
1592 = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A",
1593 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID);
1597 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
1598 = new UnicodeBlock("SUPPLEMENTAL_ARROWS_A", SUPPLEMENTAL_ARROWS_A_ID);
1602 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
1603 = new UnicodeBlock("SUPPLEMENTAL_ARROWS_B", SUPPLEMENTAL_ARROWS_B_ID);
1607 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
1608 = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B",
1609 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID);
1613 public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
1614 = new UnicodeBlock("SUPPLEMENTAL_MATHEMATICAL_OPERATORS",
1615 SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID);
1619 public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
1620 = new UnicodeBlock("KATAKANA_PHONETIC_EXTENSIONS", KATAKANA_PHONETIC_EXTENSIONS_ID);
1624 public static final UnicodeBlock VARIATION_SELECTORS
1625 = new UnicodeBlock("VARIATION_SELECTORS", VARIATION_SELECTORS_ID);
1629 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
1630 = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_A",
1631 SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID);
1635 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
1636 = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_B",
1637 SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID);
1642 public static final UnicodeBlock LIMBU
1643 = new UnicodeBlock("LIMBU", LIMBU_ID);
1647 public static final UnicodeBlock TAI_LE
1648 = new UnicodeBlock("TAI_LE", TAI_LE_ID);
1652 public static final UnicodeBlock KHMER_SYMBOLS
1653 = new UnicodeBlock("KHMER_SYMBOLS", KHMER_SYMBOLS_ID);
1658 public static final UnicodeBlock PHONETIC_EXTENSIONS
1659 = new UnicodeBlock("PHONETIC_EXTENSIONS", PHONETIC_EXTENSIONS_ID);
1664 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
1665 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS_AND_ARROWS",
1666 MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID);
1670 public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
1671 = new UnicodeBlock("YIJING_HEXAGRAM_SYMBOLS", YIJING_HEXAGRAM_SYMBOLS_ID);
1675 public static final UnicodeBlock LINEAR_B_SYLLABARY
1676 = new UnicodeBlock("LINEAR_B_SYLLABARY", LINEAR_B_SYLLABARY_ID);
1680 public static final UnicodeBlock LINEAR_B_IDEOGRAMS
1681 = new UnicodeBlock("LINEAR_B_IDEOGRAMS", LINEAR_B_IDEOGRAMS_ID);
1685 public static final UnicodeBlock AEGEAN_NUMBERS
1686 = new UnicodeBlock("AEGEAN_NUMBERS", AEGEAN_NUMBERS_ID);
1690 public static final UnicodeBlock UGARITIC
1691 = new UnicodeBlock("UGARITIC", UGARITIC_ID);
1695 public static final UnicodeBlock SHAVIAN
1696 = new UnicodeBlock("SHAVIAN", SHAVIAN_ID);
1700 public static final UnicodeBlock OSMANYA
1701 = new UnicodeBlock("OSMANYA", OSMANYA_ID);
1705 public static final UnicodeBlock CYPRIOT_SYLLABARY
1706 = new UnicodeBlock("CYPRIOT_SYLLABARY", CYPRIOT_SYLLABARY_ID);
1710 public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
1711 = new UnicodeBlock("TAI_XUAN_JING_SYMBOLS", TAI_XUAN_JING_SYMBOLS_ID);
1716 public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
1717 = new UnicodeBlock("VARIATION_SELECTORS_SUPPLEMENT", VARIATION_SELECTORS_SUPPLEMENT_ID);
1719 /* New blocks in Unicode 4.1 */
1724 public static final UnicodeBlock ANCIENT_GREEK_MUSICAL_NOTATION =
1725 new UnicodeBlock("ANCIENT_GREEK_MUSICAL_NOTATION",
1726 ANCIENT_GREEK_MUSICAL_NOTATION_ID); /*[1D200]*/
1731 public static final UnicodeBlock ANCIENT_GREEK_NUMBERS =
1732 new UnicodeBlock("ANCIENT_GREEK_NUMBERS", ANCIENT_GREEK_NUMBERS_ID); /*[10140]*/
1737 public static final UnicodeBlock ARABIC_SUPPLEMENT =
1738 new UnicodeBlock("ARABIC_SUPPLEMENT", ARABIC_SUPPLEMENT_ID); /*[0750]*/
1743 public static final UnicodeBlock BUGINESE =
1744 new UnicodeBlock("BUGINESE", BUGINESE_ID); /*[1A00]*/
1749 public static final UnicodeBlock CJK_STROKES =
1750 new UnicodeBlock("CJK_STROKES", CJK_STROKES_ID); /*[31C0]*/
1755 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_SUPPLEMENT =
1756 new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_SUPPLEMENT",
1757 COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID); /*[1DC0]*/
1762 public static final UnicodeBlock COPTIC = new UnicodeBlock("COPTIC", COPTIC_ID); /*[2C80]*/
1767 public static final UnicodeBlock ETHIOPIC_EXTENDED =
1768 new UnicodeBlock("ETHIOPIC_EXTENDED", ETHIOPIC_EXTENDED_ID); /*[2D80]*/
1773 public static final UnicodeBlock ETHIOPIC_SUPPLEMENT =
1774 new UnicodeBlock("ETHIOPIC_SUPPLEMENT", ETHIOPIC_SUPPLEMENT_ID); /*[1380]*/
1779 public static final UnicodeBlock GEORGIAN_SUPPLEMENT =
1780 new UnicodeBlock("GEORGIAN_SUPPLEMENT", GEORGIAN_SUPPLEMENT_ID); /*[2D00]*/
1785 public static final UnicodeBlock GLAGOLITIC =
1786 new UnicodeBlock("GLAGOLITIC", GLAGOLITIC_ID); /*[2C00]*/
1791 public static final UnicodeBlock KHAROSHTHI =
1792 new UnicodeBlock("KHAROSHTHI", KHAROSHTHI_ID); /*[10A00]*/
1797 public static final UnicodeBlock MODIFIER_TONE_LETTERS =
1798 new UnicodeBlock("MODIFIER_TONE_LETTERS", MODIFIER_TONE_LETTERS_ID); /*[A700]*/
1803 public static final UnicodeBlock NEW_TAI_LUE =
1804 new UnicodeBlock("NEW_TAI_LUE", NEW_TAI_LUE_ID); /*[1980]*/
1809 public static final UnicodeBlock OLD_PERSIAN =
1810 new UnicodeBlock("OLD_PERSIAN", OLD_PERSIAN_ID); /*[103A0]*/
1815 public static final UnicodeBlock PHONETIC_EXTENSIONS_SUPPLEMENT =
1816 new UnicodeBlock("PHONETIC_EXTENSIONS_SUPPLEMENT",
1817 PHONETIC_EXTENSIONS_SUPPLEMENT_ID); /*[1D80]*/
1822 public static final UnicodeBlock SUPPLEMENTAL_PUNCTUATION =
1823 new UnicodeBlock("SUPPLEMENTAL_PUNCTUATION", SUPPLEMENTAL_PUNCTUATION_ID); /*[2E00]*/
1828 public static final UnicodeBlock SYLOTI_NAGRI =
1829 new UnicodeBlock("SYLOTI_NAGRI", SYLOTI_NAGRI_ID); /*[A800]*/
1834 public static final UnicodeBlock TIFINAGH =
1835 new UnicodeBlock("TIFINAGH", TIFINAGH_ID); /*[2D30]*/
1840 public static final UnicodeBlock VERTICAL_FORMS =
1841 new UnicodeBlock("VERTICAL_FORMS", VERTICAL_FORMS_ID); /*[FE10]*/
1846 public static final UnicodeBlock NKO = new UnicodeBlock("NKO", NKO_ID); /*[07C0]*/
1850 public static final UnicodeBlock BALINESE =
1851 new UnicodeBlock("BALINESE", BALINESE_ID); /*[1B00]*/
1855 public static final UnicodeBlock LATIN_EXTENDED_C =
1856 new UnicodeBlock("LATIN_EXTENDED_C", LATIN_EXTENDED_C_ID); /*[2C60]*/
1860 public static final UnicodeBlock LATIN_EXTENDED_D =
1861 new UnicodeBlock("LATIN_EXTENDED_D", LATIN_EXTENDED_D_ID); /*[A720]*/
1865 public static final UnicodeBlock PHAGS_PA =
1866 new UnicodeBlock("PHAGS_PA", PHAGS_PA_ID); /*[A840]*/
1870 public static final UnicodeBlock PHOENICIAN =
1871 new UnicodeBlock("PHOENICIAN", PHOENICIAN_ID); /*[10900]*/
1875 public static final UnicodeBlock CUNEIFORM =
1876 new UnicodeBlock("CUNEIFORM", CUNEIFORM_ID); /*[12000]*/
1880 public static final UnicodeBlock CUNEIFORM_NUMBERS_AND_PUNCTUATION =
1881 new UnicodeBlock("CUNEIFORM_NUMBERS_AND_PUNCTUATION",
1882 CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID); /*[12400]*/
1886 public static final UnicodeBlock COUNTING_ROD_NUMERALS =
1887 new UnicodeBlock("COUNTING_ROD_NUMERALS", COUNTING_ROD_NUMERALS_ID); /*[1D360]*/
1892 public static final UnicodeBlock SUNDANESE =
1893 new UnicodeBlock("SUNDANESE", SUNDANESE_ID); /* [1B80] */
1898 public static final UnicodeBlock LEPCHA =
1899 new UnicodeBlock("LEPCHA", LEPCHA_ID); /* [1C00] */
1904 public static final UnicodeBlock OL_CHIKI =
1905 new UnicodeBlock("OL_CHIKI", OL_CHIKI_ID); /* [1C50] */
1910 public static final UnicodeBlock CYRILLIC_EXTENDED_A =
1911 new UnicodeBlock("CYRILLIC_EXTENDED_A", CYRILLIC_EXTENDED_A_ID); /* [2DE0] */
1916 public static final UnicodeBlock VAI = new UnicodeBlock("VAI", VAI_ID); /* [A500] */
1921 public static final UnicodeBlock CYRILLIC_EXTENDED_B =
1922 new UnicodeBlock("CYRILLIC_EXTENDED_B", CYRILLIC_EXTENDED_B_ID); /* [A640] */
1927 public static final UnicodeBlock SAURASHTRA =
1928 new UnicodeBlock("SAURASHTRA", SAURASHTRA_ID); /* [A880] */
1933 public static final UnicodeBlock KAYAH_LI =
1934 new UnicodeBlock("KAYAH_LI", KAYAH_LI_ID); /* [A900] */
1939 public static final UnicodeBlock REJANG =
1940 new UnicodeBlock("REJANG", REJANG_ID); /* [A930] */
1945 public static final UnicodeBlock CHAM =
1946 new UnicodeBlock("CHAM", CHAM_ID); /* [AA00] */
1951 public static final UnicodeBlock ANCIENT_SYMBOLS =
1952 new UnicodeBlock("ANCIENT_SYMBOLS", ANCIENT_SYMBOLS_ID); /* [10190] */
1957 public static final UnicodeBlock PHAISTOS_DISC =
1958 new UnicodeBlock("PHAISTOS_DISC", PHAISTOS_DISC_ID); /* [101D0] */
1963 public static final UnicodeBlock LYCIAN =
1964 new UnicodeBlock("LYCIAN", LYCIAN_ID); /* [10280] */
1969 public static final UnicodeBlock CARIAN =
1970 new UnicodeBlock("CARIAN", CARIAN_ID); /* [102A0] */
1975 public static final UnicodeBlock LYDIAN =
1976 new UnicodeBlock("LYDIAN", LYDIAN_ID); /* [10920] */
1981 public static final UnicodeBlock MAHJONG_TILES =
1982 new UnicodeBlock("MAHJONG_TILES", MAHJONG_TILES_ID); /* [1F000] */
1987 public static final UnicodeBlock DOMINO_TILES =
1988 new UnicodeBlock("DOMINO_TILES", DOMINO_TILES_ID); /* [1F030] */
1990 /* New blocks in Unicode 5.2 */
1992 /** @stable ICU 4.4 */
1993 public static final UnicodeBlock SAMARITAN =
1994 new UnicodeBlock("SAMARITAN", SAMARITAN_ID); /*[0800]*/
1995 /** @stable ICU 4.4 */
1996 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED =
1997 new UnicodeBlock("UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED",
1998 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_ID); /*[18B0]*/
1999 /** @stable ICU 4.4 */
2000 public static final UnicodeBlock TAI_THAM =
2001 new UnicodeBlock("TAI_THAM", TAI_THAM_ID); /*[1A20]*/
2002 /** @stable ICU 4.4 */
2003 public static final UnicodeBlock VEDIC_EXTENSIONS =
2004 new UnicodeBlock("VEDIC_EXTENSIONS", VEDIC_EXTENSIONS_ID); /*[1CD0]*/
2005 /** @stable ICU 4.4 */
2006 public static final UnicodeBlock LISU =
2007 new UnicodeBlock("LISU", LISU_ID); /*[A4D0]*/
2008 /** @stable ICU 4.4 */
2009 public static final UnicodeBlock BAMUM =
2010 new UnicodeBlock("BAMUM", BAMUM_ID); /*[A6A0]*/
2011 /** @stable ICU 4.4 */
2012 public static final UnicodeBlock COMMON_INDIC_NUMBER_FORMS =
2013 new UnicodeBlock("COMMON_INDIC_NUMBER_FORMS", COMMON_INDIC_NUMBER_FORMS_ID); /*[A830]*/
2014 /** @stable ICU 4.4 */
2015 public static final UnicodeBlock DEVANAGARI_EXTENDED =
2016 new UnicodeBlock("DEVANAGARI_EXTENDED", DEVANAGARI_EXTENDED_ID); /*[A8E0]*/
2017 /** @stable ICU 4.4 */
2018 public static final UnicodeBlock HANGUL_JAMO_EXTENDED_A =
2019 new UnicodeBlock("HANGUL_JAMO_EXTENDED_A", HANGUL_JAMO_EXTENDED_A_ID); /*[A960]*/
2020 /** @stable ICU 4.4 */
2021 public static final UnicodeBlock JAVANESE =
2022 new UnicodeBlock("JAVANESE", JAVANESE_ID); /*[A980]*/
2023 /** @stable ICU 4.4 */
2024 public static final UnicodeBlock MYANMAR_EXTENDED_A =
2025 new UnicodeBlock("MYANMAR_EXTENDED_A", MYANMAR_EXTENDED_A_ID); /*[AA60]*/
2026 /** @stable ICU 4.4 */
2027 public static final UnicodeBlock TAI_VIET =
2028 new UnicodeBlock("TAI_VIET", TAI_VIET_ID); /*[AA80]*/
2029 /** @stable ICU 4.4 */
2030 public static final UnicodeBlock MEETEI_MAYEK =
2031 new UnicodeBlock("MEETEI_MAYEK", MEETEI_MAYEK_ID); /*[ABC0]*/
2032 /** @stable ICU 4.4 */
2033 public static final UnicodeBlock HANGUL_JAMO_EXTENDED_B =
2034 new UnicodeBlock("HANGUL_JAMO_EXTENDED_B", HANGUL_JAMO_EXTENDED_B_ID); /*[D7B0]*/
2035 /** @stable ICU 4.4 */
2036 public static final UnicodeBlock IMPERIAL_ARAMAIC =
2037 new UnicodeBlock("IMPERIAL_ARAMAIC", IMPERIAL_ARAMAIC_ID); /*[10840]*/
2038 /** @stable ICU 4.4 */
2039 public static final UnicodeBlock OLD_SOUTH_ARABIAN =
2040 new UnicodeBlock("OLD_SOUTH_ARABIAN", OLD_SOUTH_ARABIAN_ID); /*[10A60]*/
2041 /** @stable ICU 4.4 */
2042 public static final UnicodeBlock AVESTAN =
2043 new UnicodeBlock("AVESTAN", AVESTAN_ID); /*[10B00]*/
2044 /** @stable ICU 4.4 */
2045 public static final UnicodeBlock INSCRIPTIONAL_PARTHIAN =
2046 new UnicodeBlock("INSCRIPTIONAL_PARTHIAN", INSCRIPTIONAL_PARTHIAN_ID); /*[10B40]*/
2047 /** @stable ICU 4.4 */
2048 public static final UnicodeBlock INSCRIPTIONAL_PAHLAVI =
2049 new UnicodeBlock("INSCRIPTIONAL_PAHLAVI", INSCRIPTIONAL_PAHLAVI_ID); /*[10B60]*/
2050 /** @stable ICU 4.4 */
2051 public static final UnicodeBlock OLD_TURKIC =
2052 new UnicodeBlock("OLD_TURKIC", OLD_TURKIC_ID); /*[10C00]*/
2053 /** @stable ICU 4.4 */
2054 public static final UnicodeBlock RUMI_NUMERAL_SYMBOLS =
2055 new UnicodeBlock("RUMI_NUMERAL_SYMBOLS", RUMI_NUMERAL_SYMBOLS_ID); /*[10E60]*/
2056 /** @stable ICU 4.4 */
2057 public static final UnicodeBlock KAITHI =
2058 new UnicodeBlock("KAITHI", KAITHI_ID); /*[11080]*/
2059 /** @stable ICU 4.4 */
2060 public static final UnicodeBlock EGYPTIAN_HIEROGLYPHS =
2061 new UnicodeBlock("EGYPTIAN_HIEROGLYPHS", EGYPTIAN_HIEROGLYPHS_ID); /*[13000]*/
2062 /** @stable ICU 4.4 */
2063 public static final UnicodeBlock ENCLOSED_ALPHANUMERIC_SUPPLEMENT =
2064 new UnicodeBlock("ENCLOSED_ALPHANUMERIC_SUPPLEMENT",
2065 ENCLOSED_ALPHANUMERIC_SUPPLEMENT_ID); /*[1F100]*/
2066 /** @stable ICU 4.4 */
2067 public static final UnicodeBlock ENCLOSED_IDEOGRAPHIC_SUPPLEMENT =
2068 new UnicodeBlock("ENCLOSED_IDEOGRAPHIC_SUPPLEMENT",
2069 ENCLOSED_IDEOGRAPHIC_SUPPLEMENT_ID); /*[1F200]*/
2070 /** @stable ICU 4.4 */
2071 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C =
2072 new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C",
2073 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C_ID); /*[2A700]*/
2075 /* New blocks in Unicode 6.0 */
2077 /** @stable ICU 4.6 */
2078 public static final UnicodeBlock MANDAIC =
2079 new UnicodeBlock("MANDAIC", MANDAIC_ID); /*[0840]*/
2080 /** @stable ICU 4.6 */
2081 public static final UnicodeBlock BATAK =
2082 new UnicodeBlock("BATAK", BATAK_ID); /*[1BC0]*/
2083 /** @stable ICU 4.6 */
2084 public static final UnicodeBlock ETHIOPIC_EXTENDED_A =
2085 new UnicodeBlock("ETHIOPIC_EXTENDED_A", ETHIOPIC_EXTENDED_A_ID); /*[AB00]*/
2086 /** @stable ICU 4.6 */
2087 public static final UnicodeBlock BRAHMI =
2088 new UnicodeBlock("BRAHMI", BRAHMI_ID); /*[11000]*/
2089 /** @stable ICU 4.6 */
2090 public static final UnicodeBlock BAMUM_SUPPLEMENT =
2091 new UnicodeBlock("BAMUM_SUPPLEMENT", BAMUM_SUPPLEMENT_ID); /*[16800]*/
2092 /** @stable ICU 4.6 */
2093 public static final UnicodeBlock KANA_SUPPLEMENT =
2094 new UnicodeBlock("KANA_SUPPLEMENT", KANA_SUPPLEMENT_ID); /*[1B000]*/
2095 /** @stable ICU 4.6 */
2096 public static final UnicodeBlock PLAYING_CARDS =
2097 new UnicodeBlock("PLAYING_CARDS", PLAYING_CARDS_ID); /*[1F0A0]*/
2098 /** @stable ICU 4.6 */
2099 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS =
2100 new UnicodeBlock("MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS",
2101 MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS_ID); /*[1F300]*/
2102 /** @stable ICU 4.6 */
2103 public static final UnicodeBlock EMOTICONS =
2104 new UnicodeBlock("EMOTICONS", EMOTICONS_ID); /*[1F600]*/
2105 /** @stable ICU 4.6 */
2106 public static final UnicodeBlock TRANSPORT_AND_MAP_SYMBOLS =
2107 new UnicodeBlock("TRANSPORT_AND_MAP_SYMBOLS", TRANSPORT_AND_MAP_SYMBOLS_ID); /*[1F680]*/
2108 /** @stable ICU 4.6 */
2109 public static final UnicodeBlock ALCHEMICAL_SYMBOLS =
2110 new UnicodeBlock("ALCHEMICAL_SYMBOLS", ALCHEMICAL_SYMBOLS_ID); /*[1F700]*/
2111 /** @stable ICU 4.6 */
2112 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D =
2113 new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D",
2114 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D_ID); /*[2B740]*/
2119 public static final UnicodeBlock INVALID_CODE
2120 = new UnicodeBlock("INVALID_CODE", INVALID_CODE_ID);
static {
    // Fail fast during class initialization if any slot of BLOCKS_ below COUNT
    // is still null.
    int blockId = 0;
    while (blockId < COUNT) {
        if (BLOCKS_[blockId] == null) {
            throw new java.lang.IllegalStateException(
                "UnicodeBlock.BLOCKS_[" + blockId + "] not initialized");
        }
        ++blockId;
    }
}
2131 // public methods --------------------------------------------------
2134 * {@icu} Returns the only instance of the UnicodeBlock with the argument ID.
2135 * If no such ID exists, a INVALID_CODE UnicodeBlock will be returned.
2136 * @param id UnicodeBlock ID
2137 * @return the only instance of the UnicodeBlock with the argument ID
2138 * if it exists, otherwise a INVALID_CODE UnicodeBlock will be
2142 public static UnicodeBlock getInstance(int id)
2144 if (id >= 0 && id < BLOCKS_.length) {
2147 return INVALID_CODE;
2151 * Returns the Unicode allocation block that contains the code point,
2152 * or null if the code point is not a member of a defined block.
2153 * @param ch code point to be tested
2154 * @return the Unicode allocation block that contains the code point
2157 public static UnicodeBlock of(int ch)
2159 if (ch > MAX_VALUE) {
2160 return INVALID_CODE;
2163 return UnicodeBlock.getInstance(
2164 UCharacterProperty.INSTANCE.getIntPropertyValue(ch, UProperty.BLOCK));
2168 * Cover the JDK 1.5 API. Return the Unicode block with the
2169 * given name. {@icunote} Unlike JDK 1.5, this only matches
2170 * against the official UCD name and the Java block name
2172 * @param blockName the name of the block to match
2173 * @return the UnicodeBlock with that name
2174 * @throws IllegalArgumentException if the blockName could not be matched
2177 public static final UnicodeBlock forName(String blockName) {
2178 Map<String, UnicodeBlock> m = null;
2183 m = new HashMap<String, UnicodeBlock>(BLOCKS_.length);
2184 for (int i = 0; i < BLOCKS_.length; ++i) {
2185 UnicodeBlock b = BLOCKS_[i];
2186 String name = trimBlockName(
2187 getPropertyValueName(UProperty.BLOCK, b.getID(),
2188 UProperty.NameChoice.LONG));
2191 mref = new SoftReference<Map<String, UnicodeBlock>>(m);
2193 UnicodeBlock b = m.get(trimBlockName(blockName));
2195 throw new IllegalArgumentException();
2199 private static SoftReference<Map<String, UnicodeBlock>> mref;
2201 private static String trimBlockName(String name) {
2202 String upper = name.toUpperCase();
2203 StringBuilder result = new StringBuilder(upper.length());
2204 for (int i = 0; i < upper.length(); i++) {
2205 char c = upper.charAt(i);
2206 if (c != ' ' && c != '_' && c != '-') {
2210 return result.toString();
2214 * {@icu} Returns the type ID of this Unicode block
2215 * @return integer type ID of this Unicode block
2223 // private data members ---------------------------------------------
2226 * Identification code for this UnicodeBlock
2230 // private constructor ----------------------------------------------
2233 * UnicodeBlock constructor
2234 * @param name name of this UnicodeBlock
2235 * @param id unique id of this UnicodeBlock
2236 * @exception NullPointerException if name is <code>null</code>
2238 private UnicodeBlock(String name, int id)
/**
 * East Asian Width constants.
 * @see UProperty#EAST_ASIAN_WIDTH
 * @see UCharacter#getIntPropertyValue
 */
public interface EastAsianWidth
{
    // Interface fields are implicitly public, static and final.
    int NEUTRAL = 0;
    int AMBIGUOUS = 1;
    int HALFWIDTH = 2;
    int FULLWIDTH = 3;
    int NARROW = 4;
    int WIDE = 5;
    /** One more than the largest value declared above. */
    int COUNT = 6;
}
/**
 * Decomposition Type constants.
 * @see UProperty#DECOMPOSITION_TYPE
 */
public interface DecompositionType
{
    // Interface fields are implicitly public, static and final.
    int NONE = 0;
    int CANONICAL = 1;
    int COMPAT = 2;
    int CIRCLE = 3;
    int FINAL = 4;
    int FONT = 5;
    int FRACTION = 6;
    int INITIAL = 7;
    int ISOLATED = 8;
    int MEDIAL = 9;
    int NARROW = 10;
    int NOBREAK = 11;
    int SMALL = 12;
    int SQUARE = 13;
    int SUB = 14;
    int SUPER = 15;
    int VERTICAL = 16;
    int WIDE = 17;
    /** One more than the largest value declared above. */
    int COUNT = 18;
}
/**
 * Joining Type constants.
 * @see UProperty#JOINING_TYPE
 */
public interface JoiningType
{
    // Interface fields are implicitly public, static and final.
    int NON_JOINING = 0;
    int JOIN_CAUSING = 1;
    int DUAL_JOINING = 2;
    int LEFT_JOINING = 3;
    int RIGHT_JOINING = 4;
    int TRANSPARENT = 5;
    /** One more than the largest value declared above. */
    int COUNT = 6;
}
/**
 * Joining Group constants.
 * @see UProperty#JOINING_GROUP
 */
public interface JoiningGroup
{
    // Interface fields are implicitly public, static and final.
    int NO_JOINING_GROUP = 0;
    int AIN = 1;
    int ALAPH = 2;
    int ALEF = 3;
    int BEH = 4;
    int BETH = 5;
    int DAL = 6;
    int DALATH_RISH = 7;
    int E = 8;
    int FEH = 9;
    int FINAL_SEMKATH = 10;
    int GAF = 11;
    int GAMAL = 12;
    int HAH = 13;
    /** @stable ICU 4.6 */
    int TEH_MARBUTA_GOAL = 14;
    /** Carries the same value as {@link #TEH_MARBUTA_GOAL}. */
    int HAMZA_ON_HEH_GOAL = TEH_MARBUTA_GOAL;
    int HE = 15;
    int HEH = 16;
    int HEH_GOAL = 17;
    int HETH = 18;
    int KAF = 19;
    int KAPH = 20;
    int KNOTTED_HEH = 21;
    int LAM = 22;
    int LAMADH = 23;
    int MEEM = 24;
    int MIM = 25;
    int NOON = 26;
    int NUN = 27;
    int PE = 28;
    int QAF = 29;
    int QAPH = 30;
    int REH = 31;
    int REVERSED_PE = 32;
    int SAD = 33;
    int SADHE = 34;
    int SEEN = 35;
    int SEMKATH = 36;
    int SHIN = 37;
    int SWASH_KAF = 38;
    int SYRIAC_WAW = 39;
    int TAH = 40;
    int TAW = 41;
    int TEH_MARBUTA = 42;
    int TETH = 43;
    int WAW = 44;
    int YEH = 45;
    int YEH_BARREE = 46;
    int YEH_WITH_TAIL = 47;
    int YUDH = 48;
    int YUDH_HE = 49;
    int ZAIN = 50;
    int FE = 51;
    int KHAPH = 52;
    int ZHAIN = 53;
    int BURUSHASKI_YEH_BARREE = 54;
    /** @stable ICU 4.4 */
    int FARSI_YEH = 55;
    /** @stable ICU 4.4 */
    int NYA = 56;
    /** One more than the largest value declared above. */
    int COUNT = 57;
}
/**
 * Grapheme Cluster Break constants.
 * @see UProperty#GRAPHEME_CLUSTER_BREAK
 */
public interface GraphemeClusterBreak {
    // Interface fields are implicitly public, static and final.
    int OTHER = 0;
    int CONTROL = 1;
    int CR = 2;
    int EXTEND = 3;
    int L = 4;
    int LF = 5;
    int LV = 6;
    int LVT = 7;
    int T = 8;
    int V = 9;
    int SPACING_MARK = 10;
    int PREPEND = 11;
    /** One more than the largest value declared above. */
    int COUNT = 12;
}
/**
 * Word Break constants.
 * @see UProperty#WORD_BREAK
 */
public interface WordBreak {
    // Interface fields are implicitly public, static and final.
    int OTHER = 0;
    int ALETTER = 1;
    int FORMAT = 2;
    int KATAKANA = 3;
    int MIDLETTER = 4;
    int MIDNUM = 5;
    int NUMERIC = 6;
    int EXTENDNUMLET = 7;
    int CR = 8;
    int EXTEND = 9;
    int LF = 10;
    int MIDNUMLET = 11;
    int NEWLINE = 12;
    /** One more than the largest value declared above. */
    int COUNT = 13;
}
/**
 * Sentence Break constants.
 * @see UProperty#SENTENCE_BREAK
 */
public interface SentenceBreak {
    // Interface fields are implicitly public, static and final.
    int OTHER = 0;
    int ATERM = 1;
    int CLOSE = 2;
    int FORMAT = 3;
    int LOWER = 4;
    int NUMERIC = 5;
    int OLETTER = 6;
    int SEP = 7;
    int SP = 8;
    int STERM = 9;
    int UPPER = 10;
    int CR = 11;
    int EXTEND = 12;
    int LF = 13;
    int SCONTINUE = 14;
    /** One more than the largest value declared above. */
    int COUNT = 15;
}
/**
 * Line Break constants.
 * @see UProperty#LINE_BREAK
 */
public interface LineBreak
{
    // Interface fields are implicitly public, static and final.
    int UNKNOWN = 0;
    int AMBIGUOUS = 1;
    int ALPHABETIC = 2;
    int BREAK_BOTH = 3;
    int BREAK_AFTER = 4;
    int BREAK_BEFORE = 5;
    int MANDATORY_BREAK = 6;
    int CONTINGENT_BREAK = 7;
    int CLOSE_PUNCTUATION = 8;
    int COMBINING_MARK = 9;
    int CARRIAGE_RETURN = 10;
    int EXCLAMATION = 11;
    int GLUE = 12;
    int HYPHEN = 13;
    int IDEOGRAPHIC = 14;
    /** Misspelled name; carries the same value as {@link #INSEPARABLE}. */
    int INSEPERABLE = 15;
    /**
     * Renamed from the misspelled "inseperable" in Unicode 4.0.1.
     */
    int INSEPARABLE = 15;
    int INFIX_NUMERIC = 16;
    int LINE_FEED = 17;
    int NONSTARTER = 18;
    int NUMERIC = 19;
    int OPEN_PUNCTUATION = 20;
    int POSTFIX_NUMERIC = 21;
    int PREFIX_NUMERIC = 22;
    int QUOTATION = 23;
    int COMPLEX_CONTEXT = 24;
    int SURROGATE = 25;
    int SPACE = 26;
    int BREAK_SYMBOLS = 27;
    int ZWSPACE = 28;
    int NEXT_LINE = 29; // [NL]

    /* from here on: new in Unicode 4/ICU 2.6 */
    int WORD_JOINER = 30; // [WJ]

    /* from here on: new in Unicode 4.1/ICU 3.4 */
    int H2 = 31;
    int H3 = 32;
    int JL = 33;
    int JT = 34;
    int JV = 35;

    /* new in Unicode 5.2/ICU 4.4 */
    /** @stable ICU 4.4 */
    int CLOSE_PARENTHESIS = 36; // [CP]

    /** One more than the largest value declared above. */
    int COUNT = 37;
}
/**
 * Numeric Type constants.
 * @see UProperty#NUMERIC_TYPE
 */
public interface NumericType
{
    // Interface fields are implicitly public, static and final.
    int NONE = 0;
    int DECIMAL = 1;
    int DIGIT = 2;
    int NUMERIC = 3;
    /** One more than the largest value declared above. */
    int COUNT = 4;
}
3048 * Hangul Syllable Type constants.
3050 * @see UProperty#HANGUL_SYLLABLE_TYPE
3053 public static interface HangulSyllableType
3058 public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
3062 public static final int LEADING_JAMO = 1; /*[L]*/
3066 public static final int VOWEL_JAMO = 2; /*[V]*/
3070 public static final int TRAILING_JAMO = 3; /*[T]*/
3074 public static final int LV_SYLLABLE = 4; /*[LV]*/
3078 public static final int LVT_SYLLABLE = 5; /*[LVT]*/
3082 public static final int COUNT = 6;
3085 // public data members -----------------------------------------------
3088 * The lowest Unicode code point value.
3091 public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
3094 * The highest Unicode code point value (scalar value) according to the
3096 * This is a 21-bit value (21 bits, rounded up).<br>
3097 * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
3100 public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
3103 * The minimum value for Supplementary code points
3106 public static final int SUPPLEMENTARY_MIN_VALUE =
3107 UTF16.SUPPLEMENTARY_MIN_VALUE;
3110 * Unicode value used when translating into Unicode encoding form and there
3111 * is no existing character.
3114 public static final int REPLACEMENT_CHAR = '\uFFFD';
3117 * Special value that is returned by getUnicodeNumericValue(int) when no
3118 * numeric value is defined for a code point.
3120 * @see #getUnicodeNumericValue
3122 public static final double NO_NUMERIC_VALUE = -123456789;
3125 * Compatibility constant for Java Character's MIN_RADIX.
3128 public static final int MIN_RADIX = java.lang.Character.MIN_RADIX;
3131 * Compatibility constant for Java Character's MAX_RADIX.
3134 public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
3137 * Do not lowercase non-initial parts of words when titlecasing.
3138 * Option bit for titlecasing APIs that take an options bit set.
3140 * By default, titlecasing will titlecase the first cased character
3141 * of a word and lowercase all other characters.
3142 * With this option, the other characters will not be modified.
3147 public static final int TITLECASE_NO_LOWERCASE = 0x100;
3150 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
3151 * titlecase exactly the characters at breaks from the iterator.
3152 * Option bit for titlecasing APIs that take an options bit set.
3154 * By default, titlecasing will take each break iterator index,
3155 * adjust it by looking for the next cased character, and titlecase that one.
3156 * Other characters are lowercased.
3158 * This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
3160 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
3161 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
3162 * cased character F. If F exists, map F to default_title(F); then map each
3163 * subsequent character C to default_lower(C).
3166 * @see #TITLECASE_NO_LOWERCASE
3169 public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
3171 // public methods ----------------------------------------------------
3174 * Returns the numeric value of a decimal digit code point.
3175 * <br>This method observes the semantics of
3176 * <code>java.lang.Character.digit()</code>. Note that this
3177 * will return positive values for code points for which isDigit
3178 * returns false, just like java.lang.Character.
3179 * <br><em>Semantic Change:</em> In release 1.3.1 and
3180 * prior, this did not treat the European letters as having a
3181 * digit value, and also treated numeric letters and other numbers as
3183 * This has been changed to conform to the java semantics.
3184 * <br>A code point is a valid digit if and only if:
3186 * <li>ch is a decimal digit or one of the european letters, and
3187 * <li>the value of ch is less than the specified radix.
3189 * @param ch the code point to query
3190 * @param radix the radix
3191 * @return the numeric value represented by the code point in the
3192 * specified radix, or -1 if the code point is not a decimal digit
3193 * or if its value is too large for the radix
3196 public static int digit(int ch, int radix)
3198 if (2 <= radix && radix <= 36) {
3199 int value = digit(ch);
3201 // ch is not a decimal digit, try latin letters
3202 value = UCharacterProperty.getEuropeanDigit(ch);
3204 return (value < radix) ? value : -1;
3206 return -1; // invalid radix
3211 * Returnss the numeric value of a decimal digit code point.
3212 * <br>This is a convenience overload of <code>digit(int, int)</code>
3213 * that provides a decimal radix.
3214 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
3215 * treated numeric letters and other numbers as digits. This has
3216 * been changed to conform to the java semantics.
3217 * @param ch the code point to query
3218 * @return the numeric value represented by the code point,
3219 * or -1 if the code point is not a decimal digit or if its
3220 * value is too large for a decimal radix
3223 public static int digit(int ch)
3225 return UCharacterProperty.INSTANCE.digit(ch);
3229 * Returns the numeric value of the code point as a nonnegative
3231 * <br>If the code point does not have a numeric value, then -1 is returned.
3233 * If the code point has a numeric value that cannot be represented as a
3234 * nonnegative integer (for example, a fractional value), then -2 is
3236 * @param ch the code point to query
3237 * @return the numeric value of the code point, or -1 if it has no numeric
3238 * value, or -2 if it has a numeric value that cannot be represented as a
3239 * nonnegative integer
3242 public static int getNumericValue(int ch)
3244 return UCharacterProperty.INSTANCE.getNumericValue(ch);
3248 * {@icu} Returns the numeric value for a Unicode code point as defined in the
3249 * Unicode Character Database.</p>
3250 * <p>A "double" return type is necessary because some numeric values are
3251 * fractions, negative, or too large for int.</p>
3252 * <p>For characters without any numeric values in the Unicode Character
3253 * Database, this function will return NO_NUMERIC_VALUE.</p>
3254 * <p><em>API Change:</em> In release 2.2 and prior, this API has a
3255 * return type int and returns -1 when the argument ch does not have a
3256 * corresponding numeric value. This has been changed to synch with ICU4C
3258 * This corresponds to the ICU4C function u_getNumericValue.
3259 * @param ch Code point to get the numeric value for.
3260 * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
3263 public static double getUnicodeNumericValue(int ch)
3265 return UCharacterProperty.INSTANCE.getUnicodeNumericValue(ch);
3269 * Compatibility override of Java deprecated method. This
3270 * method will always remain deprecated.
3271 * Same as java.lang.Character.isSpace().
3272 * @param ch the code point
3273 * @return true if the code point is a space character as
3274 * defined by java.lang.Character.isSpace.
3275 * @deprecated ICU 3.4 (Java)
3277 public static boolean isSpace(int ch) {
3278 return ch <= 0x20 &&
3279 (ch == 0x20 || ch == 0x09 || ch == 0x0a || ch == 0x0c || ch == 0x0d);
3283 * Returns a value indicating a code point's Unicode category.
3284 * Up-to-date Unicode implementation of java.lang.Character.getType()
3285 * except for the above mentioned code points that had their category
3287 * Return results are constants from the interface
3288 * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
3289 * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
3290 * those returned by java.lang.Character.getType. UCharacterCategory values
3291 * match the ones used in ICU4C, while java.lang.Character type
3292 * values, though similar, skip the value 17.</p>
3293 * @param ch code point whose type is to be determined
3294 * @return category which is a value of UCharacterCategory
3297 public static int getType(int ch)
3299 return UCharacterProperty.INSTANCE.getType(ch);
3303 * Determines if a code point has a defined meaning in the up-to-date
3305 * E.g. supplementary code points though allocated space are not defined in
3307 * Up-to-date Unicode implementation of java.lang.Character.isDefined()
3308 * @param ch code point to be determined if it is defined in the most
3309 * current version of Unicode
3310 * @return true if this code point is defined in unicode
3313 public static boolean isDefined(int ch)
3315 return getType(ch) != 0;
3319 * Determines if a code point is a Java digit.
3320 * <br>This method observes the semantics of
3321 * <code>java.lang.Character.isDigit()</code>. It returns true for decimal
3323 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this treated
3324 * numeric letters and other numbers as digits.
3325 * This has been changed to conform to the java semantics.
3326 * @param ch code point to query
3327 * @return true if this code point is a digit
3330 public static boolean isDigit(int ch)
3332 return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
3336 * Determines if the specified code point is an ISO control character.
3337 * A code point is considered to be an ISO control character if it is in
3338 * the range \u0000 through \u001F or in the range \u007F through
3340 * Up-to-date Unicode implementation of java.lang.Character.isISOControl()
3341 * @param ch code point to determine if it is an ISO control character
3342 * @return true if code point is a ISO control character
3345 public static boolean isISOControl(int ch)
3347 return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ &&
3348 ((ch <= UNIT_SEPARATOR_) || (ch >= DELETE_));
3352 * Determines if the specified code point is a letter.
3353 * Up-to-date Unicode implementation of java.lang.Character.isLetter()
3354 * @param ch code point to determine if it is a letter
3355 * @return true if code point is a letter
3358 public static boolean isLetter(int ch)
3360 // if props == 0, it will just fall through and return false
3361 return ((1 << getType(ch))
3362 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3363 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3364 | (1 << UCharacterCategory.TITLECASE_LETTER)
3365 | (1 << UCharacterCategory.MODIFIER_LETTER)
3366 | (1 << UCharacterCategory.OTHER_LETTER))) != 0;
3370 * Determines if the specified code point is a letter or digit.
3371 * {@icunote} This method, unlike java.lang.Character does not regard the ascii
3372 * characters 'A' - 'Z' and 'a' - 'z' as digits.
3373 * @param ch code point to determine if it is a letter or a digit
3374 * @return true if code point is a letter or a digit
3377 public static boolean isLetterOrDigit(int ch)
3379 return ((1 << getType(ch))
3380 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3381 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3382 | (1 << UCharacterCategory.TITLECASE_LETTER)
3383 | (1 << UCharacterCategory.MODIFIER_LETTER)
3384 | (1 << UCharacterCategory.OTHER_LETTER)
3385 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER))) != 0;
3389 * Compatibility override of Java deprecated method. This
3390 * method will always remain deprecated. Delegates to
3391 * java.lang.Character.isJavaIdentifierStart.
3392 * @param cp the code point
3393 * @return true if the code point can start a java identifier.
3394 * @deprecated ICU 3.4 (Java)
3396 public static boolean isJavaLetter(int cp) {
3397 return isJavaIdentifierStart(cp);
3401 * Compatibility override of Java deprecated method. This
3402 * method will always remain deprecated. Delegates to
3403 * java.lang.Character.isJavaIdentifierPart.
3404 * @param cp the code point
3405 * @return true if the code point can continue a java identifier.
3406 * @deprecated ICU 3.4 (Java)
3408 public static boolean isJavaLetterOrDigit(int cp) {
3409 return isJavaIdentifierPart(cp);
3413 * Compatibility override of Java method, delegates to
3414 * java.lang.Character.isJavaIdentifierStart.
3415 * @param cp the code point
3416 * @return true if the code point can start a java identifier.
3419 public static boolean isJavaIdentifierStart(int cp) {
3420 // note, downcast to char for jdk 1.4 compatibility
3421 return java.lang.Character.isJavaIdentifierStart((char)cp);
3425 * Compatibility override of Java method, delegates to
3426 * java.lang.Character.isJavaIdentifierPart.
3427 * @param cp the code point
3428 * @return true if the code point can continue a java identifier.
3431 public static boolean isJavaIdentifierPart(int cp) {
3432 // note, downcast to char for jdk 1.4 compatibility
3433 return java.lang.Character.isJavaIdentifierPart((char)cp);
3437 * Determines if the specified code point is a lowercase character.
3438 * UnicodeData only contains case mappings for code points where they are
3439 * one-to-one mappings; it also omits information about context-sensitive
3440 * case mappings.<br> For more information about Unicode case mapping
3441 * please refer to the
3442 * <a href=http://www.unicode.org/unicode/reports/tr21/>Technical report
3444 * Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
3445 * @param ch code point to determine if it is in lowercase
3446 * @return true if code point is a lowercase character
3449 public static boolean isLowerCase(int ch)
3451 // if props == 0, it will just fall through and return false
3452 return getType(ch) == UCharacterCategory.LOWERCASE_LETTER;
3456 * Determines if the specified code point is a white space character.
3457 * A code point is considered to be an whitespace character if and only
3458 * if it satisfies one of the following criteria:
3460 * <li> It is a Unicode Separator character (categories "Z" = "Zs" or "Zl" or "Zp"), but is not
3461 * also a non-breaking space (\u00A0 or \u2007 or \u202F).
3462 * <li> It is \u0009, HORIZONTAL TABULATION.
3463 * <li> It is \u000A, LINE FEED.
3464 * <li> It is \u000B, VERTICAL TABULATION.
3465 * <li> It is \u000C, FORM FEED.
3466 * <li> It is \u000D, CARRIAGE RETURN.
3467 * <li> It is \u001C, FILE SEPARATOR.
3468 * <li> It is \u001D, GROUP SEPARATOR.
3469 * <li> It is \u001E, RECORD SEPARATOR.
3470 * <li> It is \u001F, UNIT SEPARATOR.
3473 * This API tries to sync with the semantics of Java's
3474 * java.lang.Character.isWhitespace(), but it may not return
3475 * the exact same results because of the Unicode version
3477 * <p>Note: Unicode 4.0.1 changed U+200B ZERO WIDTH SPACE from a Space Separator (Zs)
3478 * to a Format Control (Cf). Since then, isWhitespace(0x200b) returns false.
3479 * See http://www.unicode.org/versions/Unicode4.0.1/
3480 * @param ch code point to determine if it is a white space
3481 * @return true if the specified code point is a white space character
3484 public static boolean isWhitespace(int ch)
3486 // exclude no-break spaces
3487 // if props == 0, it will just fall through and return false
3488 return ((1 << getType(ch)) &
3489 ((1 << UCharacterCategory.SPACE_SEPARATOR)
3490 | (1 << UCharacterCategory.LINE_SEPARATOR)
3491 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR))) != 0
3492 && (ch != NO_BREAK_SPACE_) && (ch != FIGURE_SPACE_) && (ch != NARROW_NO_BREAK_SPACE_)
3493 // TAB VT LF FF CR FS GS RS US NL are all control characters
3494 // that are white spaces.
3495 || (ch >= 0x9 && ch <= 0xd) || (ch >= 0x1c && ch <= 0x1f);
3499 * Determines if the specified code point is a Unicode specified space
3500 * character, i.e. if code point is in one of the categories Zs, Zl or Zp.
3501 * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
3502 * @param ch code point to determine if it is a space
3503 * @return true if the specified code point is a space character
3506 public static boolean isSpaceChar(int ch)
3508 // if props == 0, it will just fall through and return false
3509 return ((1 << getType(ch)) & ((1 << UCharacterCategory.SPACE_SEPARATOR)
3510 | (1 << UCharacterCategory.LINE_SEPARATOR)
3511 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR)))
3516 * Determines if the specified code point is a titlecase character.
3517 * UnicodeData only contains case mappings for code points where they are
3518 * one-to-one mappings; it also omits information about context-sensitive
3519 * case mappings.<br>
3520 * For more information about Unicode case mapping please refer to the
3521 * <a href=http://www.unicode.org/unicode/reports/tr21/>
3522 * Technical report #21</a>.<br>
3523 * Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
3524 * @param ch code point to determine if it is in title case
3525 * @return true if the specified code point is a titlecase character
3528 public static boolean isTitleCase(int ch)
3530 // if props == 0, it will just fall through and return false
3531 return getType(ch) == UCharacterCategory.TITLECASE_LETTER;
3535 * Determines if the specified code point may be any part of a Unicode
3536 * identifier other than the starting character.
3537 * A code point may be part of a Unicode identifier if and only if it is
3538 * one of the following:
3540 * <li> Lu Uppercase letter
3541 * <li> Ll Lowercase letter
3542 * <li> Lt Titlecase letter
3543 * <li> Lm Modifier letter
3544 * <li> Lo Other letter
3545 * <li> Nl Letter number
3546 * <li> Pc Connecting punctuation character
3547 * <li> Nd decimal number
3548 * <li> Mc Spacing combining mark
3549 * <li> Mn Non-spacing mark
3550 * <li> Cf formatting code
3552 * Up-to-date Unicode implementation of
3553 * java.lang.Character.isUnicodeIdentifierPart().<br>
3554 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3555 * @param ch code point to determine if is can be part of a Unicode
3557 * @return true if code point is any character belonging a unicode
3558 * identifier suffix after the first character
3561 public static boolean isUnicodeIdentifierPart(int ch)
3563 // if props == 0, it will just fall through and return false
3565 return ((1 << getType(ch))
3566 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3567 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3568 | (1 << UCharacterCategory.TITLECASE_LETTER)
3569 | (1 << UCharacterCategory.MODIFIER_LETTER)
3570 | (1 << UCharacterCategory.OTHER_LETTER)
3571 | (1 << UCharacterCategory.LETTER_NUMBER)
3572 | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
3573 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
3574 | (1 << UCharacterCategory.COMBINING_SPACING_MARK)
3575 | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
3576 || isIdentifierIgnorable(ch);
3580 * Determines if the specified code point is permissible as the first
3581 * character in a Unicode identifier.
3582 * A code point may start a Unicode identifier if it is of type either
3584 * <li> Lu Uppercase letter
3585 * <li> Ll Lowercase letter
3586 * <li> Lt Titlecase letter
3587 * <li> Lm Modifier letter
3588 * <li> Lo Other letter
3589 * <li> Nl Letter number
3591 * Up-to-date Unicode implementation of
3592 * java.lang.Character.isUnicodeIdentifierStart().<br>
3593 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3594 * @param ch code point to determine if it can start a Unicode identifier
3595 * @return true if code point is the first character belonging a unicode
3599 public static boolean isUnicodeIdentifierStart(int ch)
3601 /*int cat = getType(ch);*/
3602 // if props == 0, it will just fall through and return false
3603 return ((1 << getType(ch))
3604 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3605 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3606 | (1 << UCharacterCategory.TITLECASE_LETTER)
3607 | (1 << UCharacterCategory.MODIFIER_LETTER)
3608 | (1 << UCharacterCategory.OTHER_LETTER)
3609 | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
3613 * Determines if the specified code point should be regarded as an
3614 * ignorable character in a Java identifier.
3615 * A character is Java-identifier-ignorable if it has the general category
3616 * Cf Formatting Control, or it is a non-Java-whitespace ISO control:
3617 * U+0000..U+0008, U+000E..U+001B, U+007F..U+009F.<br>
3618 * Up-to-date Unicode implementation of
3619 * java.lang.Character.isIdentifierIgnorable().<br>
3620 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3621 * <p>Note that Unicode just recommends to ignore Cf (format controls).
3622 * @param ch code point to be determined if it can be ignored in a Unicode
3624 * @return true if the code point is ignorable
3627 public static boolean isIdentifierIgnorable(int ch)
3629 // see java.lang.Character.isIdentifierIgnorable() on range of
3630 // ignorable characters.
3632 return isISOControl(ch)
3633 && !((ch >= 0x9 && ch <= 0xd)
3634 || (ch >= 0x1c && ch <= 0x1f));
3636 return getType(ch) == UCharacterCategory.FORMAT;
3640 * Determines if the specified code point is an uppercase character.
3641 * UnicodeData only contains case mappings for code point where they are
3642 * one-to-one mappings; it also omits information about context-sensitive
3643 * case mappings.<br>
3644 * For language specific case conversion behavior, use
3645 * toUpperCase(locale, str). <br>
3646 * For example, the case conversion for dot-less i and dotted I in Turkish,
3647 * or for final sigma in Greek.
3648 * For more information about Unicode case mapping please refer to the
3649 * <a href=http://www.unicode.org/unicode/reports/tr21/>
3650 * Technical report #21</a>.<br>
3651 * Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
3652 * @param ch code point to determine if it is in uppercase
3653 * @return true if the code point is an uppercase character
3656 public static boolean isUpperCase(int ch)
3658 // if props == 0, it will just fall through and return false
3659 return getType(ch) == UCharacterCategory.UPPERCASE_LETTER;
3663 * The given code point is mapped to its lowercase equivalent; if the code
3664 * point has no lowercase equivalent, the code point itself is returned.
3665 * Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
3667 * <p>This function only returns the simple, single-code point case mapping.
3668 * Full case mappings should be used whenever possible because they produce
3669 * better results by working on whole strings.
3670 * They take into account the string context and the language and can map
3671 * to a result string with a different length as appropriate.
3672 * Full case mappings are applied by the case mapping functions
3673 * that take String parameters rather than code points (int).
3674 * See also the User Guide chapter on C/POSIX migration:
3675 * http://www.icu-project.org/userguide/posix.html#case_mappings
3677 * @param ch code point whose lowercase equivalent is to be retrieved
3678 * @return the lowercase equivalent code point
3681 public static int toLowerCase(int ch) {
3682 return UCaseProps.INSTANCE.tolower(ch);
3686 * Converts argument code point and returns a String object representing
3687 * the code point's value in UTF16 format.
3688 * The result is a string whose length is 1 for non-supplementary code
3689 * points, 2 otherwise.<br>
3690 * com.ibm.icu.text.UTF16 can be used to parse Strings generated by this
3692 * Up-to-date Unicode implementation of java.lang.Character.toString()
3693 * @param ch code point
3694 * @return string representation of the code point, null if code point is not
3695 * defined in unicode
3698 public static String toString(int ch)
3700 if (ch < MIN_VALUE || ch > MAX_VALUE) {
3704 if (ch < SUPPLEMENTARY_MIN_VALUE) {
3705 return String.valueOf((char)ch);
3708 StringBuilder result = new StringBuilder();
3709 result.append(UTF16.getLeadSurrogate(ch));
3710 result.append(UTF16.getTrailSurrogate(ch));
3711 return result.toString();
3715 * Converts the code point argument to titlecase.
3716 * If no titlecase is available, the uppercase is returned. If no uppercase
3717 * is available, the code point itself is returned.
3718 * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
3720 * <p>This function only returns the simple, single-code point case mapping.
3721 * Full case mappings should be used whenever possible because they produce
3722 * better results by working on whole strings.
3723 * They take into account the string context and the language and can map
3724 * to a result string with a different length as appropriate.
3725 * Full case mappings are applied by the case mapping functions
3726 * that take String parameters rather than code points (int).
3727 * See also the User Guide chapter on C/POSIX migration:
3728 * http://www.icu-project.org/userguide/posix.html#case_mappings
3730 * @param ch code point whose title case is to be retrieved
3731 * @return titlecase code point
3734 public static int toTitleCase(int ch) {
3735 return UCaseProps.INSTANCE.totitle(ch);
3739 * Converts the character argument to uppercase.
3740 * If no uppercase is available, the character itself is returned.
3741 * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
3743 * <p>This function only returns the simple, single-code point case mapping.
3744 * Full case mappings should be used whenever possible because they produce
3745 * better results by working on whole strings.
3746 * They take into account the string context and the language and can map
3747 * to a result string with a different length as appropriate.
3748 * Full case mappings are applied by the case mapping functions
3749 * that take String parameters rather than code points (int).
3750 * See also the User Guide chapter on C/POSIX migration:
3751 * http://www.icu-project.org/userguide/posix.html#case_mappings
3753 * @param ch code point whose uppercase is to be retrieved
3754 * @return uppercase code point
3757 public static int toUpperCase(int ch) {
3758 return UCaseProps.INSTANCE.toupper(ch);
3761 // extra methods not in java.lang.Character --------------------------
3764 * {@icu} Determines if the code point is a supplementary character.
3765 * A code point is a supplementary character if and only if it is greater
3766 * than <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
3767 * @param ch code point to be determined if it is in the supplementary
3769 * @return true if code point is a supplementary character
3772 public static boolean isSupplementary(int ch)
3774 return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE &&
3775 ch <= UCharacter.MAX_VALUE;
3779 * {@icu} Determines if the code point is in the BMP plane.
3780 * @param ch code point to be determined if it is not a supplementary
3782 * @return true if code point is not a supplementary character
3785 public static boolean isBMP(int ch)
3787 return (ch >= 0 && ch <= LAST_CHAR_MASK_);
3791 * {@icu} Determines whether the specified code point is a printable character
3792 * according to the Unicode standard.
3793 * @param ch code point to be determined if it is printable
3794 * @return true if the code point is a printable character
3797 public static boolean isPrintable(int ch)
3799 int cat = getType(ch);
3800 // if props == 0, it will just fall through and return false
3801 return (cat != UCharacterCategory.UNASSIGNED &&
3802 cat != UCharacterCategory.CONTROL &&
3803 cat != UCharacterCategory.FORMAT &&
3804 cat != UCharacterCategory.PRIVATE_USE &&
3805 cat != UCharacterCategory.SURROGATE &&
3806 cat != UCharacterCategory.GENERAL_OTHER_TYPES);
3810 * {@icu} Determines whether the specified code point is of base form.
3811 * A code point of base form does not graphically combine with preceding
3812 * characters, and is neither a control nor a format character.
3813 * @param ch code point to be determined if it is of base form
3814 * @return true if the code point is of base form
3817 public static boolean isBaseForm(int ch)
3819 int cat = getType(ch);
3820 // if props == 0, it will just fall through and return false
3821 return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
3822 cat == UCharacterCategory.OTHER_NUMBER ||
3823 cat == UCharacterCategory.LETTER_NUMBER ||
3824 cat == UCharacterCategory.UPPERCASE_LETTER ||
3825 cat == UCharacterCategory.LOWERCASE_LETTER ||
3826 cat == UCharacterCategory.TITLECASE_LETTER ||
3827 cat == UCharacterCategory.MODIFIER_LETTER ||
3828 cat == UCharacterCategory.OTHER_LETTER ||
3829 cat == UCharacterCategory.NON_SPACING_MARK ||
3830 cat == UCharacterCategory.ENCLOSING_MARK ||
3831 cat == UCharacterCategory.COMBINING_SPACING_MARK;
3835 * {@icu} Returns the Bidirection property of a code point.
3836 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
3838 * Result returned belongs to the interface
3839 * <a href=UCharacterDirection.html>UCharacterDirection</a>
3840 * @param ch the code point to be determined its direction
3841 * @return direction constant from UCharacterDirection.
3844 public static int getDirection(int ch)
3846 return UBiDiProps.INSTANCE.getClass(ch);
3850 * Determines whether the code point has the "mirrored" property.
3851 * This property is set for characters that are commonly used in
3852 * Right-To-Left contexts and need to be displayed with a "mirrored"
3854 * @param ch code point whose mirror is to be determined
3855 * @return true if the code point has the "mirrored" property
3858 public static boolean isMirrored(int ch)
3860 return UBiDiProps.INSTANCE.isMirrored(ch);
3864 * {@icu} Maps the specified code point to a "mirror-image" code point.
3865 * For code points with the "mirrored" property, implementations sometimes
3866 * need a "poor man's" mapping to another code point such that the default
3867 * glyph may serve as the mirror-image of the default glyph of the
3868 * specified code point.<br>
3869 * This is useful for text conversion to and from codepages with visual
3870 * order, and for displays without glyph selection capabilities.
3871 * @param ch code point whose mirror is to be retrieved
3872 * @return another code point that may serve as a mirror-image substitute,
3873 * or ch itself if there is no such mapping or ch does not have the
3874 * "mirrored" property
3877 public static int getMirror(int ch)
3879 return UBiDiProps.INSTANCE.getMirror(ch);
3883 * {@icu} Returns the combining class of the argument codepoint
3884 * @param ch code point whose combining is to be retrieved
3885 * @return the combining class of the codepoint
3888 public static int getCombiningClass(int ch)
3890 if (ch < MIN_VALUE || ch > MAX_VALUE) {
3891 throw new IllegalArgumentException("Codepoint out of bounds");
3893 Normalizer2Impl impl = Norm2AllModes.getNFCInstance().impl;
3894 return impl.getCC(impl.getNorm16(ch));
3898 * {@icu} A code point is illegal if and only if
3900 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
3901 * <li> A surrogate value, 0xD800 to 0xDFFF
3902 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
3904 * Note: legal does not mean that it is assigned in this version of Unicode.
3905 * @param ch code point to determine if it is a legal code point by itself
3906 * @return true if and only if legal.
3909 public static boolean isLegal(int ch)
3911 if (ch < MIN_VALUE) {
3914 if (ch < UTF16.SURROGATE_MIN_VALUE) {
3917 if (ch <= UTF16.SURROGATE_MAX_VALUE) {
3920 if (UCharacterUtility.isNonCharacter(ch)) {
3923 return (ch <= MAX_VALUE);
3927 * {@icu} A string is legal iff all its code points are legal.
3928 * A code point is illegal if and only if
3930 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
3931 * <li> A surrogate value, 0xD800 to 0xDFFF
3932 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
3934 * Note: legal does not mean that it is assigned in this version of Unicode.
3935 * @param str containing code points to examine
3936 * @return true if and only if legal.
3939 public static boolean isLegal(String str)
3941 int size = str.length();
3943 for (int i = 0; i < size; i ++)
3945 codepoint = UTF16.charAt(str, i);
3946 if (!isLegal(codepoint)) {
3949 if (isSupplementary(codepoint)) {
3957 * {@icu} Returns the version of Unicode data used.
3958 * @return the unicode version number used
3961 public static VersionInfo getUnicodeVersion()
3963 return UCharacterProperty.INSTANCE.m_unicodeVersion_;
3967 * {@icu} Returns the most current Unicode name of the argument code point, or
3968 * null if the character is unassigned or outside the range
3969 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
3971 * Note calling any methods related to code point names, e.g. get*Name*()
3972 * incurs a one-time initialisation cost to construct the name tables.
3973 * @param ch the code point for which to get the name
3974 * @return most current Unicode name
3977 public static String getName(int ch)
3979 return UCharacterName.INSTANCE.getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
3983 * {@icu} Returns the names for each of the characters in a string
3984 * @param s string to format
3985 * @param separator string to go between names
3986 * @return string of names
3989 public static String getName(String s, String separator) {
3990 if (s.length() == 1) { // handle common case
3991 return getName(s.charAt(0));
3994 StringBuilder sb = new StringBuilder();
3995 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
3996 cp = UTF16.charAt(s,i);
3997 if (i != 0) sb.append(separator);
3998 sb.append(UCharacter.getName(cp));
4000 return sb.toString();
4004 * {@icu} Returns the earlier version 1.0 Unicode name of the argument code
4005 * point, or null if the character is unassigned or outside the range
4006 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
4008 * Note calling any methods related to code point names, e.g. get*Name*()
4009 * incurs a one-time initialisation cost to construct the name tables.
4010 * @param ch the code point for which to get the name
4011 * @return version 1.0 Unicode name
4014 public static String getName1_0(int ch)
4016 return UCharacterName.INSTANCE.getName(ch,
4017 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
4021 * {@icu} Returns a name for a valid codepoint. Unlike, getName(int) and
4022 * getName1_0(int), this method will return a name even for codepoints that
4023 * are not assigned a name in UnicodeData.txt.
4025 * The names are returned in the following order.
4027 * <li> Most current Unicode name if there is any
4028 * <li> Unicode 1.0 name if there is any
4029 * <li> Extended name in the form of
4030 * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-fffe>
4032 * Note calling any methods related to code point names, e.g. get*Name*()
4033 * incurs a one-time initialisation cost to construct the name tables.
4034 * @param ch the code point for which to get the name
4035 * @return a name for the argument codepoint
4038 public static String getExtendedName(int ch) {
4039 return UCharacterName.INSTANCE.getName(ch, UCharacterNameChoice.EXTENDED_CHAR_NAME);
4043 * {@icu} Returns the corrected name from NameAliases.txt if there is one.
4044 * Returns null if the character is unassigned or outside the range
4045 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
4047 * Note calling any methods related to code point names, e.g. get*Name*()
4048 * incurs a one-time initialisation cost to construct the name tables.
4049 * @param ch the code point for which to get the name alias
4050 * @return Unicode name alias, or null
4053 public static String getNameAlias(int ch)
4055 return UCharacterName.INSTANCE.getName(ch, UCharacterNameChoice.CHAR_NAME_ALIAS);
4059 * {@icu} Returns the ISO 10646 comment for a character.
4060 * The ISO 10646 comment is an informative field in the Unicode Character
4061 * Database (UnicodeData.txt field 11) and is from the ISO 10646 names list.
4063 * Note: Unicode 5.2 removes all ISO comment data, resulting in empty strings
4064 * returned for all characters.
4066 * @param ch The code point for which to get the ISO comment.
4067 * It must be the case that {@code 0 <= ch <= 0x10ffff}.
4068 * @return The ISO comment, or null if there is no comment for this
4072 public static String getISOComment(int ch)
4074 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE) {
4078 String result = UCharacterName.INSTANCE.getGroupName(ch,
4079 UCharacterNameChoice.ISO_COMMENT_);
4084 * {@icu} <p>Finds a Unicode code point by its most current Unicode name and
4085 * return its code point value. All Unicode names are in uppercase.</p>
4086 * Note calling any methods related to code point names, e.g. get*Name*()
4087 * incurs a one-time initialisation cost to construct the name tables.
4088 * @param name most current Unicode character name whose code point is to
4090 * @return code point or -1 if name is not found
4093 public static int getCharFromName(String name){
4094 return UCharacterName.INSTANCE.getCharFromName(
4095 UCharacterNameChoice.UNICODE_CHAR_NAME, name);
4099 * {@icu} <p>Find a Unicode character by its version 1.0 Unicode name and return
4100 * its code point value. All Unicode names are in uppercase.</p>
4101 * Note calling any methods related to code point names, e.g. get*Name*()
4102 * incurs a one-time initialisation cost to construct the name tables.
4103 * @param name Unicode 1.0 code point name whose code point is to
4105 * @return code point or -1 if name is not found
4108 public static int getCharFromName1_0(String name){
4109 return UCharacterName.INSTANCE.getCharFromName(
4110 UCharacterNameChoice.UNICODE_10_CHAR_NAME, name);
4114 * {@icu} <p>Find a Unicode character by either its name and return its code
4115 * point value. All Unicode names are in uppercase.
4116 * Extended names are all lowercase except for numbers and are contained
4117 * within angle brackets.</p>
4118 * The names are searched in the following order
4120 * <li> Most current Unicode name if there is any
4121 * <li> Unicode 1.0 name if there is any
4122 * <li> Extended name in the form of
4123 * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-FFFE>
4125 * Note calling any methods related to code point names, e.g. get*Name*()
4126 * incurs a one-time initialisation cost to construct the name tables.
4127 * @param name codepoint name
4128 * @return code point associated with the name or -1 if the name is not
4132 public static int getCharFromExtendedName(String name){
4133 return UCharacterName.INSTANCE.getCharFromName(
4134 UCharacterNameChoice.EXTENDED_CHAR_NAME, name);
4138 * {@icu} <p>Find a Unicode character by its corrected name alias and return
4139 * its code point value. All Unicode names are in uppercase.</p>
4140 * Note calling any methods related to code point names, e.g. get*Name*()
4141 * incurs a one-time initialisation cost to construct the name tables.
4142 * @param name Unicode name alias whose code point is to be returned
4143 * @return code point or -1 if name is not found
4146 public static int getCharFromNameAlias(String name){
4147 return UCharacterName.INSTANCE.getCharFromName(UCharacterNameChoice.CHAR_NAME_ALIAS, name);
4151 * {@icu} Return the Unicode name for a given property, as given in the
4152 * Unicode database file PropertyAliases.txt. Most properties
4153 * have more than one name. The nameChoice determines which one
4156 * In addition, this function maps the property
4157 * UProperty.GENERAL_CATEGORY_MASK to the synthetic names "gcm" /
4158 * "General_Category_Mask". These names are not in
4159 * PropertyAliases.txt.
4161 * @param property UProperty selector.
4163 * @param nameChoice UProperty.NameChoice selector for which name
4164 * to get. All properties have a long name. Most have a short
4165 * name, but some do not. Unicode allows for additional names; if
4166 * present these will be returned by UProperty.NameChoice.LONG + i,
4169 * @return a name, or null if Unicode explicitly defines no name
4170 * ("n/a") for a given property/nameChoice. If a given nameChoice
4171 * throws an exception, then all larger values of nameChoice will
4172 * throw an exception. If null is returned for a given
4173 * nameChoice, then other nameChoice values may return non-null
4176 * @exception IllegalArgumentException thrown if property or
4177 * nameChoice are invalid.
4180 * @see UProperty.NameChoice
4183 public static String getPropertyName(int property,
4185 return UPropertyAliases.INSTANCE.getPropertyName(property, nameChoice);
4189 * {@icu} Return the UProperty selector for a given property name, as
4190 * specified in the Unicode database file PropertyAliases.txt.
4191 * Short, long, and any other variants are recognized.
4193 * In addition, this function maps the synthetic names "gcm" /
4194 * "General_Category_Mask" to the property
4195 * UProperty.GENERAL_CATEGORY_MASK. These names are not in
4196 * PropertyAliases.txt.
4198 * @param propertyAlias the property name to be matched. The name
4199 * is compared using "loose matching" as described in
4200 * PropertyAliases.txt.
4202 * @return a UProperty enum.
4204 * @exception IllegalArgumentException thrown if propertyAlias
4205 * is not recognized.
4210 public static int getPropertyEnum(CharSequence propertyAlias) {
4211 int propEnum = UPropertyAliases.INSTANCE.getPropertyEnum(propertyAlias);
4212 if (propEnum == UProperty.UNDEFINED) {
4213 throw new IllegalIcuArgumentException("Invalid name: " + propertyAlias);
4219 * {@icu} Return the Unicode name for a given property value, as given in
4220 * the Unicode database file PropertyValueAliases.txt. Most
4221 * values have more than one name. The nameChoice determines
4222 * which one is returned.
4224 * Note: Some of the names in PropertyValueAliases.txt can only be
4225 * retrieved using UProperty.GENERAL_CATEGORY_MASK, not
4226 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
4227 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
4228 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
4230 * @param property UProperty selector constant.
4231 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
4232 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
4233 * UProperty.MASK_START < = property < UProperty.MASK_LIMIT.
4234 * If out of range, null is returned.
4236 * @param value selector for a value for the given property. In
4237 * general, valid values range from 0 up to some maximum. There
4238 * are a few exceptions: (1.) UProperty.BLOCK values begin at the
4239 * non-zero value BASIC_LATIN.getID(). (2.)
4240 * UProperty.CANONICAL_COMBINING_CLASS values are not contiguous
4241 * and range from 0..240. (3.) UProperty.GENERAL_CATEGORY_MASK values
4242 * are mask values produced by left-shifting 1 by
4243 * UCharacter.getType(). This allows grouped categories such as
4244 * [:L:] to be represented. Mask values are non-contiguous.
4246 * @param nameChoice UProperty.NameChoice selector for which name
4247 * to get. All values have a long name. Most have a short name,
4248 * but some do not. Unicode allows for additional names; if
4249 * present these will be returned by UProperty.NameChoice.LONG + i,
4252 * @return a name, or null if Unicode explicitly defines no name
4253 * ("n/a") for a given property/value/nameChoice. If a given
4254 * nameChoice throws an exception, then all larger values of
4255 * nameChoice will throw an exception. If null is returned for a
4256 * given nameChoice, then other nameChoice values may return
4259 * @exception IllegalArgumentException thrown if property, value,
4260 * or nameChoice are invalid.
4263 * @see UProperty.NameChoice
4266 public static String getPropertyValueName(int property,
4270 if ((property == UProperty.CANONICAL_COMBINING_CLASS
4271 || property == UProperty.LEAD_CANONICAL_COMBINING_CLASS
4272 || property == UProperty.TRAIL_CANONICAL_COMBINING_CLASS)
4273 && value >= UCharacter.getIntPropertyMinValue(
4274 UProperty.CANONICAL_COMBINING_CLASS)
4275 && value <= UCharacter.getIntPropertyMaxValue(
4276 UProperty.CANONICAL_COMBINING_CLASS)
4277 && nameChoice >= 0 && nameChoice < UProperty.NameChoice.COUNT) {
4278 // this is hard coded for the valid cc
4279 // because PropertyValueAliases.txt does not contain all of them
4281 return UPropertyAliases.INSTANCE.getPropertyValueName(property, value,
4284 catch (IllegalArgumentException e) {
4288 return UPropertyAliases.INSTANCE.getPropertyValueName(property, value, nameChoice);
4292 * {@icu} Return the property value integer for a given value name, as
4293 * specified in the Unicode database file PropertyValueAliases.txt.
4294 * Short, long, and any other variants are recognized.
4296 * Note: Some of the names in PropertyValueAliases.txt will only be
4297 * recognized with UProperty.GENERAL_CATEGORY_MASK, not
4298 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
4299 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
4300 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
4302 * @param property UProperty selector constant.
4303 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
4304 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
4305 * UProperty.MASK_START < = property < UProperty.MASK_LIMIT.
4306 * Only these properties can be enumerated.
4308 * @param valueAlias the value name to be matched. The name is
4309 * compared using "loose matching" as described in
4310 * PropertyValueAliases.txt.
4312 * @return a value integer. Note: UProperty.GENERAL_CATEGORY
4313 * values are mask values produced by left-shifting 1 by
4314 * UCharacter.getType(). This allows grouped categories such as
4315 * [:L:] to be represented.
4318 * @throws IllegalArgumentException if property is not a valid UProperty
4319 * selector or valueAlias is not a value of this property
4322 public static int getPropertyValueEnum(int property, CharSequence valueAlias) {
4323 int propEnum = UPropertyAliases.INSTANCE.getPropertyValueEnum(property, valueAlias);
4324 if (propEnum == UProperty.UNDEFINED) {
4325 throw new IllegalIcuArgumentException("Invalid name: " + valueAlias);
4331 * {@icu} Returns a code point corresponding to the two UTF16 characters.
4332 * @param lead the lead char
4333 * @param trail the trail char
4334 * @return code point if surrogate characters are valid.
4335 * @exception IllegalArgumentException thrown when argument characters do
4336 * not form a valid codepoint
4339 public static int getCodePoint(char lead, char trail)
4341 if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
4342 return UCharacterProperty.getRawSupplementary(lead, trail);
4344 throw new IllegalArgumentException("Illegal surrogate characters");
4348 * {@icu} Returns the code point corresponding to the UTF16 character.
4349 * @param char16 the UTF16 character
4350 * @return code point if argument is a valid character.
4351 * @exception IllegalArgumentException thrown when char16 is not a valid
4355 public static int getCodePoint(char char16)
4357 if (UCharacter.isLegal(char16)) {
4360 throw new IllegalArgumentException("Illegal codepoint");
4364 * Implementation of UCaseProps.ContextIterator, iterates over a String.
4365 * See ustrcase.c/utf16_caseContextIterator().
4367 private static class StringContextIterator implements UCaseProps.ContextIterator {
4370 * @param s String to iterate over.
4372 StringContextIterator(String s) {
4375 cpStart=cpLimit=index=0;
4380 * Set the iteration limit for nextCaseMapCP() to an index within the string.
4381 * If the limit parameter is negative or past the string, then the
4382 * string length is restored as the iteration limit.
4384 * This limit does not affect the next() function which always
4385 * iterates to the very end of the string.
4387 * @param lim The iteration limit.
4389 public void setLimit(int lim) {
4390 if(0<=lim && lim<=s.length()) {
4398 * Move to the iteration limit without fetching code points up to there.
4400 public void moveToLimit() {
4401 cpStart=cpLimit=limit;
4405 * Iterate forward through the string to fetch the next code point
4406 * to be case-mapped, and set the context indexes for it.
4407 * Performance optimization, to save on function calls and redundant
4408 * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
4410 * When the iteration limit is reached (and -1 is returned),
4411 * getCPStart() will be at the iteration limit.
4413 * Iteration with next() does not affect the position for nextCaseMapCP().
4415 * @return The next code point to be case-mapped, or <0 when the iteration is done.
4417 public int nextCaseMapCP() {
4420 int c=s.charAt(cpLimit++);
4421 if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
4423 if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
4424 UTF16.TRAIL_SURROGATE_MIN_VALUE<=(c2=s.charAt(cpLimit)) &&
4425 c2<=UTF16.TRAIL_SURROGATE_MAX_VALUE
4427 // supplementary code point
4429 c=UCharacterProperty.getRawSupplementary((char)c, c2);
4430 // else unpaired surrogate code point
4432 // else BMP code point
4441 * Returns the start of the code point that was last returned
4442 * by nextCaseMapCP().
4444 public int getCPStart() {
4449 * Returns the limit of the code point that was last returned
4450 * by nextCaseMapCP().
4452 public int getCPLimit() {
4456 // implement UCaseProps.ContextIterator
4457 // The following code is not used anywhere in this private class
4458 public void reset(int direction) {
4460 /* reset for forward iteration */
4463 } else if(direction<0) {
4464 /* reset for backward iteration */
4468 // not a valid direction
4477 if(dir>0 && index<s.length()) {
4478 c=UTF16.charAt(s, index);
4479 index+=UTF16.getCharCount(c);
4481 } else if(dir<0 && index>0) {
4482 c=UTF16.charAt(s, index-1);
4483 index-=UTF16.getCharCount(c);
4491 protected int index, limit, cpStart, cpLimit;
4492 protected int dir; // 0=initial state >0=forward <0=backward
4496 * Returns the uppercase version of the argument string.
4497 * Casing is dependent on the default locale and context-sensitive.
4498 * @param str source string to be performed on
4499 * @return uppercase version of the argument string
4502 public static String toUpperCase(String str)
4504 return toUpperCase(ULocale.getDefault(), str);
4508 * Returns the lowercase version of the argument string.
4509 * Casing is dependent on the default locale and context-sensitive
4510 * @param str source string to be performed on
4511 * @return lowercase version of the argument string
4514 public static String toLowerCase(String str)
4516 return toLowerCase(ULocale.getDefault(), str);
4520 * <p>Returns the titlecase version of the argument string.</p>
4521 * <p>Position for titlecasing is determined by the argument break
4522 * iterator, hence the user can customize his break iterator for
4523 * a specialized titlecasing. In this case only the forward iteration
4524 * needs to be implemented.
4525 * If the break iterator passed in is null, the default Unicode algorithm
4526 * will be used to determine the titlecase positions.
4528 * <p>Only positions returned by the break iterator will be title cased,
4529 * character in between the positions will all be in lower case.</p>
4530 * <p>Casing is dependent on the default locale and context-sensitive</p>
4531 * @param str source string to be performed on
4532 * @param breakiter break iterator to determine the positions in which
4533 * the character should be title cased.
4534 * @return lowercase version of the argument string
4537 public static String toTitleCase(String str, BreakIterator breakiter)
4539 return toTitleCase(ULocale.getDefault(), str, breakiter);
4543 * Returns the uppercase version of the argument string.
4544 * Casing is dependent on the argument locale and context-sensitive.
4545 * @param locale which string is to be converted in
4546 * @param str source string to be performed on
4547 * @return uppercase version of the argument string
4550 public static String toUpperCase(Locale locale, String str)
4552 return toUpperCase(ULocale.forLocale(locale), str);
4556 * Returns the uppercase version of the argument string.
4557 * Casing is dependent on the argument locale and context-sensitive.
4558 * @param locale which string is to be converted in
4559 * @param str source string to be performed on
4560 * @return uppercase version of the argument string
4563 public static String toUpperCase(ULocale locale, String str) {
4564 StringContextIterator iter = new StringContextIterator(str);
4565 StringBuilder result = new StringBuilder(str.length());
4566 int[] locCache = new int[1];
4569 if (locale == null) {
4570 locale = ULocale.getDefault();
4574 while((c=iter.nextCaseMapCP())>=0) {
4575 c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache);
4577 /* decode the result */
4579 /* (not) original code point */
4581 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4582 /* mapping already appended to result */
4584 /* } else { append single-code point mapping */
4586 result.appendCodePoint(c);
4588 return result.toString();
4592 * Returns the lowercase version of the argument string.
4593 * Casing is dependent on the argument locale and context-sensitive
4594 * @param locale which string is to be converted in
4595 * @param str source string to be performed on
4596 * @return lowercase version of the argument string
4599 public static String toLowerCase(Locale locale, String str)
4601 return toLowerCase(ULocale.forLocale(locale), str);
4605 * Returns the lowercase version of the argument string.
4606 * Casing is dependent on the argument locale and context-sensitive
4607 * @param locale which string is to be converted in
4608 * @param str source string to be performed on
4609 * @return lowercase version of the argument string
4612 public static String toLowerCase(ULocale locale, String str) {
4613 StringContextIterator iter = new StringContextIterator(str);
4614 StringBuilder result = new StringBuilder(str.length());
4615 int[] locCache = new int[1];
4618 if (locale == null) {
4619 locale = ULocale.getDefault();
4623 while((c=iter.nextCaseMapCP())>=0) {
4624 c = UCaseProps.INSTANCE.toFullLower(c, iter, result, locale, locCache);
4626 /* decode the result */
4628 /* (not) original code point */
4630 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4631 /* mapping already appended to result */
4633 /* } else { append single-code point mapping */
4635 result.appendCodePoint(c);
4637 return result.toString();
4641 * <p>Returns the titlecase version of the argument string.</p>
4642 * <p>Position for titlecasing is determined by the argument break
4643 * iterator, hence the user can customize his break iterator for
4644 * a specialized titlecasing. In this case only the forward iteration
4645 * needs to be implemented.
4646 * If the break iterator passed in is null, the default Unicode algorithm
4647 * will be used to determine the titlecase positions.
4649 * <p>Only positions returned by the break iterator will be title cased,
4650 * character in between the positions will all be in lower case.</p>
4651 * <p>Casing is dependent on the argument locale and context-sensitive</p>
4652 * @param locale which string is to be converted in
4653 * @param str source string to be performed on
4654 * @param breakiter break iterator to determine the positions in which
4655 * the character should be title cased.
4656 * @return lowercase version of the argument string
4659 public static String toTitleCase(Locale locale, String str,
4660 BreakIterator breakiter)
4662 return toTitleCase(ULocale.forLocale(locale), str, breakiter);
4666 * <p>Returns the titlecase version of the argument string.</p>
4667 * <p>Position for titlecasing is determined by the argument break
4668 * iterator, hence the user can customize his break iterator for
4669 * a specialized titlecasing. In this case only the forward iteration
4670 * needs to be implemented.
4671 * If the break iterator passed in is null, the default Unicode algorithm
4672 * will be used to determine the titlecase positions.
4674 * <p>Only positions returned by the break iterator will be title cased,
4675 * character in between the positions will all be in lower case.</p>
4676 * <p>Casing is dependent on the argument locale and context-sensitive</p>
4677 * @param locale which string is to be converted in
4678 * @param str source string to be performed on
4679 * @param titleIter break iterator to determine the positions in which
4680 * the character should be title cased.
4681 * @return lowercase version of the argument string
4684 public static String toTitleCase(ULocale locale, String str,
4685 BreakIterator titleIter) {
4686 return toTitleCase(locale, str, titleIter, 0);
4690 * <p>Returns the titlecase version of the argument string.</p>
4691 * <p>Position for titlecasing is determined by the argument break
4692 * iterator, hence the user can customize his break iterator for
4693 * a specialized titlecasing. In this case only the forward iteration
4694 * needs to be implemented.
4695 * If the break iterator passed in is null, the default Unicode algorithm
4696 * will be used to determine the titlecase positions.
4698 * <p>Only positions returned by the break iterator will be title cased,
4699 * character in between the positions will all be in lower case.</p>
4700 * <p>Casing is dependent on the argument locale and context-sensitive</p>
4701 * @param locale which string is to be converted in
4702 * @param str source string to be performed on
4703 * @param titleIter break iterator to determine the positions in which
4704 * the character should be title cased.
4705 * @param options bit set to modify the titlecasing operation
4706 * @return lowercase version of the argument string
4708 * @see #TITLECASE_NO_LOWERCASE
4709 * @see #TITLECASE_NO_BREAK_ADJUSTMENT
4711 public static String toTitleCase(ULocale locale, String str,
4712 BreakIterator titleIter,
4714 StringContextIterator iter = new StringContextIterator(str);
4715 StringBuilder result = new StringBuilder(str.length());
4716 int[] locCache = new int[1];
4717 int c, nc, srcLength = str.length();
4719 if (locale == null) {
4720 locale = ULocale.getDefault();
4724 if(titleIter == null) {
4725 titleIter = BreakIterator.getWordInstance(locale);
4727 titleIter.setText(str);
4729 int prev, titleStart, index;
4730 boolean isFirstIndex;
4731 boolean isDutch = locale.getLanguage().equals("nl");
4732 boolean FirstIJ = true;
4734 /* set up local variables */
4738 /* titlecasing loop */
4739 while(prev<srcLength) {
4740 /* find next index where to titlecase */
4743 index=titleIter.first();
4745 index=titleIter.next();
4747 if(index==BreakIterator.DONE || index>srcLength) {
4752 * Unicode 4 & 5 section 3.13 Default Case Operations:
4754 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
4755 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
4756 * cased character F. If F exists, map F to default_title(F); then map each
4757 * subsequent character C to default_lower(C).
4759 * In this implementation, segment [prev..index[ into 3 parts:
4760 * a) uncased characters (copy as-is) [prev..titleStart[
4761 * b) first case letter (titlecase) [titleStart..titleLimit[
4762 * c) subsequent characters (lowercase) [titleLimit..index[
4765 /* find and copy uncased characters [prev..titleStart[ */
4766 iter.setLimit(index);
4767 c=iter.nextCaseMapCP();
4768 if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0
4769 && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
4770 while((c=iter.nextCaseMapCP())>=0
4771 && UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
4772 titleStart=iter.getCPStart();
4773 if(prev<titleStart) {
4774 result.append(str, prev, titleStart);
4780 if(titleStart<index) {
4782 /* titlecase c which is from titleStart */
4783 c = UCaseProps.INSTANCE.toFullTitle(c, iter, result, locale, locCache);
4785 /* decode the result and lowercase up to index */
4788 /* (not) original code point */
4790 result.appendCodePoint(c);
4791 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4792 /* mapping already appended to result */
4794 /* append single-code point mapping */
4795 result.appendCodePoint(c);
4798 if((options&TITLECASE_NO_LOWERCASE)!=0) {
4799 /* Optionally just copy the rest of the word unchanged. */
4801 int titleLimit=iter.getCPLimit();
4802 if(titleLimit<index) {
4803 // TODO: With Java 5, this would want to be
4804 // result.append(str, titleLimit, index);
4805 String appendStr = str.substring(titleLimit,index);
4806 /* Special Case - Dutch IJ Titlecasing */
4807 if ( isDutch && c == 0x0049 && appendStr.startsWith("j")) {
4808 appendStr = "J" + appendStr.substring(1);
4810 result.append(appendStr);
4814 } else if((nc=iter.nextCaseMapCP())>=0) {
4815 if (isDutch && (nc == 0x004A || nc == 0x006A)
4816 && (c == 0x0049) && (FirstIJ == true)) {
4820 /* Normal operation: Lowercase the rest of the word. */
4821 c = UCaseProps.INSTANCE.toFullLower(nc, iter, result, locale,
4833 return result.toString();
4837 * {@icu} The given character is mapped to its case folding equivalent according
4838 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
4839 * folding equivalent, the character itself is returned.
4841 * <p>This function only returns the simple, single-code point case mapping.
4842 * Full case mappings should be used whenever possible because they produce
4843 * better results by working on whole strings.
4844 * They can map to a result string with a different length as appropriate.
4845 * Full case mappings are applied by the case mapping functions
4846 * that take String parameters rather than code points (int).
4847 * See also the User Guide chapter on C/POSIX migration:
4848 * http://www.icu-project.org/userguide/posix.html#case_mappings
4850 * @param ch the character to be converted
4851 * @param defaultmapping Indicates if all mappings defined in
4852 * CaseFolding.txt is to be used, otherwise the
4853 * mappings for dotted I and dotless i marked with
4854 * 'I' in CaseFolding.txt will be skipped.
4855 * @return the case folding equivalent of the character, if
4856 * any; otherwise the character itself.
4857 * @see #foldCase(String, boolean)
4860 public static int foldCase(int ch, boolean defaultmapping) {
4861 return foldCase(ch, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
4865 * {@icu} The given string is mapped to its case folding equivalent according to
4866 * UnicodeData.txt and CaseFolding.txt; if any character has no case
4867 * folding equivalent, the character itself is returned.
4868 * "Full", multiple-code point case folding mappings are returned here.
4869 * For "simple" single-code point mappings use the API
4870 * foldCase(int ch, boolean defaultmapping).
4871 * @param str the String to be converted
4872 * @param defaultmapping Indicates if all mappings defined in
4873 * CaseFolding.txt is to be used, otherwise the
4874 * mappings for dotted I and dotless i marked with
4875 * 'I' in CaseFolding.txt will be skipped.
4876 * @return the case folding equivalent of the character, if
4877 * any; otherwise the character itself.
4878 * @see #foldCase(int, boolean)
4881 public static String foldCase(String str, boolean defaultmapping) {
4882 return foldCase(str, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
/**
 * {@icu} Option value for case folding: use default mappings defined in
 * CaseFolding.txt. Passed as the {@code options} argument of the
 * {@code foldCase} methods.
 */
public static final int FOLD_CASE_DEFAULT = 0x0000;
/**
 * {@icu} Option value for case folding: exclude the mappings for dotted I
 * and dotless i marked with 'I' in CaseFolding.txt. Passed as the
 * {@code options} argument of the {@code foldCase} methods.
 */
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0x0001;
4899 * {@icu} The given character is mapped to its case folding equivalent according
4900 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
4901 * folding equivalent, the character itself is returned.
4903 * <p>This function only returns the simple, single-code point case mapping.
4904 * Full case mappings should be used whenever possible because they produce
4905 * better results by working on whole strings.
4906 * They can map to a result string with a different length as appropriate.
4907 * Full case mappings are applied by the case mapping functions
4908 * that take String parameters rather than code points (int).
4909 * See also the User Guide chapter on C/POSIX migration:
4910 * http://www.icu-project.org/userguide/posix.html#case_mappings
4912 * @param ch the character to be converted
4913 * @param options A bit set for special processing. Currently the recognised options
4914 * are FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
4915 * @return the case folding equivalent of the character, if any; otherwise the
4917 * @see #foldCase(String, boolean)
4920 public static int foldCase(int ch, int options) {
4921 return UCaseProps.INSTANCE.fold(ch, options);
4925 * {@icu} The given string is mapped to its case folding equivalent according to
4926 * UnicodeData.txt and CaseFolding.txt; if any character has no case
4927 * folding equivalent, the character itself is returned.
4928 * "Full", multiple-code point case folding mappings are returned here.
4929 * For "simple" single-code point mappings use the API
4930 * foldCase(int ch, boolean defaultmapping).
4931 * @param str the String to be converted
4932 * @param options A bit set for special processing. Currently the recognised options
4933 * are FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
4934 * @return the case folding equivalent of the character, if any; otherwise the
4936 * @see #foldCase(int, boolean)
4939 public static final String foldCase(String str, int options) {
4940 StringBuilder result = new StringBuilder(str.length());
4943 length = str.length();
4944 for(i=0; i<length;) {
4945 c=UTF16.charAt(str, i);
4946 i+=UTF16.getCharCount(c);
4947 c = UCaseProps.INSTANCE.toFullFolding(c, result, options);
4949 /* decode the result */
4951 /* (not) original code point */
4953 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4954 /* mapping already appended to result */
4956 /* } else { append single-code point mapping */
4958 result.appendCodePoint(c);
4960 return result.toString();
4964 * {@icu} Return numeric value of Han code points.
4965 * <br> This returns the value of Han 'numeric' code points,
4966 * including those for zero, ten, hundred, thousand, ten thousand,
4967 * and hundred million.
4968 * This includes both the standard and 'checkwriting'
4969 * characters, the 'big circle' zero character, and the standard
4971 * @param ch code point to query
4972 * @return value if it is a Han 'numeric character,' otherwise return -1.
4975 public static int getHanNumericValue(int ch)
4977 // TODO: Are these all covered by Unicode numeric value data?
4980 case IDEOGRAPHIC_NUMBER_ZERO_ :
4981 case CJK_IDEOGRAPH_COMPLEX_ZERO_ :
4982 return 0; // Han Zero
4983 case CJK_IDEOGRAPH_FIRST_ :
4984 case CJK_IDEOGRAPH_COMPLEX_ONE_ :
4985 return 1; // Han One
4986 case CJK_IDEOGRAPH_SECOND_ :
4987 case CJK_IDEOGRAPH_COMPLEX_TWO_ :
4988 return 2; // Han Two
4989 case CJK_IDEOGRAPH_THIRD_ :
4990 case CJK_IDEOGRAPH_COMPLEX_THREE_ :
4991 return 3; // Han Three
4992 case CJK_IDEOGRAPH_FOURTH_ :
4993 case CJK_IDEOGRAPH_COMPLEX_FOUR_ :
4994 return 4; // Han Four
4995 case CJK_IDEOGRAPH_FIFTH_ :
4996 case CJK_IDEOGRAPH_COMPLEX_FIVE_ :
4997 return 5; // Han Five
4998 case CJK_IDEOGRAPH_SIXTH_ :
4999 case CJK_IDEOGRAPH_COMPLEX_SIX_ :
5000 return 6; // Han Six
5001 case CJK_IDEOGRAPH_SEVENTH_ :
5002 case CJK_IDEOGRAPH_COMPLEX_SEVEN_ :
5003 return 7; // Han Seven
5004 case CJK_IDEOGRAPH_EIGHTH_ :
5005 case CJK_IDEOGRAPH_COMPLEX_EIGHT_ :
5006 return 8; // Han Eight
5007 case CJK_IDEOGRAPH_NINETH_ :
5008 case CJK_IDEOGRAPH_COMPLEX_NINE_ :
5009 return 9; // Han Nine
5010 case CJK_IDEOGRAPH_TEN_ :
5011 case CJK_IDEOGRAPH_COMPLEX_TEN_ :
5013 case CJK_IDEOGRAPH_HUNDRED_ :
5014 case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ :
5016 case CJK_IDEOGRAPH_THOUSAND_ :
5017 case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ :
5019 case CJK_IDEOGRAPH_TEN_THOUSAND_ :
5021 case CJK_IDEOGRAPH_HUNDRED_MILLION_ :
5024 return -1; // no value
5028 * {@icu} <p>Returns an iterator for character types, iterating over codepoints.</p>
5029 * Example of use:<br>
5031 * RangeValueIterator iterator = UCharacter.getTypeIterator();
5032 * RangeValueIterator.Element element = new RangeValueIterator.Element();
5033 * while (iterator.next(element)) {
5034 * System.out.println("Codepoint \\u" +
5035 * Integer.toHexString(element.start) +
5036 * " to codepoint \\u" +
5037 * Integer.toHexString(element.limit - 1) +
5038 * " has the character type " +
5042 * @return an iterator
5045 public static RangeValueIterator getTypeIterator()
5047 return new UCharacterTypeIterator();
5050 private static final class UCharacterTypeIterator implements RangeValueIterator {
5051 UCharacterTypeIterator() {
5055 // implements RangeValueIterator
5056 public boolean next(Element element) {
5057 if(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
5058 element.start=range.startCodePoint;
5059 element.limit=range.endCodePoint+1;
5060 element.value=range.value;
5067 // implements RangeValueIterator
5068 public void reset() {
5069 trieIterator=UCharacterProperty.INSTANCE.m_trie_.iterator(MASK_TYPE);
5072 private Iterator<Trie2.Range> trieIterator;
5073 private Trie2.Range range;
5075 private static final class MaskType implements Trie2.ValueMapper {
5076 // Extracts the general category ("character type") from the trie value.
5077 public int map(int value) {
5078 return value & UCharacterProperty.TYPE_MASK;
5081 private static final MaskType MASK_TYPE=new MaskType();
5085 * {@icu} <p>Returns an iterator for character names, iterating over codepoints.</p>
5086 * <p>This API only gets the iterator for the modern, most up-to-date
5087 * Unicode names. For older 1.0 Unicode names use get1_0NameIterator() or
5088 * for extended names use getExtendedNameIterator().</p>
5089 * Example of use:<br>
5091 * ValueIterator iterator = UCharacter.getNameIterator();
5092 * ValueIterator.Element element = new ValueIterator.Element();
5093 * while (iterator.next(element)) {
5094 * System.out.println("Codepoint \\u" +
5095 * Integer.toHexString(element.codepoint) +
5096 * " has the name " + (String)element.value);
5099 * <p>The maximal range which the name iterator iterates is from
5100 * UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
5101 * @return an iterator
5104 public static ValueIterator getNameIterator(){
5105 return new UCharacterNameIterator(UCharacterName.INSTANCE,
5106 UCharacterNameChoice.UNICODE_CHAR_NAME);
5110 * {@icu} <p>Returns an iterator for character names, iterating over codepoints.</p>
5111 * <p>This API only gets the iterator for the older 1.0 Unicode names.
5112 * For modern, most up-to-date Unicode names use getNameIterator() or
5113 * for extended names use getExtendedNameIterator().</p>
5114 * Example of use:<br>
5116 * ValueIterator iterator = UCharacter.get1_0NameIterator();
5117 * ValueIterator.Element element = new ValueIterator.Element();
5118 * while (iterator.next(element)) {
5119 * System.out.println("Codepoint \\u" +
5120 * Integer.toHexString(element.codepoint) +
5121 * " has the name " + (String)element.value);
5124 * <p>The maximal range which the name iterator iterates is from
5125 * @return an iterator
5128 public static ValueIterator getName1_0Iterator(){
5129 return new UCharacterNameIterator(UCharacterName.INSTANCE,
5130 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
5134 * {@icu} <p>Returns an iterator for character names, iterating over codepoints.</p>
5135 * <p>This API only gets the iterator for the extended names.
5136 * For modern, most up-to-date Unicode names use getNameIterator() or
5137 * for older 1.0 Unicode names use get1_0NameIterator().</p>
5138 * Example of use:<br>
5140 * ValueIterator iterator = UCharacter.getExtendedNameIterator();
5141 * ValueIterator.Element element = new ValueIterator.Element();
5142 * while (iterator.next(element)) {
5143 * System.out.println("Codepoint \\u" +
5144 * Integer.toHexString(element.codepoint) +
5145 * " has the name " + (String)element.value);
5148 * <p>The maximal range which the name iterator iterates is from
5149 * @return an iterator
5152 public static ValueIterator getExtendedNameIterator(){
5153 return new UCharacterNameIterator(UCharacterName.INSTANCE,
5154 UCharacterNameChoice.EXTENDED_CHAR_NAME);
5158 * {@icu} Returns the "age" of the code point.</p>
5159 * <p>The "age" is the Unicode version when the code point was first
5160 * designated (as a non-character or for Private Use) or assigned a
5162 * <p>This can be useful to avoid emitting code points to receiving
5163 * processes that do not accept newer characters.</p>
5164 * <p>The data is from the UCD file DerivedAge.txt.</p>
5165 * @param ch The code point.
5166 * @return the Unicode version number
5169 public static VersionInfo getAge(int ch)
5171 if (ch < MIN_VALUE || ch > MAX_VALUE) {
5172 throw new IllegalArgumentException("Codepoint out of bounds");
5174 return UCharacterProperty.INSTANCE.getAge(ch);
5178 * {@icu} <p>Check a binary Unicode property for a code point.</p>
5179 * <p>Unicode, especially in version 3.2, defines many more properties
5180 * than the original set in UnicodeData.txt.</p>
5181 * <p>This API is intended to reflect Unicode properties as defined in
5182 * the Unicode Character Database (UCD) and Unicode Technical Reports
5184 * <p>For details about the properties see
5185 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
5186 * <p>For names of Unicode properties see the UCD file
5187 * PropertyAliases.txt.</p>
5188 * <p>This API does not check the validity of the codepoint.</p>
5189 * <p>Important: If ICU is built with UCD files from Unicode versions
5190 * below 3.2, then properties marked with "new" are not or
5191 * not fully available.</p>
5192 * @param ch code point to test.
5193 * @param property selector constant from com.ibm.icu.lang.UProperty,
5194 * identifies which binary property to check.
5195 * @return true or false according to the binary Unicode property value
5196 * for ch. Also false if property is out of bounds or if the
5197 * Unicode version does not have data for the property at all, or
5198 * not for this code point.
5199 * @see com.ibm.icu.lang.UProperty
5202 public static boolean hasBinaryProperty(int ch, int property)
5204 return UCharacterProperty.INSTANCE.hasBinaryProperty(ch, property);
5208 * {@icu} <p>Check if a code point has the Alphabetic Unicode property.</p>
5209 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).</p>
5210 * <p>Different from UCharacter.isLetter(ch)!</p>
5212 * @param ch codepoint to be tested
5214 public static boolean isUAlphabetic(int ch)
5216 return hasBinaryProperty(ch, UProperty.ALPHABETIC);
5220 * {@icu} <p>Check if a code point has the Lowercase Unicode property.</p>
5221 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE).</p>
5222 * <p>This is different from UCharacter.isLowerCase(ch)!</p>
5223 * @param ch codepoint to be tested
5226 public static boolean isULowercase(int ch)
5228 return hasBinaryProperty(ch, UProperty.LOWERCASE);
5232 * {@icu} <p>Check if a code point has the Uppercase Unicode property.</p>
5233 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE).</p>
5234 * <p>This is different from UCharacter.isUpperCase(ch)!</p>
5235 * @param ch codepoint to be tested
5238 public static boolean isUUppercase(int ch)
5240 return hasBinaryProperty(ch, UProperty.UPPERCASE);
5244 * {@icu} <p>Check if a code point has the White_Space Unicode property.</p>
5245 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE).</p>
5246 * <p>This is different from both UCharacter.isSpace(ch) and
5247 * UCharacter.isWhitespace(ch)!</p>
5248 * @param ch codepoint to be tested
5251 public static boolean isUWhiteSpace(int ch)
5253 return hasBinaryProperty(ch, UProperty.WHITE_SPACE);
5257 * {@icu} <p>Returns the property value for an Unicode property type of a code point.
5258 * Also returns binary and mask property values.</p>
5259 * <p>Unicode, especially in version 3.2, defines many more properties than
5260 * the original set in UnicodeData.txt.</p>
5261 * <p>The properties APIs are intended to reflect Unicode properties as
5262 * defined in the Unicode Character Database (UCD) and Unicode Technical
5263 * Reports (UTR). For details about the properties see
5264 * http://www.unicode.org/.</p>
5265 * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
5269 * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
5270 * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
5271 * boolean b = (ideo == 1) ? true : false;
5273 * @param ch code point to test.
5274 * @param type UProperty selector constant, identifies which binary
5275 * property to check. Must be
5276 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5277 * UProperty.INT_START <= type < UProperty.INT_LIMIT or
5278 * UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
5279 * @return numeric value that is directly the property value or,
5280 * for enumerated properties, corresponds to the numeric value of
5281 * the enumerated constant of the respective property value
5282 * enumeration type (cast to enum type if necessary).
5283 * Returns 0 or 1 (for false / true) for binary Unicode properties.
5284 * Returns a bit-mask for mask properties.
5285 * Returns 0 if 'type' is out of bounds or if the Unicode version
5286 * does not have data for the property at all, or not for this code
5289 * @see #hasBinaryProperty
5290 * @see #getIntPropertyMinValue
5291 * @see #getIntPropertyMaxValue
5292 * @see #getUnicodeVersion
5295 public static int getIntPropertyValue(int ch, int type)
5297 return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type);
5300 * {@icu} Returns a string version of the property value.
5301 * @param propertyEnum The property enum value.
5302 * @param codepoint The codepoint value.
5303 * @param nameChoice The choice of the name.
5304 * @return value as string
5306 * @deprecated This API is ICU internal only.
5309 public static String getStringPropertyValue(int propertyEnum, int codepoint, int nameChoice) {
5310 if ((propertyEnum >= UProperty.BINARY_START && propertyEnum < UProperty.BINARY_LIMIT) ||
5311 (propertyEnum >= UProperty.INT_START && propertyEnum < UProperty.INT_LIMIT)) {
5312 return getPropertyValueName(propertyEnum, getIntPropertyValue(codepoint, propertyEnum),
5315 if (propertyEnum == UProperty.NUMERIC_VALUE) {
5316 return String.valueOf(getUnicodeNumericValue(codepoint));
5318 // otherwise must be string property
5319 switch (propertyEnum) {
5320 case UProperty.AGE: return getAge(codepoint).toString();
5321 case UProperty.ISO_COMMENT: return getISOComment(codepoint);
5322 case UProperty.BIDI_MIRRORING_GLYPH: return UTF16.valueOf(getMirror(codepoint));
5323 case UProperty.CASE_FOLDING: return foldCase(UTF16.valueOf(codepoint), true);
5324 case UProperty.LOWERCASE_MAPPING: return toLowerCase(UTF16.valueOf(codepoint));
5325 case UProperty.NAME: return getName(codepoint);
5326 case UProperty.SIMPLE_CASE_FOLDING: return UTF16.valueOf(foldCase(codepoint,true));
5327 case UProperty.SIMPLE_LOWERCASE_MAPPING: return UTF16.valueOf(toLowerCase(codepoint));
5328 case UProperty.SIMPLE_TITLECASE_MAPPING: return UTF16.valueOf(toTitleCase(codepoint));
5329 case UProperty.SIMPLE_UPPERCASE_MAPPING: return UTF16.valueOf(toUpperCase(codepoint));
5330 case UProperty.TITLECASE_MAPPING: return toTitleCase(UTF16.valueOf(codepoint),null);
5331 case UProperty.UNICODE_1_NAME: return getName1_0(codepoint);
5332 case UProperty.UPPERCASE_MAPPING: return toUpperCase(UTF16.valueOf(codepoint));
5334 throw new IllegalArgumentException("Illegal Property Enum");
5339 * {@icu} Returns the minimum value for an integer/binary Unicode property type.
5340 * Can be used together with UCharacter.getIntPropertyMaxValue(int)
5341 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
5342 * @param type UProperty selector constant, identifies which binary
5343 * property to check. Must be
5344 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5345 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
5346 * @return Minimum value returned by UCharacter.getIntPropertyValue(int)
5347 * for a Unicode property. 0 if the property
5348 * selector 'type' is out of range.
5350 * @see #hasBinaryProperty
5351 * @see #getUnicodeVersion
5352 * @see #getIntPropertyMaxValue
5353 * @see #getIntPropertyValue
5356 public static int getIntPropertyMinValue(int type){
5358 return 0; // undefined; and: all other properties have a minimum value of 0
5363 * {@icu} Returns the maximum value for an integer/binary Unicode property.
5364 * Can be used together with UCharacter.getIntPropertyMinValue(int)
5365 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
5366 * Examples for min/max values (for Unicode 3.2):
5368 * <li> UProperty.BIDI_CLASS: 0/18
5369 * (UCharacterDirection.LEFT_TO_RIGHT/UCharacterDirection.BOUNDARY_NEUTRAL)
5370 * <li> UProperty.SCRIPT: 0/45 (UScript.COMMON/UScript.TAGBANWA)
5371 * <li> UProperty.IDEOGRAPHIC: 0/1 (false/true)
5373 * For undefined UProperty constant values, min/max values will be 0/-1.
5374 * @param type UProperty selector constant, identifies which binary
5375 * property to check. Must be
5376 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5377 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
5378 * @return Maximum value returned by u_getIntPropertyValue for a Unicode
5379 * property. <= 0 if the property selector 'type' is out of range.
5381 * @see #hasBinaryProperty
5382 * @see #getUnicodeVersion
5383 * @see #getIntPropertyMaxValue
5384 * @see #getIntPropertyValue
5387 public static int getIntPropertyMaxValue(int type)
5389 return UCharacterProperty.INSTANCE.getIntPropertyMaxValue(type);
5393 * Provide the java.lang.Character forDigit API, for convenience.
5396 public static char forDigit(int digit, int radix) {
5397 return java.lang.Character.forDigit(digit, radix);
// JDK 1.5 API coverage

/**
 * Cover the JDK 1.5 API, for convenience.
 * Lower bound of the high (lead) surrogate range.
 * @see UTF16#LEAD_SURROGATE_MIN_VALUE
 */
public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Upper bound of the high (lead) surrogate range.
 * @see UTF16#LEAD_SURROGATE_MAX_VALUE
 */
public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Lower bound of the low (trail) surrogate range.
 * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
 */
public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Upper bound of the low (trail) surrogate range.
 * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
 */
public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Lower bound of the surrogate range.
 * @see UTF16#SURROGATE_MIN_VALUE
 */
public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Upper bound of the surrogate range.
 * @see UTF16#SURROGATE_MAX_VALUE
 */
public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest supplementary code point.
 * @see UTF16#SUPPLEMENTARY_MIN_VALUE
 */
public static final int MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Largest code point value.
 * @see UTF16#CODEPOINT_MAX_VALUE
 */
public static final int MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest code point value.
 * @see UTF16#CODEPOINT_MIN_VALUE
 */
public static final int MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
5466 * Cover the JDK 1.5 API, for convenience.
5467 * @param cp the code point to check
5468 * @return true if cp is a valid code point
5471 public static final boolean isValidCodePoint(int cp) {
5472 return cp >= 0 && cp <= MAX_CODE_POINT;
5476 * Cover the JDK 1.5 API, for convenience.
5477 * @param cp the code point to check
5478 * @return true if cp is a supplementary code point
5481 public static final boolean isSupplementaryCodePoint(int cp) {
5482 return cp >= UTF16.SUPPLEMENTARY_MIN_VALUE
5483 && cp <= UTF16.CODEPOINT_MAX_VALUE;
5487 * Cover the JDK 1.5 API, for convenience.
5488 * @param ch the char to check
5489 * @return true if ch is a high (lead) surrogate
5492 public static boolean isHighSurrogate(char ch) {
5493 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
5497 * Cover the JDK 1.5 API, for convenience.
5498 * @param ch the char to check
5499 * @return true if ch is a low (trail) surrogate
5502 public static boolean isLowSurrogate(char ch) {
5503 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
5507 * Cover the JDK 1.5 API, for convenience. Return true if the chars
5508 * form a valid surrogate pair.
5509 * @param high the high (lead) char
5510 * @param low the low (trail) char
5511 * @return true if high, low form a surrogate pair
5514 public static final boolean isSurrogatePair(char high, char low) {
5515 return isHighSurrogate(high) && isLowSurrogate(low);
5519 * Cover the JDK 1.5 API, for convenience. Return the number of chars needed
5520 * to represent the code point. This does not check the
5521 * code point for validity.
5522 * @param cp the code point to check
5523 * @return the number of chars needed to represent the code point
5524 * @see UTF16#getCharCount
5527 public static int charCount(int cp) {
5528 return UTF16.getCharCount(cp);
5532 * Cover the JDK 1.5 API, for convenience. Return the code point represented by
5533 * the characters. This does not check the surrogate pair for validity.
5534 * @param high the high (lead) surrogate
5535 * @param low the low (trail) surrogate
5536 * @return the code point formed by the surrogate pair
5539 public static final int toCodePoint(char high, char low) {
5540 return UCharacterProperty.getRawSupplementary(high, low);
5544 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5545 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5546 * API. This examines only the characters at index and index+1.
5547 * @param seq the characters to check
5548 * @param index the index of the first or only char forming the code point
5549 * @return the code point at the index
5552 public static final int codePointAt(CharSequence seq, int index) {
5553 char c1 = seq.charAt(index++);
5554 if (isHighSurrogate(c1)) {
5555 if (index < seq.length()) {
5556 char c2 = seq.charAt(index);
5557 if (isLowSurrogate(c2)) {
5558 return toCodePoint(c1, c2);
5565 //#if defined(ECLIPSE)
5566 //## public static final int codePointAt(String seq, int index) {
5567 //## return codePointAt((CharSequence)seq, index);
5572 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5573 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5574 * API. This examines only the characters at index and index+1.
5575 * @param text the characters to check
5576 * @param index the index of the first or only char forming the code point
5577 * @return the code point at the index
5580 public static final int codePointAt(char[] text, int index) {
5581 char c1 = text[index++];
5582 if (isHighSurrogate(c1)) {
5583 if (index < text.length) {
5584 char c2 = text[index];
5585 if (isLowSurrogate(c2)) {
5586 return toCodePoint(c1, c2);
5594 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5595 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5596 * API. This examines only the characters at index and index+1.
5597 * @param text the characters to check
5598 * @param index the index of the first or only char forming the code point
5599 * @param limit the limit of the valid text
5600 * @return the code point at the index
5603 public static final int codePointAt(char[] text, int index, int limit) {
5604 if (index >= limit || limit > text.length) {
5605 throw new IndexOutOfBoundsException();
5607 char c1 = text[index++];
5608 if (isHighSurrogate(c1)) {
5609 if (index < limit) {
5610 char c2 = text[index];
5611 if (isLowSurrogate(c2)) {
5612 return toCodePoint(c1, c2);
5620 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5621 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5622 * API. This examines only the characters at index-1 and index-2.
5623 * @param seq the characters to check
5624 * @param index the index after the last or only char forming the code point
5625 * @return the code point before the index
5628 public static final int codePointBefore(CharSequence seq, int index) {
5629 char c2 = seq.charAt(--index);
5630 if (isLowSurrogate(c2)) {
5632 char c1 = seq.charAt(--index);
5633 if (isHighSurrogate(c1)) {
5634 return toCodePoint(c1, c2);
5641 //#if defined(ECLIPSE)
5642 //## public static final int codePointBefore(String seq, int index) {
5643 //## return codePointBefore((CharSequence)seq, index);
5648 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5649 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5650 * API. This examines only the characters at index-1 and index-2.
5651 * @param text the characters to check
5652 * @param index the index after the last or only char forming the code point
5653 * @return the code point before the index
5656 public static final int codePointBefore(char[] text, int index) {
5657 char c2 = text[--index];
5658 if (isLowSurrogate(c2)) {
5660 char c1 = text[--index];
5661 if (isHighSurrogate(c1)) {
5662 return toCodePoint(c1, c2);
5670 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5671 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5672 * API. This examines only the characters at index-1 and index-2.
5673 * @param text the characters to check
5674 * @param index the index after the last or only char forming the code point
5675 * @param limit the start of the valid text
5676 * @return the code point before the index
5679 public static final int codePointBefore(char[] text, int index, int limit) {
5680 if (index <= limit || limit < 0) {
5681 throw new IndexOutOfBoundsException();
5683 char c2 = text[--index];
5684 if (isLowSurrogate(c2)) {
5685 if (index > limit) {
5686 char c1 = text[--index];
5687 if (isHighSurrogate(c1)) {
5688 return toCodePoint(c1, c2);
5696 * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
5697 * code point into the destination at the given index.
5698 * @param cp the code point to convert
5699 * @param dst the destination array into which to put the char(s) representing the code point
5700 * @param dstIndex the index at which to put the first (or only) char
5701 * @return the count of the number of chars written (1 or 2)
5702 * @throws IllegalArgumentException if cp is not a valid code point
5705 public static final int toChars(int cp, char[] dst, int dstIndex) {
5707 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
5708 dst[dstIndex] = (char)cp;
5711 if (cp <= MAX_CODE_POINT) {
5712 dst[dstIndex] = UTF16.getLeadSurrogate(cp);
5713 dst[dstIndex+1] = UTF16.getTrailSurrogate(cp);
5717 throw new IllegalArgumentException();
5721 * Cover the JDK 1.5 API, for convenience. Returns a char array
5722 * representing the code point.
5723 * @param cp the code point to convert
5724 * @return an array containing the char(s) representing the code point
5725 * @throws IllegalArgumentException if cp is not a valid code point
5728 public static final char[] toChars(int cp) {
5730 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
5731 return new char[] { (char)cp };
5733 if (cp <= MAX_CODE_POINT) {
5735 UTF16.getLeadSurrogate(cp),
5736 UTF16.getTrailSurrogate(cp)
5740 throw new IllegalArgumentException();
5744 * Cover the JDK API, for convenience. Return a byte representing the directionality of
5747 * {@icunote} Unlike the JDK, this returns DIRECTIONALITY_LEFT_TO_RIGHT for undefined
5748 * or out-of-bounds characters.
5750 * {@icunote} The return value must be tested using the constants defined in {@link
5751 * UCharacterDirection} and its interface {@link
5752 * UCharacterEnums.ECharacterDirection} since the values are different from the ones
5753 * defined by <code>java.lang.Character</code>.
5754 * @param cp the code point to check
5755 * @return the directionality of the code point
5756 * @see #getDirection
5759 public static byte getDirectionality(int cp)
5761 return (byte)getDirection(cp);
5765 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
5766 * @param text the characters to check
5767 * @param start the start of the range
5768 * @param limit the limit of the range
5769 * @return the number of code points in the range
5772 public static int codePointCount(CharSequence text, int start, int limit) {
5773 if (start < 0 || limit < start || limit > text.length()) {
5774 throw new IndexOutOfBoundsException("start (" + start +
5775 ") or limit (" + limit +
5776 ") invalid or out of range 0, " + text.length());
5779 int len = limit - start;
5780 while (limit > start) {
5781 char ch = text.charAt(--limit);
5782 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
5783 ch = text.charAt(--limit);
5784 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
5793 //#if defined(ECLIPSE)
5794 //## public static int codePointCount(String text, int start, int limit) {
5795 //## return codePointCount((CharSequence)text, start, limit);
5800 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
5801 * @param text the characters to check
5802 * @param start the start of the range
5803 * @param limit the limit of the range
5804 * @return the number of code points in the range
5807 public static int codePointCount(char[] text, int start, int limit) {
5808 if (start < 0 || limit < start || limit > text.length) {
5809 throw new IndexOutOfBoundsException("start (" + start +
5810 ") or limit (" + limit +
5811 ") invalid or out of range 0, " + text.length);
5814 int len = limit - start;
5815 while (limit > start) {
5816 char ch = text[--limit];
5817 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
5819 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
5829 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
5830 * @param text the characters to check
5831 * @param index the index to adjust
5832 * @param codePointOffset the number of code points by which to offset the index
5833 * @return the adjusted index
5836 public static int offsetByCodePoints(CharSequence text, int index, int codePointOffset) {
5837 if (index < 0 || index > text.length()) {
5838 throw new IndexOutOfBoundsException("index ( " + index +
5839 ") out of range 0, " + text.length());
5842 if (codePointOffset < 0) {
5843 while (++codePointOffset <= 0) {
5844 char ch = text.charAt(--index);
5845 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
5846 ch = text.charAt(--index);
5847 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
5848 if (++codePointOffset > 0) {
5855 int limit = text.length();
5856 while (--codePointOffset >= 0) {
5857 char ch = text.charAt(index++);
5858 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
5859 ch = text.charAt(index++);
5860 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
5861 if (--codePointOffset < 0) {
5872 //#if defined(ECLIPSE)
5873 //## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
5874 //## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
5879 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
5880 * @param text the characters to check
5881 * @param start the start of the range to check
5882 * @param count the length of the range to check
5883 * @param index the index to adjust
5884 * @param codePointOffset the number of code points by which to offset the index
5885 * @return the adjusted index
5888 public static int offsetByCodePoints(char[] text, int start, int count, int index,
5889 int codePointOffset) {
5890 int limit = start + count;
5891 if (start < 0 || limit < start || limit > text.length || index < start || index > limit) {
5892 throw new IndexOutOfBoundsException("index ( " + index +
5893 ") out of range " + start +
5895 " in array 0, " + text.length);
5898 if (codePointOffset < 0) {
5899 while (++codePointOffset <= 0) {
5900 char ch = text[--index];
5901 if (index < start) {
5902 throw new IndexOutOfBoundsException("index ( " + index +
5903 ") < start (" + start +
5906 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > start) {
5908 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
5909 if (++codePointOffset > 0) {
5916 while (--codePointOffset >= 0) {
5917 char ch = text[index++];
5918 if (index > limit) {
5919 throw new IndexOutOfBoundsException("index ( " + index +
5920 ") > limit (" + limit +
5923 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
5925 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
5926 if (--codePointOffset < 0) {
5937 // private variables -------------------------------------------------
5940 * To get the last character out from a data type
5942 private static final int LAST_CHAR_MASK_ = 0xFFFF;
5945 // * To get the last byte out from a data type
5947 // private static final int LAST_BYTE_MASK_ = 0xFF;
5952 // private static final int SHIFT_16_ = 16;
5957 // private static final int SHIFT_24_ = 24;
5962 // private static final int DECIMAL_RADIX_ = 10;
5965 * No break space code point
5967 private static final int NO_BREAK_SPACE_ = 0xA0;
5970 * Figure space code point
5972 private static final int FIGURE_SPACE_ = 0x2007;
5975 * Narrow no break space code point
5977 private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;
5980 * Ideographic number zero code point
5982 private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
5985 * CJK Ideograph, First code point
5987 private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
5990 * CJK Ideograph, Second code point
5992 private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
5995 * CJK Ideograph, Third code point
5997 private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
6000 * CJK Ideograph, Fourth code point
6002 private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8;
6005 * CJK Ideograph, FIFTH code point
6007 private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
6010 * CJK Ideograph, Sixth code point
6012 private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
6015 * CJK Ideograph, Seventh code point
6017 private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
6020 * CJK Ideograph, Eighth code point
6022 private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
6025 * CJK Ideograph, Nineth code point
6027 private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
6030 * Application Program command code point
6032 private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;
6035 * Unit separator code point
6037 private static final int UNIT_SEPARATOR_ = 0x001F;
6042 private static final int DELETE_ = 0x007F;
6045 * Han digit characters
6047 private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
6048 private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
6049 private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
6050 private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
6051 private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
6052 private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
6053 private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
6054 private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
6055 private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
6056 private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
6057 private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
6058 private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
6059 private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
6060 private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
6061 private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
6062 private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
6063 private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x824c;
6064 private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
6066 // private constructor -----------------------------------------------
6069 * Private constructor to prevent instantiation
6071 private UCharacter()