3 *******************************************************************************
4 * Copyright (C) 1996-2009, International Business Machines Corporation and *
5 * others. All Rights Reserved. *
6 *******************************************************************************
9 package com.ibm.icu.lang;
11 import java.io.IOException;
12 import java.lang.ref.SoftReference;
13 import java.util.HashMap;
14 import java.util.Locale;
16 import java.util.MissingResourceException;
18 import com.ibm.icu.impl.UBiDiProps;
19 import com.ibm.icu.impl.UCaseProps;
20 import com.ibm.icu.impl.NormalizerImpl;
21 import com.ibm.icu.impl.UCharacterUtility;
22 import com.ibm.icu.impl.UCharacterName;
23 import com.ibm.icu.impl.UCharacterNameChoice;
24 import com.ibm.icu.impl.UPropertyAliases;
25 import com.ibm.icu.lang.UCharacterEnums.*;
26 import com.ibm.icu.text.BreakIterator;
27 import com.ibm.icu.text.UTF16;
28 import com.ibm.icu.impl.UCharacterProperty;
29 import com.ibm.icu.util.RangeValueIterator;
30 import com.ibm.icu.util.ULocale;
31 import com.ibm.icu.util.ValueIterator;
32 import com.ibm.icu.util.VersionInfo;
36 * The UCharacter class provides extensions to the
37 * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
38 * java.lang.Character</a> class. These extensions provide support for
39 * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
40 * class, provide support for supplementary characters (those with code
41 * points above U+FFFF).
42 * Each ICU release supports the latest version of Unicode available at that time.
45 * Code points are represented in these API using ints. While it would be
46 * more convenient in Java to have a separate primitive datatype for them,
47 * ints suffice in the meantime.
 * To use this class please add the jar file name icu4j.jar to the
 * class path, since it contains data files which supply the information used
 * by this class.<br>
 * E.g. In Windows <br>
 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br>
55 * Otherwise, another method would be to copy the files uprops.dat and
56 * unames.icu from the icu4j source subdirectory
57 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
58 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
61 * Aside from the additions for UTF-16 support, and the updated Unicode
62 * properties, the main differences between UCharacter and Character are:
64 * <li> UCharacter is not designed to be a char wrapper and does not have
 * APIs that involve management of that single char.<br>
68 * <li> char charValue(),
69 * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
71 * <li> UCharacter does not include Character APIs that are deprecated, nor
72 * does it include the Java-specific character information, such as
73 * boolean isJavaIdentifierPart(char ch).
74 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
75 * values '10' - '35'. UCharacter also does this in digit and
76 * getNumericValue, to adhere to the java semantics of these
77 * methods. New methods unicodeDigit, and
78 * getUnicodeNumericValue do not treat the above code points
79 * as having numeric values. This is a semantic change from ICU4J 1.3.1.
 * Further detailed differences can be determined from the program
83 * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
84 * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
87 * In addition to Java compatibility functions, which calculate derived properties,
88 * this API provides low-level access to the Unicode Character Database.
 * Unicode assigns each code point (not just assigned characters) values for
 * many properties.
93 * Most of them are simple boolean flags, or constants from a small enumerated list.
94 * For some properties, values are strings or other relatively more complex types.
97 * For more information see
98 * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
99 * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
102 * There are also functions that provide easy migration from C/POSIX functions
103 * like isblank(). Their use is generally discouraged because the C/POSIX
104 * standards do not define their semantics beyond the ASCII range, which means
105 * that different implementations exhibit very different behavior.
106 * Instead, Unicode properties should be used directly.
109 * There are also only a few, broad C/POSIX character classes, and they tend
110 * to be used for conflicting purposes. For example, the "isalpha()" class
111 * is sometimes used to determine word boundaries, while a more sophisticated
112 * approach would at least distinguish initial letters from continuation
113 * characters (the latter including combining marks).
114 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
115 * Another example: There is no "istitle()" class for titlecase characters.
118 * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
119 * ICU implements them according to the Standard Recommendations in
120 * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
121 * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
124 * API access for C/POSIX character classes is as follows:
125 * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
126 * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
127 * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
128 * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
129 * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
130 * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
131 * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
132 * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
133 * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
134 * - cntrl: getType(c)==CONTROL
135 * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
136 * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
139 * The C/POSIX character classes are also available in UnicodeSet patterns,
140 * using patterns like [:graph:] or \p{graph}.
143 * Note: There are several ICU (and Java) whitespace functions.
145 * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
146 * most of general categories "Z" (separators) + most whitespace ISO controls
147 * (including no-break spaces, but excluding IS1..IS4 and ZWSP)
148 * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
149 * - isSpaceChar: just Z (including no-break spaces)
152 * This class is not subclassable
154 * @author Syn Wee Quek
156 * @see com.ibm.icu.lang.UCharacterEnums
159 public final class UCharacter implements ECharacterCategory, ECharacterDirection
161 // public inner classes ----------------------------------------------
164 * A family of character subsets representing the character blocks in the
165 * Unicode specification, generated from Unicode Data file Blocks.txt.
166 * Character blocks generally define characters used for a specific script
167 * or purpose. A character is contained by at most one Unicode block.
170 public static final class UnicodeBlock extends Character.Subset
172 // block id corresponding to icu4c -----------------------------------
177 public static final int INVALID_CODE_ID = -1;
181 public static final int BASIC_LATIN_ID = 1;
185 public static final int LATIN_1_SUPPLEMENT_ID = 2;
189 public static final int LATIN_EXTENDED_A_ID = 3;
193 public static final int LATIN_EXTENDED_B_ID = 4;
197 public static final int IPA_EXTENSIONS_ID = 5;
201 public static final int SPACING_MODIFIER_LETTERS_ID = 6;
205 public static final int COMBINING_DIACRITICAL_MARKS_ID = 7;
207 * Unicode 3.2 renames this block to "Greek and Coptic".
210 public static final int GREEK_ID = 8;
214 public static final int CYRILLIC_ID = 9;
218 public static final int ARMENIAN_ID = 10;
222 public static final int HEBREW_ID = 11;
226 public static final int ARABIC_ID = 12;
230 public static final int SYRIAC_ID = 13;
234 public static final int THAANA_ID = 14;
238 public static final int DEVANAGARI_ID = 15;
242 public static final int BENGALI_ID = 16;
246 public static final int GURMUKHI_ID = 17;
250 public static final int GUJARATI_ID = 18;
254 public static final int ORIYA_ID = 19;
258 public static final int TAMIL_ID = 20;
262 public static final int TELUGU_ID = 21;
266 public static final int KANNADA_ID = 22;
270 public static final int MALAYALAM_ID = 23;
274 public static final int SINHALA_ID = 24;
278 public static final int THAI_ID = 25;
282 public static final int LAO_ID = 26;
286 public static final int TIBETAN_ID = 27;
290 public static final int MYANMAR_ID = 28;
294 public static final int GEORGIAN_ID = 29;
298 public static final int HANGUL_JAMO_ID = 30;
302 public static final int ETHIOPIC_ID = 31;
306 public static final int CHEROKEE_ID = 32;
310 public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID = 33;
314 public static final int OGHAM_ID = 34;
318 public static final int RUNIC_ID = 35;
322 public static final int KHMER_ID = 36;
326 public static final int MONGOLIAN_ID = 37;
330 public static final int LATIN_EXTENDED_ADDITIONAL_ID = 38;
334 public static final int GREEK_EXTENDED_ID = 39;
338 public static final int GENERAL_PUNCTUATION_ID = 40;
342 public static final int SUPERSCRIPTS_AND_SUBSCRIPTS_ID = 41;
346 public static final int CURRENCY_SYMBOLS_ID = 42;
 * Unicode 3.2 renames this block to "Combining Diacritical Marks for Symbols".
352 public static final int COMBINING_MARKS_FOR_SYMBOLS_ID = 43;
356 public static final int LETTERLIKE_SYMBOLS_ID = 44;
360 public static final int NUMBER_FORMS_ID = 45;
364 public static final int ARROWS_ID = 46;
368 public static final int MATHEMATICAL_OPERATORS_ID = 47;
372 public static final int MISCELLANEOUS_TECHNICAL_ID = 48;
376 public static final int CONTROL_PICTURES_ID = 49;
380 public static final int OPTICAL_CHARACTER_RECOGNITION_ID = 50;
384 public static final int ENCLOSED_ALPHANUMERICS_ID = 51;
388 public static final int BOX_DRAWING_ID = 52;
392 public static final int BLOCK_ELEMENTS_ID = 53;
396 public static final int GEOMETRIC_SHAPES_ID = 54;
400 public static final int MISCELLANEOUS_SYMBOLS_ID = 55;
404 public static final int DINGBATS_ID = 56;
408 public static final int BRAILLE_PATTERNS_ID = 57;
412 public static final int CJK_RADICALS_SUPPLEMENT_ID = 58;
416 public static final int KANGXI_RADICALS_ID = 59;
420 public static final int IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID = 60;
424 public static final int CJK_SYMBOLS_AND_PUNCTUATION_ID = 61;
428 public static final int HIRAGANA_ID = 62;
432 public static final int KATAKANA_ID = 63;
436 public static final int BOPOMOFO_ID = 64;
440 public static final int HANGUL_COMPATIBILITY_JAMO_ID = 65;
444 public static final int KANBUN_ID = 66;
448 public static final int BOPOMOFO_EXTENDED_ID = 67;
452 public static final int ENCLOSED_CJK_LETTERS_AND_MONTHS_ID = 68;
456 public static final int CJK_COMPATIBILITY_ID = 69;
460 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID = 70;
464 public static final int CJK_UNIFIED_IDEOGRAPHS_ID = 71;
468 public static final int YI_SYLLABLES_ID = 72;
472 public static final int YI_RADICALS_ID = 73;
476 public static final int HANGUL_SYLLABLES_ID = 74;
480 public static final int HIGH_SURROGATES_ID = 75;
484 public static final int HIGH_PRIVATE_USE_SURROGATES_ID = 76;
488 public static final int LOW_SURROGATES_ID = 77;
 * Same as public static final int PRIVATE_USE_ID.
 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
 * and multiple code point ranges had this block.
 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
 * and adds separate blocks for the supplementary PUAs.
497 public static final int PRIVATE_USE_AREA_ID = 78;
 * Same as public static final int PRIVATE_USE_AREA_ID.
 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
 * and multiple code point ranges had this block.
 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
 * and adds separate blocks for the supplementary PUAs.
506 public static final int PRIVATE_USE_ID = PRIVATE_USE_AREA_ID;
510 public static final int CJK_COMPATIBILITY_IDEOGRAPHS_ID = 79;
514 public static final int ALPHABETIC_PRESENTATION_FORMS_ID = 80;
518 public static final int ARABIC_PRESENTATION_FORMS_A_ID = 81;
522 public static final int COMBINING_HALF_MARKS_ID = 82;
526 public static final int CJK_COMPATIBILITY_FORMS_ID = 83;
530 public static final int SMALL_FORM_VARIANTS_ID = 84;
534 public static final int ARABIC_PRESENTATION_FORMS_B_ID = 85;
538 public static final int SPECIALS_ID = 86;
542 public static final int HALFWIDTH_AND_FULLWIDTH_FORMS_ID = 87;
546 public static final int OLD_ITALIC_ID = 88;
550 public static final int GOTHIC_ID = 89;
554 public static final int DESERET_ID = 90;
558 public static final int BYZANTINE_MUSICAL_SYMBOLS_ID = 91;
562 public static final int MUSICAL_SYMBOLS_ID = 92;
566 public static final int MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID = 93;
570 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID = 94;
574 public static final int
575 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID = 95;
579 public static final int TAGS_ID = 96;
581 // New blocks in Unicode 3.2
584 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
587 public static final int CYRILLIC_SUPPLEMENTARY_ID = 97;
589 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
593 public static final int CYRILLIC_SUPPLEMENT_ID = 97;
597 public static final int TAGALOG_ID = 98;
601 public static final int HANUNOO_ID = 99;
605 public static final int BUHID_ID = 100;
609 public static final int TAGBANWA_ID = 101;
613 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID = 102;
617 public static final int SUPPLEMENTAL_ARROWS_A_ID = 103;
621 public static final int SUPPLEMENTAL_ARROWS_B_ID = 104;
625 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID = 105;
629 public static final int SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID = 106;
633 public static final int KATAKANA_PHONETIC_EXTENSIONS_ID = 107;
637 public static final int VARIATION_SELECTORS_ID = 108;
641 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID = 109;
645 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID = 110;
650 public static final int LIMBU_ID = 111; /*[1900]*/
654 public static final int TAI_LE_ID = 112; /*[1950]*/
658 public static final int KHMER_SYMBOLS_ID = 113; /*[19E0]*/
662 public static final int PHONETIC_EXTENSIONS_ID = 114; /*[1D00]*/
666 public static final int MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID = 115; /*[2B00]*/
670 public static final int YIJING_HEXAGRAM_SYMBOLS_ID = 116; /*[4DC0]*/
674 public static final int LINEAR_B_SYLLABARY_ID = 117; /*[10000]*/
678 public static final int LINEAR_B_IDEOGRAMS_ID = 118; /*[10080]*/
682 public static final int AEGEAN_NUMBERS_ID = 119; /*[10100]*/
686 public static final int UGARITIC_ID = 120; /*[10380]*/
690 public static final int SHAVIAN_ID = 121; /*[10450]*/
694 public static final int OSMANYA_ID = 122; /*[10480]*/
698 public static final int CYPRIOT_SYLLABARY_ID = 123; /*[10800]*/
702 public static final int TAI_XUAN_JING_SYMBOLS_ID = 124; /*[1D300]*/
706 public static final int VARIATION_SELECTORS_SUPPLEMENT_ID = 125; /*[E0100]*/
708 /* New blocks in Unicode 4.1 */
713 public static final int ANCIENT_GREEK_MUSICAL_NOTATION_ID = 126; /*[1D200]*/
718 public static final int ANCIENT_GREEK_NUMBERS_ID = 127; /*[10140]*/
723 public static final int ARABIC_SUPPLEMENT_ID = 128; /*[0750]*/
728 public static final int BUGINESE_ID = 129; /*[1A00]*/
733 public static final int CJK_STROKES_ID = 130; /*[31C0]*/
738 public static final int COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID = 131; /*[1DC0]*/
743 public static final int COPTIC_ID = 132; /*[2C80]*/
748 public static final int ETHIOPIC_EXTENDED_ID = 133; /*[2D80]*/
753 public static final int ETHIOPIC_SUPPLEMENT_ID = 134; /*[1380]*/
758 public static final int GEORGIAN_SUPPLEMENT_ID = 135; /*[2D00]*/
763 public static final int GLAGOLITIC_ID = 136; /*[2C00]*/
768 public static final int KHAROSHTHI_ID = 137; /*[10A00]*/
773 public static final int MODIFIER_TONE_LETTERS_ID = 138; /*[A700]*/
778 public static final int NEW_TAI_LUE_ID = 139; /*[1980]*/
783 public static final int OLD_PERSIAN_ID = 140; /*[103A0]*/
788 public static final int PHONETIC_EXTENSIONS_SUPPLEMENT_ID = 141; /*[1D80]*/
793 public static final int SUPPLEMENTAL_PUNCTUATION_ID = 142; /*[2E00]*/
798 public static final int SYLOTI_NAGRI_ID = 143; /*[A800]*/
803 public static final int TIFINAGH_ID = 144; /*[2D30]*/
808 public static final int VERTICAL_FORMS_ID = 145; /*[FE10]*/
810 /* New blocks in Unicode 5.0 */
815 public static final int NKO_ID = 146; /*[07C0]*/
819 public static final int BALINESE_ID = 147; /*[1B00]*/
823 public static final int LATIN_EXTENDED_C_ID = 148; /*[2C60]*/
827 public static final int LATIN_EXTENDED_D_ID = 149; /*[A720]*/
831 public static final int PHAGS_PA_ID = 150; /*[A840]*/
835 public static final int PHOENICIAN_ID = 151; /*[10900]*/
839 public static final int CUNEIFORM_ID = 152; /*[12000]*/
843 public static final int CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID = 153; /*[12400]*/
847 public static final int COUNTING_ROD_NUMERALS_ID = 154; /*[1D360]*/
852 public static final int SUNDANESE_ID = 155; /* [1B80] */
857 public static final int LEPCHA_ID = 156; /* [1C00] */
862 public static final int OL_CHIKI_ID = 157; /* [1C50] */
867 public static final int CYRILLIC_EXTENDED_A_ID = 158; /* [2DE0] */
872 public static final int VAI_ID = 159; /* [A500] */
877 public static final int CYRILLIC_EXTENDED_B_ID = 160; /* [A640] */
882 public static final int SAURASHTRA_ID = 161; /* [A880] */
887 public static final int KAYAH_LI_ID = 162; /* [A900] */
892 public static final int REJANG_ID = 163; /* [A930] */
897 public static final int CHAM_ID = 164; /* [AA00] */
902 public static final int ANCIENT_SYMBOLS_ID = 165; /* [10190] */
907 public static final int PHAISTOS_DISC_ID = 166; /* [101D0] */
912 public static final int LYCIAN_ID = 167; /* [10280] */
917 public static final int CARIAN_ID = 168; /* [102A0] */
922 public static final int LYDIAN_ID = 169; /* [10920] */
927 public static final int MAHJONG_TILES_ID = 170; /* [1F000] */
932 public static final int DOMINO_TILES_ID = 171; /* [1F030] */
937 public static final int COUNT = 172;
939 // blocks objects ---------------------------------------------------
944 public static final UnicodeBlock NO_BLOCK
945 = new UnicodeBlock("NO_BLOCK", 0);
950 public static final UnicodeBlock BASIC_LATIN
951 = new UnicodeBlock("BASIC_LATIN", BASIC_LATIN_ID);
955 public static final UnicodeBlock LATIN_1_SUPPLEMENT
956 = new UnicodeBlock("LATIN_1_SUPPLEMENT", LATIN_1_SUPPLEMENT_ID);
960 public static final UnicodeBlock LATIN_EXTENDED_A
961 = new UnicodeBlock("LATIN_EXTENDED_A", LATIN_EXTENDED_A_ID);
965 public static final UnicodeBlock LATIN_EXTENDED_B
966 = new UnicodeBlock("LATIN_EXTENDED_B", LATIN_EXTENDED_B_ID);
970 public static final UnicodeBlock IPA_EXTENSIONS
971 = new UnicodeBlock("IPA_EXTENSIONS", IPA_EXTENSIONS_ID);
975 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
976 = new UnicodeBlock("SPACING_MODIFIER_LETTERS", SPACING_MODIFIER_LETTERS_ID);
980 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
981 = new UnicodeBlock("COMBINING_DIACRITICAL_MARKS", COMBINING_DIACRITICAL_MARKS_ID);
983 * Unicode 3.2 renames this block to "Greek and Coptic".
986 public static final UnicodeBlock GREEK
987 = new UnicodeBlock("GREEK", GREEK_ID);
991 public static final UnicodeBlock CYRILLIC
992 = new UnicodeBlock("CYRILLIC", CYRILLIC_ID);
996 public static final UnicodeBlock ARMENIAN
997 = new UnicodeBlock("ARMENIAN", ARMENIAN_ID);
1001 public static final UnicodeBlock HEBREW
1002 = new UnicodeBlock("HEBREW", HEBREW_ID);
1006 public static final UnicodeBlock ARABIC
1007 = new UnicodeBlock("ARABIC", ARABIC_ID);
1011 public static final UnicodeBlock SYRIAC
1012 = new UnicodeBlock("SYRIAC", SYRIAC_ID);
1016 public static final UnicodeBlock THAANA
1017 = new UnicodeBlock("THAANA", THAANA_ID);
1021 public static final UnicodeBlock DEVANAGARI
1022 = new UnicodeBlock("DEVANAGARI", DEVANAGARI_ID);
1026 public static final UnicodeBlock BENGALI
1027 = new UnicodeBlock("BENGALI", BENGALI_ID);
1031 public static final UnicodeBlock GURMUKHI
1032 = new UnicodeBlock("GURMUKHI", GURMUKHI_ID);
1036 public static final UnicodeBlock GUJARATI
1037 = new UnicodeBlock("GUJARATI", GUJARATI_ID);
1041 public static final UnicodeBlock ORIYA
1042 = new UnicodeBlock("ORIYA", ORIYA_ID);
1046 public static final UnicodeBlock TAMIL
1047 = new UnicodeBlock("TAMIL", TAMIL_ID);
1051 public static final UnicodeBlock TELUGU
1052 = new UnicodeBlock("TELUGU", TELUGU_ID);
1056 public static final UnicodeBlock KANNADA
1057 = new UnicodeBlock("KANNADA", KANNADA_ID);
1061 public static final UnicodeBlock MALAYALAM
1062 = new UnicodeBlock("MALAYALAM", MALAYALAM_ID);
1066 public static final UnicodeBlock SINHALA
1067 = new UnicodeBlock("SINHALA", SINHALA_ID);
1071 public static final UnicodeBlock THAI
1072 = new UnicodeBlock("THAI", THAI_ID);
1076 public static final UnicodeBlock LAO
1077 = new UnicodeBlock("LAO", LAO_ID);
1081 public static final UnicodeBlock TIBETAN
1082 = new UnicodeBlock("TIBETAN", TIBETAN_ID);
1086 public static final UnicodeBlock MYANMAR
1087 = new UnicodeBlock("MYANMAR", MYANMAR_ID);
1091 public static final UnicodeBlock GEORGIAN
1092 = new UnicodeBlock("GEORGIAN", GEORGIAN_ID);
1096 public static final UnicodeBlock HANGUL_JAMO
1097 = new UnicodeBlock("HANGUL_JAMO", HANGUL_JAMO_ID);
1101 public static final UnicodeBlock ETHIOPIC
1102 = new UnicodeBlock("ETHIOPIC", ETHIOPIC_ID);
1106 public static final UnicodeBlock CHEROKEE
1107 = new UnicodeBlock("CHEROKEE", CHEROKEE_ID);
1111 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
1112 = new UnicodeBlock("UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID);
1116 public static final UnicodeBlock OGHAM
1117 = new UnicodeBlock("OGHAM", OGHAM_ID);
1121 public static final UnicodeBlock RUNIC
1122 = new UnicodeBlock("RUNIC", RUNIC_ID);
1126 public static final UnicodeBlock KHMER
1127 = new UnicodeBlock("KHMER", KHMER_ID);
1131 public static final UnicodeBlock MONGOLIAN
1132 = new UnicodeBlock("MONGOLIAN", MONGOLIAN_ID);
1136 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
1137 = new UnicodeBlock("LATIN_EXTENDED_ADDITIONAL", LATIN_EXTENDED_ADDITIONAL_ID);
1141 public static final UnicodeBlock GREEK_EXTENDED
1142 = new UnicodeBlock("GREEK_EXTENDED", GREEK_EXTENDED_ID);
1146 public static final UnicodeBlock GENERAL_PUNCTUATION
1147 = new UnicodeBlock("GENERAL_PUNCTUATION", GENERAL_PUNCTUATION_ID);
1151 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
1152 = new UnicodeBlock("SUPERSCRIPTS_AND_SUBSCRIPTS", SUPERSCRIPTS_AND_SUBSCRIPTS_ID);
1156 public static final UnicodeBlock CURRENCY_SYMBOLS
1157 = new UnicodeBlock("CURRENCY_SYMBOLS", CURRENCY_SYMBOLS_ID);
 * Unicode 3.2 renames this block to "Combining Diacritical Marks for Symbols".
1163 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
1164 = new UnicodeBlock("COMBINING_MARKS_FOR_SYMBOLS", COMBINING_MARKS_FOR_SYMBOLS_ID);
1168 public static final UnicodeBlock LETTERLIKE_SYMBOLS
1169 = new UnicodeBlock("LETTERLIKE_SYMBOLS", LETTERLIKE_SYMBOLS_ID);
1173 public static final UnicodeBlock NUMBER_FORMS
1174 = new UnicodeBlock("NUMBER_FORMS", NUMBER_FORMS_ID);
1178 public static final UnicodeBlock ARROWS
1179 = new UnicodeBlock("ARROWS", ARROWS_ID);
1183 public static final UnicodeBlock MATHEMATICAL_OPERATORS
1184 = new UnicodeBlock("MATHEMATICAL_OPERATORS", MATHEMATICAL_OPERATORS_ID);
1188 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
1189 = new UnicodeBlock("MISCELLANEOUS_TECHNICAL", MISCELLANEOUS_TECHNICAL_ID);
1193 public static final UnicodeBlock CONTROL_PICTURES
1194 = new UnicodeBlock("CONTROL_PICTURES", CONTROL_PICTURES_ID);
1198 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
1199 = new UnicodeBlock("OPTICAL_CHARACTER_RECOGNITION", OPTICAL_CHARACTER_RECOGNITION_ID);
1203 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
1204 = new UnicodeBlock("ENCLOSED_ALPHANUMERICS", ENCLOSED_ALPHANUMERICS_ID);
1208 public static final UnicodeBlock BOX_DRAWING
1209 = new UnicodeBlock("BOX_DRAWING", BOX_DRAWING_ID);
1213 public static final UnicodeBlock BLOCK_ELEMENTS
1214 = new UnicodeBlock("BLOCK_ELEMENTS", BLOCK_ELEMENTS_ID);
1218 public static final UnicodeBlock GEOMETRIC_SHAPES
1219 = new UnicodeBlock("GEOMETRIC_SHAPES", GEOMETRIC_SHAPES_ID);
1223 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
1224 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS", MISCELLANEOUS_SYMBOLS_ID);
1228 public static final UnicodeBlock DINGBATS
1229 = new UnicodeBlock("DINGBATS", DINGBATS_ID);
1233 public static final UnicodeBlock BRAILLE_PATTERNS
1234 = new UnicodeBlock("BRAILLE_PATTERNS", BRAILLE_PATTERNS_ID);
1238 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
1239 = new UnicodeBlock("CJK_RADICALS_SUPPLEMENT", CJK_RADICALS_SUPPLEMENT_ID);
1243 public static final UnicodeBlock KANGXI_RADICALS
1244 = new UnicodeBlock("KANGXI_RADICALS", KANGXI_RADICALS_ID);
1248 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
1249 = new UnicodeBlock("IDEOGRAPHIC_DESCRIPTION_CHARACTERS", IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID);
1253 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
1254 = new UnicodeBlock("CJK_SYMBOLS_AND_PUNCTUATION", CJK_SYMBOLS_AND_PUNCTUATION_ID);
1258 public static final UnicodeBlock HIRAGANA
1259 = new UnicodeBlock("HIRAGANA", HIRAGANA_ID);
1263 public static final UnicodeBlock KATAKANA
1264 = new UnicodeBlock("KATAKANA", KATAKANA_ID);
1268 public static final UnicodeBlock BOPOMOFO
1269 = new UnicodeBlock("BOPOMOFO", BOPOMOFO_ID);
1273 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
1274 = new UnicodeBlock("HANGUL_COMPATIBILITY_JAMO", HANGUL_COMPATIBILITY_JAMO_ID);
1278 public static final UnicodeBlock KANBUN
1279 = new UnicodeBlock("KANBUN", KANBUN_ID);
1283 public static final UnicodeBlock BOPOMOFO_EXTENDED
1284 = new UnicodeBlock("BOPOMOFO_EXTENDED", BOPOMOFO_EXTENDED_ID);
1288 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
1289 = new UnicodeBlock("ENCLOSED_CJK_LETTERS_AND_MONTHS", ENCLOSED_CJK_LETTERS_AND_MONTHS_ID);
1293 public static final UnicodeBlock CJK_COMPATIBILITY
1294 = new UnicodeBlock("CJK_COMPATIBILITY", CJK_COMPATIBILITY_ID);
1298 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1299 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID);
1303 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
1304 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS", CJK_UNIFIED_IDEOGRAPHS_ID);
1308 public static final UnicodeBlock YI_SYLLABLES
1309 = new UnicodeBlock("YI_SYLLABLES", YI_SYLLABLES_ID);
1313 public static final UnicodeBlock YI_RADICALS
1314 = new UnicodeBlock("YI_RADICALS", YI_RADICALS_ID);
1318 public static final UnicodeBlock HANGUL_SYLLABLES
1319 = new UnicodeBlock("HANGUL_SYLLABLES", HANGUL_SYLLABLES_ID);
1323 public static final UnicodeBlock HIGH_SURROGATES
1324 = new UnicodeBlock("HIGH_SURROGATES", HIGH_SURROGATES_ID);
1328 public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
1329 = new UnicodeBlock("HIGH_PRIVATE_USE_SURROGATES", HIGH_PRIVATE_USE_SURROGATES_ID);
1333 public static final UnicodeBlock LOW_SURROGATES
1334 = new UnicodeBlock("LOW_SURROGATES", LOW_SURROGATES_ID);
1336 * Same as public static final int PRIVATE_USE.
1337 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
1338 * and multiple code point ranges had this block.
1339 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
1340 * and adds separate blocks for the supplementary PUAs.
1343 public static final UnicodeBlock PRIVATE_USE_AREA
1344 = new UnicodeBlock("PRIVATE_USE_AREA", 78);
1346 * Same as public static final int PRIVATE_USE_AREA.
1347 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
1348 * and multiple code point ranges had this block.
1349 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
1350 * and adds separate blocks for the supplementary PUAs.
1353 public static final UnicodeBlock PRIVATE_USE
1358 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
1359 = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS", CJK_COMPATIBILITY_IDEOGRAPHS_ID);
1363 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
1364 = new UnicodeBlock("ALPHABETIC_PRESENTATION_FORMS", ALPHABETIC_PRESENTATION_FORMS_ID);
1368 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
1369 = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_A", ARABIC_PRESENTATION_FORMS_A_ID);
1373 public static final UnicodeBlock COMBINING_HALF_MARKS
1374 = new UnicodeBlock("COMBINING_HALF_MARKS", COMBINING_HALF_MARKS_ID);
1378 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
1379 = new UnicodeBlock("CJK_COMPATIBILITY_FORMS", CJK_COMPATIBILITY_FORMS_ID);
1383 public static final UnicodeBlock SMALL_FORM_VARIANTS
1384 = new UnicodeBlock("SMALL_FORM_VARIANTS", SMALL_FORM_VARIANTS_ID);
1388 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
1389 = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_B", ARABIC_PRESENTATION_FORMS_B_ID);
1393 public static final UnicodeBlock SPECIALS
1394 = new UnicodeBlock("SPECIALS", SPECIALS_ID);
1398 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
1399 = new UnicodeBlock("HALFWIDTH_AND_FULLWIDTH_FORMS", HALFWIDTH_AND_FULLWIDTH_FORMS_ID);
1403 public static final UnicodeBlock OLD_ITALIC
1404 = new UnicodeBlock("OLD_ITALIC", OLD_ITALIC_ID);
1408 public static final UnicodeBlock GOTHIC
1409 = new UnicodeBlock("GOTHIC", GOTHIC_ID);
1413 public static final UnicodeBlock DESERET
1414 = new UnicodeBlock("DESERET", DESERET_ID);
1418 public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
1419 = new UnicodeBlock("BYZANTINE_MUSICAL_SYMBOLS", BYZANTINE_MUSICAL_SYMBOLS_ID);
1423 public static final UnicodeBlock MUSICAL_SYMBOLS
1424 = new UnicodeBlock("MUSICAL_SYMBOLS", MUSICAL_SYMBOLS_ID);
1428 public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
1429 = new UnicodeBlock("MATHEMATICAL_ALPHANUMERIC_SYMBOLS", MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID);
1433 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1434 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID);
1438 public static final UnicodeBlock
1439 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
1440 = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID);
1444 public static final UnicodeBlock TAGS
1445 = new UnicodeBlock("TAGS", TAGS_ID);
1447 // New blocks in Unicode 3.2
1450 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
1453 public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
1454 = new UnicodeBlock("CYRILLIC_SUPPLEMENTARY", CYRILLIC_SUPPLEMENTARY_ID);
1456 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
1459 public static final UnicodeBlock CYRILLIC_SUPPLEMENT
1460 = new UnicodeBlock("CYRILLIC_SUPPLEMENT", CYRILLIC_SUPPLEMENT_ID);
1464 public static final UnicodeBlock TAGALOG
1465 = new UnicodeBlock("TAGALOG", TAGALOG_ID);
1469 public static final UnicodeBlock HANUNOO
1470 = new UnicodeBlock("HANUNOO", HANUNOO_ID);
1474 public static final UnicodeBlock BUHID
1475 = new UnicodeBlock("BUHID", BUHID_ID);
1479 public static final UnicodeBlock TAGBANWA
1480 = new UnicodeBlock("TAGBANWA", TAGBANWA_ID);
1484 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
1485 = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID);
1489 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
1490 = new UnicodeBlock("SUPPLEMENTAL_ARROWS_A", SUPPLEMENTAL_ARROWS_A_ID);
1494 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
1495 = new UnicodeBlock("SUPPLEMENTAL_ARROWS_B", SUPPLEMENTAL_ARROWS_B_ID);
1499 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
1500 = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID);
1504 public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
1505 = new UnicodeBlock("SUPPLEMENTAL_MATHEMATICAL_OPERATORS", SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID);
1509 public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
1510 = new UnicodeBlock("KATAKANA_PHONETIC_EXTENSIONS", KATAKANA_PHONETIC_EXTENSIONS_ID);
1514 public static final UnicodeBlock VARIATION_SELECTORS
1515 = new UnicodeBlock("VARIATION_SELECTORS", VARIATION_SELECTORS_ID);
1519 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
1520 = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_A", SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID);
1524 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
1525 = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_B", SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID);
1530 public static final UnicodeBlock LIMBU
1531 = new UnicodeBlock("LIMBU", LIMBU_ID);
1535 public static final UnicodeBlock TAI_LE
1536 = new UnicodeBlock("TAI_LE", TAI_LE_ID);
1540 public static final UnicodeBlock KHMER_SYMBOLS
1541 = new UnicodeBlock("KHMER_SYMBOLS", KHMER_SYMBOLS_ID);
1546 public static final UnicodeBlock PHONETIC_EXTENSIONS
1547 = new UnicodeBlock("PHONETIC_EXTENSIONS", PHONETIC_EXTENSIONS_ID);
1552 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
1553 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS_AND_ARROWS", MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID);
1557 public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
1558 = new UnicodeBlock("YIJING_HEXAGRAM_SYMBOLS", YIJING_HEXAGRAM_SYMBOLS_ID);
1562 public static final UnicodeBlock LINEAR_B_SYLLABARY
1563 = new UnicodeBlock("LINEAR_B_SYLLABARY", LINEAR_B_SYLLABARY_ID);
1567 public static final UnicodeBlock LINEAR_B_IDEOGRAMS
1568 = new UnicodeBlock("LINEAR_B_IDEOGRAMS", LINEAR_B_IDEOGRAMS_ID);
1572 public static final UnicodeBlock AEGEAN_NUMBERS
1573 = new UnicodeBlock("AEGEAN_NUMBERS", AEGEAN_NUMBERS_ID);
1577 public static final UnicodeBlock UGARITIC
1578 = new UnicodeBlock("UGARITIC", UGARITIC_ID);
1582 public static final UnicodeBlock SHAVIAN
1583 = new UnicodeBlock("SHAVIAN", SHAVIAN_ID);
1587 public static final UnicodeBlock OSMANYA
1588 = new UnicodeBlock("OSMANYA", OSMANYA_ID);
1592 public static final UnicodeBlock CYPRIOT_SYLLABARY
1593 = new UnicodeBlock("CYPRIOT_SYLLABARY", CYPRIOT_SYLLABARY_ID);
1597 public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
1598 = new UnicodeBlock("TAI_XUAN_JING_SYMBOLS", TAI_XUAN_JING_SYMBOLS_ID);
1603 public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
1604 = new UnicodeBlock("VARIATION_SELECTORS_SUPPLEMENT", VARIATION_SELECTORS_SUPPLEMENT_ID);
1606 /* New blocks in Unicode 4.1 */
1611 public static final UnicodeBlock ANCIENT_GREEK_MUSICAL_NOTATION = new UnicodeBlock("ANCIENT_GREEK_MUSICAL_NOTATION", ANCIENT_GREEK_MUSICAL_NOTATION_ID); /*[1D200]*/
1616 public static final UnicodeBlock ANCIENT_GREEK_NUMBERS = new UnicodeBlock("ANCIENT_GREEK_NUMBERS", ANCIENT_GREEK_NUMBERS_ID); /*[10140]*/
1621 public static final UnicodeBlock ARABIC_SUPPLEMENT = new UnicodeBlock("ARABIC_SUPPLEMENT", ARABIC_SUPPLEMENT_ID); /*[0750]*/
1626 public static final UnicodeBlock BUGINESE = new UnicodeBlock("BUGINESE", BUGINESE_ID); /*[1A00]*/
1631 public static final UnicodeBlock CJK_STROKES = new UnicodeBlock("CJK_STROKES", CJK_STROKES_ID); /*[31C0]*/
1636 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_SUPPLEMENT", COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID); /*[1DC0]*/
1641 public static final UnicodeBlock COPTIC = new UnicodeBlock("COPTIC", COPTIC_ID); /*[2C80]*/
1646 public static final UnicodeBlock ETHIOPIC_EXTENDED = new UnicodeBlock("ETHIOPIC_EXTENDED", ETHIOPIC_EXTENDED_ID); /*[2D80]*/
1651 public static final UnicodeBlock ETHIOPIC_SUPPLEMENT = new UnicodeBlock("ETHIOPIC_SUPPLEMENT", ETHIOPIC_SUPPLEMENT_ID); /*[1380]*/
1656 public static final UnicodeBlock GEORGIAN_SUPPLEMENT = new UnicodeBlock("GEORGIAN_SUPPLEMENT", GEORGIAN_SUPPLEMENT_ID); /*[2D00]*/
1661 public static final UnicodeBlock GLAGOLITIC = new UnicodeBlock("GLAGOLITIC", GLAGOLITIC_ID); /*[2C00]*/
1666 public static final UnicodeBlock KHAROSHTHI = new UnicodeBlock("KHAROSHTHI", KHAROSHTHI_ID); /*[10A00]*/
1671 public static final UnicodeBlock MODIFIER_TONE_LETTERS = new UnicodeBlock("MODIFIER_TONE_LETTERS", MODIFIER_TONE_LETTERS_ID); /*[A700]*/
1676 public static final UnicodeBlock NEW_TAI_LUE = new UnicodeBlock("NEW_TAI_LUE", NEW_TAI_LUE_ID); /*[1980]*/
1681 public static final UnicodeBlock OLD_PERSIAN = new UnicodeBlock("OLD_PERSIAN", OLD_PERSIAN_ID); /*[103A0]*/
1686 public static final UnicodeBlock PHONETIC_EXTENSIONS_SUPPLEMENT = new UnicodeBlock("PHONETIC_EXTENSIONS_SUPPLEMENT", PHONETIC_EXTENSIONS_SUPPLEMENT_ID); /*[1D80]*/
1691 public static final UnicodeBlock SUPPLEMENTAL_PUNCTUATION = new UnicodeBlock("SUPPLEMENTAL_PUNCTUATION", SUPPLEMENTAL_PUNCTUATION_ID); /*[2E00]*/
1696 public static final UnicodeBlock SYLOTI_NAGRI = new UnicodeBlock("SYLOTI_NAGRI", SYLOTI_NAGRI_ID); /*[A800]*/
1701 public static final UnicodeBlock TIFINAGH = new UnicodeBlock("TIFINAGH", TIFINAGH_ID); /*[2D30]*/
1706 public static final UnicodeBlock VERTICAL_FORMS = new UnicodeBlock("VERTICAL_FORMS", VERTICAL_FORMS_ID); /*[FE10]*/
1711 public static final UnicodeBlock NKO = new UnicodeBlock("NKO", NKO_ID); /*[07C0]*/
1715 public static final UnicodeBlock BALINESE = new UnicodeBlock("BALINESE", BALINESE_ID); /*[1B00]*/
1719 public static final UnicodeBlock LATIN_EXTENDED_C = new UnicodeBlock("LATIN_EXTENDED_C", LATIN_EXTENDED_C_ID); /*[2C60]*/
1723 public static final UnicodeBlock LATIN_EXTENDED_D = new UnicodeBlock("LATIN_EXTENDED_D", LATIN_EXTENDED_D_ID); /*[A720]*/
1727 public static final UnicodeBlock PHAGS_PA = new UnicodeBlock("PHAGS_PA", PHAGS_PA_ID); /*[A840]*/
1731 public static final UnicodeBlock PHOENICIAN = new UnicodeBlock("PHOENICIAN", PHOENICIAN_ID); /*[10900]*/
1735 public static final UnicodeBlock CUNEIFORM = new UnicodeBlock("CUNEIFORM", CUNEIFORM_ID); /*[12000]*/
1739 public static final UnicodeBlock CUNEIFORM_NUMBERS_AND_PUNCTUATION = new UnicodeBlock("CUNEIFORM_NUMBERS_AND_PUNCTUATION", CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID); /*[12400]*/
1743 public static final UnicodeBlock COUNTING_ROD_NUMERALS = new UnicodeBlock("COUNTING_ROD_NUMERALS", COUNTING_ROD_NUMERALS_ID); /*[1D360]*/
1748 public static final UnicodeBlock SUNDANESE = new UnicodeBlock("SUNDANESE", SUNDANESE_ID); /* [1B80] */
1753 public static final UnicodeBlock LEPCHA = new UnicodeBlock("LEPCHA", LEPCHA_ID); /* [1C00] */
1758 public static final UnicodeBlock OL_CHIKI = new UnicodeBlock("OL_CHIKI", OL_CHIKI_ID); /* [1C50] */
1763 public static final UnicodeBlock CYRILLIC_EXTENDED_A = new UnicodeBlock("CYRILLIC_EXTENDED_A", CYRILLIC_EXTENDED_A_ID); /* [2DE0] */
1768 public static final UnicodeBlock VAI = new UnicodeBlock("VAI", VAI_ID); /* [A500] */
1773 public static final UnicodeBlock CYRILLIC_EXTENDED_B = new UnicodeBlock("CYRILLIC_EXTENDED_B", CYRILLIC_EXTENDED_B_ID); /* [A640] */
1778 public static final UnicodeBlock SAURASHTRA = new UnicodeBlock("SAURASHTRA", SAURASHTRA_ID); /* [A880] */
1783 public static final UnicodeBlock KAYAH_LI = new UnicodeBlock("KAYAH_LI", KAYAH_LI_ID); /* [A900] */
1788 public static final UnicodeBlock REJANG = new UnicodeBlock("REJANG", REJANG_ID); /* [A930] */
1793 public static final UnicodeBlock CHAM = new UnicodeBlock("CHAM", CHAM_ID); /* [AA00] */
1798 public static final UnicodeBlock ANCIENT_SYMBOLS = new UnicodeBlock("ANCIENT_SYMBOLS", ANCIENT_SYMBOLS_ID); /* [10190] */
1803 public static final UnicodeBlock PHAISTOS_DISC = new UnicodeBlock("PHAISTOS_DISC", PHAISTOS_DISC_ID); /* [101D0] */
1808 public static final UnicodeBlock LYCIAN = new UnicodeBlock("LYCIAN", LYCIAN_ID); /* [10280] */
1813 public static final UnicodeBlock CARIAN = new UnicodeBlock("CARIAN", CARIAN_ID); /* [102A0] */
1818 public static final UnicodeBlock LYDIAN = new UnicodeBlock("LYDIAN", LYDIAN_ID); /* [10920] */
1823 public static final UnicodeBlock MAHJONG_TILES = new UnicodeBlock("MAHJONG_TILES", MAHJONG_TILES_ID); /* [1F000] */
1828 public static final UnicodeBlock DOMINO_TILES = new UnicodeBlock("DOMINO_TILES", DOMINO_TILES_ID); /* [1F030] */
1832 public static final UnicodeBlock INVALID_CODE
1833 = new UnicodeBlock("INVALID_CODE", INVALID_CODE_ID);
1835 // public methods --------------------------------------------------
1838 * Gets the only instance of the UnicodeBlock with the argument ID.
1839 * If no such ID exists, a INVALID_CODE UnicodeBlock will be returned.
1840 * @param id UnicodeBlock ID
1841 * @return the only instance of the UnicodeBlock with the argument ID
1842 * if it exists, otherwise a INVALID_CODE UnicodeBlock will be
1846 public static UnicodeBlock getInstance(int id)
1848 if (id >= 0 && id < BLOCKS_.length) {
1851 return INVALID_CODE;
1855 * Returns the Unicode allocation block that contains the code point,
1856 * or null if the code point is not a member of a defined block.
1857 * @param ch code point to be tested
1858 * @return the Unicode allocation block that contains the code point
1861 public static UnicodeBlock of(int ch)
1863 if (ch > MAX_VALUE) {
1864 return INVALID_CODE;
1867 return UnicodeBlock.getInstance((PROPERTY_.getAdditional(ch, 0)
1868 & BLOCK_MASK_) >> BLOCK_SHIFT_);
1872 * Internal function returning of(ch).getID().
1875 * @return numeric block value
1878 static int idOf(int ch) {
1879 if (ch < 0 || ch > MAX_VALUE) {
1883 return (PROPERTY_.getAdditional(ch, 0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
1887 * Cover the JDK 1.5 API. Return the Unicode block with the
1888 * given name. <br/><b>Note</b>: Unlike JDK 1.5, this only matches
1889 * against the official UCD name and the Java block name
1891 * @param blockName the name of the block to match
1892 * @return the UnicodeBlock with that name
1893 * @throws IllegalArgumentException if the blockName could not be matched
1896 public static final UnicodeBlock forName(String blockName) {
1899 m = (Map)mref.get();
1902 m = new HashMap(BLOCKS_.length);
1903 for (int i = 0; i < BLOCKS_.length; ++i) {
1904 UnicodeBlock b = BLOCKS_[i];
1905 String name = trimBlockName(getPropertyValueName(UProperty.BLOCK, b.getID(), UProperty.NameChoice.LONG));
1908 mref = new SoftReference(m);
1910 UnicodeBlock b = (UnicodeBlock)m.get(trimBlockName(blockName));
1912 throw new IllegalArgumentException();
1916 private static SoftReference mref;
1918 private static String trimBlockName(String name) {
1919 String upper = name.toUpperCase();
1920 StringBuffer result = new StringBuffer(upper.length());
1921 for (int i = 0; i < upper.length(); i++) {
1922 char c = upper.charAt(i);
1923 if (c != ' ' && c != '_' && c != '-') {
1927 return result.toString();
1931 * Returns the type ID of this Unicode block
1932 * @return integer type ID of this Unicode block
1940 // private data members ---------------------------------------------
1943 * Array of UnicodeBlocks, for easy access in getInstance(int)
1945 private final static UnicodeBlock BLOCKS_[] = {
1946 NO_BLOCK, BASIC_LATIN,
1947 LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A,
1948 LATIN_EXTENDED_B, IPA_EXTENSIONS,
1949 SPACING_MODIFIER_LETTERS, COMBINING_DIACRITICAL_MARKS,
1961 HANGUL_JAMO, ETHIOPIC,
1962 CHEROKEE, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
1965 LATIN_EXTENDED_ADDITIONAL, GREEK_EXTENDED,
1966 GENERAL_PUNCTUATION, SUPERSCRIPTS_AND_SUBSCRIPTS,
1967 CURRENCY_SYMBOLS, COMBINING_MARKS_FOR_SYMBOLS,
1968 LETTERLIKE_SYMBOLS, NUMBER_FORMS,
1969 ARROWS, MATHEMATICAL_OPERATORS,
1970 MISCELLANEOUS_TECHNICAL, CONTROL_PICTURES,
1971 OPTICAL_CHARACTER_RECOGNITION, ENCLOSED_ALPHANUMERICS,
1972 BOX_DRAWING, BLOCK_ELEMENTS,
1973 GEOMETRIC_SHAPES, MISCELLANEOUS_SYMBOLS,
1974 DINGBATS, BRAILLE_PATTERNS,
1975 CJK_RADICALS_SUPPLEMENT, KANGXI_RADICALS,
1976 IDEOGRAPHIC_DESCRIPTION_CHARACTERS, CJK_SYMBOLS_AND_PUNCTUATION,
1978 BOPOMOFO, HANGUL_COMPATIBILITY_JAMO,
1979 KANBUN, BOPOMOFO_EXTENDED,
1980 ENCLOSED_CJK_LETTERS_AND_MONTHS, CJK_COMPATIBILITY,
1981 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, CJK_UNIFIED_IDEOGRAPHS,
1982 YI_SYLLABLES, YI_RADICALS,
1983 HANGUL_SYLLABLES, HIGH_SURROGATES,
1984 HIGH_PRIVATE_USE_SURROGATES, LOW_SURROGATES,
1985 PRIVATE_USE_AREA, CJK_COMPATIBILITY_IDEOGRAPHS,
1986 ALPHABETIC_PRESENTATION_FORMS, ARABIC_PRESENTATION_FORMS_A,
1987 COMBINING_HALF_MARKS, CJK_COMPATIBILITY_FORMS,
1988 SMALL_FORM_VARIANTS, ARABIC_PRESENTATION_FORMS_B,
1989 SPECIALS, HALFWIDTH_AND_FULLWIDTH_FORMS,
1991 DESERET, BYZANTINE_MUSICAL_SYMBOLS,
1992 MUSICAL_SYMBOLS, MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
1993 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
1994 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
1995 TAGS, CYRILLIC_SUPPLEMENT,
1998 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, SUPPLEMENTAL_ARROWS_A,
1999 SUPPLEMENTAL_ARROWS_B, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
2000 SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
2001 KATAKANA_PHONETIC_EXTENSIONS,
2002 VARIATION_SELECTORS, SUPPLEMENTARY_PRIVATE_USE_AREA_A,
2003 SUPPLEMENTARY_PRIVATE_USE_AREA_B,
2004 LIMBU, TAI_LE, KHMER_SYMBOLS, PHONETIC_EXTENSIONS,
2005 MISCELLANEOUS_SYMBOLS_AND_ARROWS, YIJING_HEXAGRAM_SYMBOLS,
2006 LINEAR_B_SYLLABARY, LINEAR_B_IDEOGRAMS, AEGEAN_NUMBERS,
2007 UGARITIC, SHAVIAN, OSMANYA, CYPRIOT_SYLLABARY,
2008 TAI_XUAN_JING_SYMBOLS, VARIATION_SELECTORS_SUPPLEMENT,
2010 /* New blocks in Unicode 4.1 */
2011 ANCIENT_GREEK_MUSICAL_NOTATION,
2012 ANCIENT_GREEK_NUMBERS,
2016 COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
2019 ETHIOPIC_SUPPLEMENT,
2020 GEORGIAN_SUPPLEMENT,
2023 MODIFIER_TONE_LETTERS,
2026 PHONETIC_EXTENSIONS_SUPPLEMENT,
2027 SUPPLEMENTAL_PUNCTUATION,
2038 CUNEIFORM_NUMBERS_AND_PUNCTUATION,
2039 COUNTING_ROD_NUMERALS,
2041 /* New blocks in Unicode 5.1 */
2045 CYRILLIC_EXTENDED_A,
2047 CYRILLIC_EXTENDED_B,
2062 if (COUNT!=BLOCKS_.length) {
2063 throw new java.lang.IllegalStateException("UnicodeBlock fields are inconsistent!");
2067 * Identification code for this UnicodeBlock
2071 // private constructor ----------------------------------------------
2074 * UnicodeBlock constructor
2075 * @param name name of this UnicodeBlock
2076 * @param id unique id of this UnicodeBlock
2077 * @exception NullPointerException if name is <code>null</code>
2079 private UnicodeBlock(String name, int id)
/**
 * East Asian Width constants.
 * @see UProperty#EAST_ASIAN_WIDTH
 * @see UCharacter#getIntPropertyValue
 */
public interface EastAsianWidth
{
    public static final int NEUTRAL = 0;
    public static final int AMBIGUOUS = 1;
    public static final int HALFWIDTH = 2;
    public static final int FULLWIDTH = 3;
    public static final int NARROW = 4;
    public static final int WIDE = 5;
    /** Number of East Asian Width values defined above. */
    public static final int COUNT = 6;
}
/**
 * Decomposition Type constants.
 * @see UProperty#DECOMPOSITION_TYPE
 */
public interface DecompositionType
{
    public static final int NONE = 0;
    public static final int CANONICAL = 1;
    public static final int COMPAT = 2;
    public static final int CIRCLE = 3;
    public static final int FINAL = 4;
    public static final int FONT = 5;
    public static final int FRACTION = 6;
    public static final int INITIAL = 7;
    public static final int ISOLATED = 8;
    public static final int MEDIAL = 9;
    public static final int NARROW = 10;
    public static final int NOBREAK = 11;
    public static final int SMALL = 12;
    public static final int SQUARE = 13;
    public static final int SUB = 14;
    public static final int SUPER = 15;
    public static final int VERTICAL = 16;
    public static final int WIDE = 17;
    /** Number of Decomposition Type values defined above. */
    public static final int COUNT = 18;
}
/**
 * Joining Type constants.
 * @see UProperty#JOINING_TYPE
 */
public interface JoiningType
{
    public static final int NON_JOINING = 0;
    public static final int JOIN_CAUSING = 1;
    public static final int DUAL_JOINING = 2;
    public static final int LEFT_JOINING = 3;
    public static final int RIGHT_JOINING = 4;
    public static final int TRANSPARENT = 5;
    /** Number of Joining Type values defined above. */
    public static final int COUNT = 6;
}
/**
 * Joining Group constants.
 * @see UProperty#JOINING_GROUP
 */
public interface JoiningGroup
{
    public static final int NO_JOINING_GROUP = 0;
    public static final int AIN = 1;
    public static final int ALAPH = 2;
    public static final int ALEF = 3;
    public static final int BEH = 4;
    public static final int BETH = 5;
    public static final int DAL = 6;
    public static final int DALATH_RISH = 7;
    public static final int E = 8;
    public static final int FEH = 9;
    public static final int FINAL_SEMKATH = 10;
    public static final int GAF = 11;
    public static final int GAMAL = 12;
    public static final int HAH = 13;
    public static final int HAMZA_ON_HEH_GOAL = 14;
    public static final int HE = 15;
    public static final int HEH = 16;
    public static final int HEH_GOAL = 17;
    public static final int HETH = 18;
    public static final int KAF = 19;
    public static final int KAPH = 20;
    public static final int KNOTTED_HEH = 21;
    public static final int LAM = 22;
    public static final int LAMADH = 23;
    public static final int MEEM = 24;
    public static final int MIM = 25;
    public static final int NOON = 26;
    public static final int NUN = 27;
    public static final int PE = 28;
    public static final int QAF = 29;
    public static final int QAPH = 30;
    public static final int REH = 31;
    public static final int REVERSED_PE = 32;
    public static final int SAD = 33;
    public static final int SADHE = 34;
    public static final int SEEN = 35;
    public static final int SEMKATH = 36;
    public static final int SHIN = 37;
    public static final int SWASH_KAF = 38;
    public static final int SYRIAC_WAW = 39;
    public static final int TAH = 40;
    public static final int TAW = 41;
    public static final int TEH_MARBUTA = 42;
    public static final int TETH = 43;
    public static final int WAW = 44;
    public static final int YEH = 45;
    public static final int YEH_BARREE = 46;
    public static final int YEH_WITH_TAIL = 47;
    public static final int YUDH = 48;
    public static final int YUDH_HE = 49;
    public static final int ZAIN = 50;
    public static final int FE = 51;
    public static final int KHAPH = 52;
    public static final int ZHAIN = 53;
    public static final int BURUSHASKI_YEH_BARREE = 54;
    /** Number of Joining Group values defined above. */
    public static final int COUNT = 55;
}
/**
 * Grapheme Cluster Break constants.
 * @see UProperty#GRAPHEME_CLUSTER_BREAK
 */
public interface GraphemeClusterBreak {
    public static final int OTHER = 0;
    public static final int CONTROL = 1;
    public static final int CR = 2;
    public static final int EXTEND = 3;
    public static final int L = 4;
    public static final int LF = 5;
    public static final int LV = 6;
    public static final int LVT = 7;
    public static final int T = 8;
    public static final int V = 9;
    public static final int SPACING_MARK = 10;
    public static final int PREPEND = 11;
    /** Number of Grapheme Cluster Break values defined above. */
    public static final int COUNT = 12;
}
/**
 * Word Break constants.
 * @see UProperty#WORD_BREAK
 */
public interface WordBreak {
    public static final int OTHER = 0;
    public static final int ALETTER = 1;
    public static final int FORMAT = 2;
    public static final int KATAKANA = 3;
    public static final int MIDLETTER = 4;
    public static final int MIDNUM = 5;
    public static final int NUMERIC = 6;
    public static final int EXTENDNUMLET = 7;
    public static final int CR = 8;
    public static final int EXTEND = 9;
    public static final int LF = 10;
    public static final int MIDNUMLET = 11;
    public static final int NEWLINE = 12;
    /** Number of Word Break values defined above. */
    public static final int COUNT = 13;
}
/**
 * Sentence Break constants.
 * @see UProperty#SENTENCE_BREAK
 */
public interface SentenceBreak {
    public static final int OTHER = 0;
    public static final int ATERM = 1;
    public static final int CLOSE = 2;
    public static final int FORMAT = 3;
    public static final int LOWER = 4;
    public static final int NUMERIC = 5;
    public static final int OLETTER = 6;
    public static final int SEP = 7;
    public static final int SP = 8;
    public static final int STERM = 9;
    public static final int UPPER = 10;
    public static final int CR = 11;
    public static final int EXTEND = 12;
    public static final int LF = 13;
    public static final int SCONTINUE = 14;
    /** Number of Sentence Break values defined above. */
    public static final int COUNT = 15;
}
/**
 * Line Break constants.
 * @see UProperty#LINE_BREAK
 */
public interface LineBreak
{
    public static final int UNKNOWN = 0;
    public static final int AMBIGUOUS = 1;
    public static final int ALPHABETIC = 2;
    public static final int BREAK_BOTH = 3;
    public static final int BREAK_AFTER = 4;
    public static final int BREAK_BEFORE = 5;
    public static final int MANDATORY_BREAK = 6;
    public static final int CONTINGENT_BREAK = 7;
    public static final int CLOSE_PUNCTUATION = 8;
    public static final int COMBINING_MARK = 9;
    public static final int CARRIAGE_RETURN = 10;
    public static final int EXCLAMATION = 11;
    public static final int GLUE = 12;
    public static final int HYPHEN = 13;
    public static final int IDEOGRAPHIC = 14;
    /**
     * Misspelled legacy name; it has the same value as
     * {@link #INSEPARABLE}.
     */
    public static final int INSEPERABLE = 15;
    /**
     * Renamed from the misspelled "inseperable" in Unicode 4.0.1.
     */
    public static final int INSEPARABLE = 15;
    public static final int INFIX_NUMERIC = 16;
    public static final int LINE_FEED = 17;
    public static final int NONSTARTER = 18;
    public static final int NUMERIC = 19;
    public static final int OPEN_PUNCTUATION = 20;
    public static final int POSTFIX_NUMERIC = 21;
    public static final int PREFIX_NUMERIC = 22;
    public static final int QUOTATION = 23;
    public static final int COMPLEX_CONTEXT = 24;
    public static final int SURROGATE = 25;
    public static final int SPACE = 26;
    public static final int BREAK_SYMBOLS = 27;
    public static final int ZWSPACE = 28;
    public static final int NEXT_LINE = 29;       /*[NL]*/ /* from here on: new in Unicode 4/ICU 2.6 */
    public static final int WORD_JOINER = 30;      /*[WJ]*/

    /* from here on: new in Unicode 4.1/ICU 3.4 */
    public static final int H2 = 31;
    public static final int H3 = 32;
    public static final int JL = 33;
    public static final int JT = 34;
    public static final int JV = 35;
    /** Number of distinct Line Break values defined above. */
    public static final int COUNT = 36;
}
/**
 * Numeric Type constants.
 * @see UProperty#NUMERIC_TYPE
 */
public interface NumericType
{
    public static final int NONE = 0;
    public static final int DECIMAL = 1;
    public static final int DIGIT = 2;
    public static final int NUMERIC = 3;
    /** Number of Numeric Type values defined above. */
    public static final int COUNT = 4;
}
/**
 * Hangul Syllable Type constants.
 *
 * @see UProperty#HANGUL_SYLLABLE_TYPE
 */
public interface HangulSyllableType
{
    public static final int NOT_APPLICABLE = 0;   /*[NA]*/ /*See note !!*/
    public static final int LEADING_JAMO = 1;     /*[L]*/
    public static final int VOWEL_JAMO = 2;       /*[V]*/
    public static final int TRAILING_JAMO = 3;    /*[T]*/
    public static final int LV_SYLLABLE = 4;      /*[LV]*/
    public static final int LVT_SYLLABLE = 5;     /*[LVT]*/
    /** Number of Hangul Syllable Type values defined above. */
    public static final int COUNT = 6;
}
2911 // public data members -----------------------------------------------
2914 * The lowest Unicode code point value.
2917 public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
2920 * The highest Unicode code point value (scalar value) according to the
2922 * This is a 21-bit value (21 bits, rounded up).<br>
2923 * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
2926 public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
2929 * The minimum value for Supplementary code points
2932 public static final int SUPPLEMENTARY_MIN_VALUE =
2933 UTF16.SUPPLEMENTARY_MIN_VALUE;
2936 * Unicode value used when translating into Unicode encoding form and there
2937 * is no existing character.
2940 public static final int REPLACEMENT_CHAR = '\uFFFD';
2943 * Special value that is returned by getUnicodeNumericValue(int) when no
2944 * numeric value is defined for a code point.
2946 * @see #getUnicodeNumericValue
2948 public static final double NO_NUMERIC_VALUE = -123456789;
2951 * Compatibility constant for Java Character's MIN_RADIX.
2954 public static final int MIN_RADIX = java.lang.Character.MIN_RADIX;
2957 * Compatibility constant for Java Character's MAX_RADIX.
2960 public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
2963 * Do not lowercase non-initial parts of words when titlecasing.
2964 * Option bit for titlecasing APIs that take an options bit set.
2966 * By default, titlecasing will titlecase the first cased character
2967 * of a word and lowercase all other characters.
2968 * With this option, the other characters will not be modified.
2973 public static final int TITLECASE_NO_LOWERCASE = 0x100;
2976 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
2977 * titlecase exactly the characters at breaks from the iterator.
2978 * Option bit for titlecasing APIs that take an options bit set.
2980 * By default, titlecasing will take each break iterator index,
2981 * adjust it by looking for the next cased character, and titlecase that one.
2982 * Other characters are lowercased.
2984 * This follows Unicode 4 & 5 section 3.13 Default Case Operations:
2986 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
2987 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
2988 * cased character F. If F exists, map F to default_title(F); then map each
2989 * subsequent character C to default_lower(C).
2992 * @see #TITLECASE_NO_LOWERCASE
2995 public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
2997 // public methods ----------------------------------------------------
3000 * Retrieves the numeric value of a decimal digit code point.
3001 * <br>This method observes the semantics of
3002 * <code>java.lang.Character.digit()</code>. Note that this
3003 * will return positive values for code points for which isDigit
3004 * returns false, just like java.lang.Character.
3005 * <br><em>Semantic Change:</em> In release 1.3.1 and
3006 * prior, this did not treat the European letters as having a
3007 * digit value, and also treated numeric letters and other numbers as
3009 * This has been changed to conform to the java semantics.
3010 * <br>A code point is a valid digit if and only if:
3012 * <li>ch is a decimal digit or one of the european letters, and
3013 * <li>the value of ch is less than the specified radix.
3015 * @param ch the code point to query
3016 * @param radix the radix
3017 * @return the numeric value represented by the code point in the
3018 * specified radix, or -1 if the code point is not a decimal digit
3019 * or if its value is too large for the radix
3022 public static int digit(int ch, int radix)
3024 // when ch is out of bounds getProperty == 0
3025 int props = getProperty(ch);
3027 if (getNumericType(props) == NumericType.DECIMAL) {
3028 value = UCharacterProperty.getUnsignedValue(props);
3030 value = getEuropeanDigit(ch);
3032 return (0 <= value && value < radix) ? value : -1;
3036 * Retrieves the numeric value of a decimal digit code point.
3037 * <br>This is a convenience overload of <code>digit(int, int)</code>
3038 * that provides a decimal radix.
3039 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
3040 * treated numeric letters and other numbers as digits. This has
3041 * been changed to conform to the java semantics.
3042 * @param ch the code point to query
3043 * @return the numeric value represented by the code point,
3044 * or -1 if the code point is not a decimal digit or if its
3045 * value is too large for a decimal radix
3048 public static int digit(int ch)
3050 int props = getProperty(ch);
3051 if (getNumericType(props) == NumericType.DECIMAL) {
3052 return UCharacterProperty.getUnsignedValue(props);
3059 * Returns the numeric value of the code point as a nonnegative
3061 * <br>If the code point does not have a numeric value, then -1 is returned.
3063 * If the code point has a numeric value that cannot be represented as a
3064 * nonnegative integer (for example, a fractional value), then -2 is
3066 * @param ch the code point to query
3067 * @return the numeric value of the code point, or -1 if it has no numeric
3068 * value, or -2 if it has a numeric value that cannot be represented as a
3069 * nonnegative integer
3072 public static int getNumericValue(int ch)
3074 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
3075 int props = PROPERTY_.getProperty(ch);
3076 int numericType = getNumericType(props);
3078 if(numericType==0) {
3079 return getEuropeanDigit(ch);
3081 if(numericType==UCharacterProperty.NT_FRACTION || numericType>=UCharacterProperty.NT_COUNT) {
3085 int numericValue = UCharacterProperty.getUnsignedValue(props);
3087 if(numericType<NumericType.COUNT) {
3088 /* normal type, the value is stored directly */
3089 return numericValue;
3090 } else /* numericType==NT_LARGE */ {
3091 /* large value with exponent */
3095 mant=numericValue>>LARGE_MANT_SHIFT;
3096 exp=numericValue&LARGE_EXP_MASK;
3099 exp+=LARGE_EXP_OFFSET_EXTRA;
3101 return -2; /* reserved mantissa value */
3103 exp+=LARGE_EXP_OFFSET;
3111 /* multiply by 10^exp without math.h */
3130 if(numValue<=Integer.MAX_VALUE) {
3131 return (int)numValue;
3139 * <p>Get the numeric value for a Unicode code point as defined in the
3140 * Unicode Character Database.</p>
3141 * <p>A "double" return type is necessary because some numeric values are
3142 * fractions, negative, or too large for int.</p>
3143 * <p>For characters without any numeric values in the Unicode Character
3144 * Database, this function will return NO_NUMERIC_VALUE.</p>
3145 * <p><em>API Change:</em> In release 2.2 and prior, this API has a
3146 * return type int and returns -1 when the argument ch does not have a
3147 * corresponding numeric value. This has been changed to synch with ICU4C
3149 * This corresponds to the ICU4C function u_getNumericValue.
3150 * @param ch Code point to get the numeric value for.
3151 * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
3154 public static double getUnicodeNumericValue(int ch)
3156 // equivalent to c version double u_getNumericValue(UChar32 c)
3157 int props = PROPERTY_.getProperty(ch);
3158 int numericType = getNumericType(props);
3160 if(numericType==0 || numericType>=UCharacterProperty.NT_COUNT) {
3161 return NO_NUMERIC_VALUE;
3164 int numericValue = UCharacterProperty.getUnsignedValue(props);
3166 if(numericType<NumericType.COUNT) {
3167 /* normal type, the value is stored directly */
3168 return numericValue;
3169 } else if(numericType==UCharacterProperty.NT_FRACTION) {
3170 /* fraction value */
3171 int numerator, denominator;
3173 numerator=numericValue>>FRACTION_NUM_SHIFT;
3174 denominator=(numericValue&FRACTION_DEN_MASK)+FRACTION_DEN_OFFSET;
3179 return (double)numerator/(double)denominator;
3180 } else /* numericType==NT_LARGE */ {
3181 /* large value with exponent */
3185 mant=numericValue>>LARGE_MANT_SHIFT;
3186 exp=numericValue&LARGE_EXP_MASK;
3189 exp+=LARGE_EXP_OFFSET_EXTRA;
3191 return NO_NUMERIC_VALUE; /* reserved mantissa value */
3193 exp+=LARGE_EXP_OFFSET;
3198 /* multiply by 10^exp without math.h */
3223 * Compatibility override of Java deprecated method. This
3224 * method will always remain deprecated. Delegates to
3225 * java.lang.Character.isSpace.
3226 * @param ch the code point
3227 * @return true if the code point is a space character as
3228 * defined by java.lang.Character.isSpace.
3229 * @deprecated ICU 3.4 (Java)
3231 public static boolean isSpace(int ch) {
3232 return ch <= 0x20 &&
3233 (ch == 0x20 || ch == 0x09 || ch == 0x0a || ch == 0x0c || ch == 0x0d);
3237 * Returns a value indicating a code point's Unicode category.
3238 * Up-to-date Unicode implementation of java.lang.Character.getType()
3239 * except for the above mentioned code points that had their category
3241 * Return results are constants from the interface
3242 * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
3243 * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
3244 * those returned by java.lang.Character.getType. UCharacterCategory values
3245 * match the ones used in ICU4C, while java.lang.Character type
3246 * values, though similar, skip the value 17.</p>
3247 * @param ch code point whose type is to be determined
3248 * @return category which is a value of UCharacterCategory
3251 public static int getType(int ch)
3253 return getProperty(ch) & UCharacterProperty.TYPE_MASK;
3257 * Determines if a code point has a defined meaning in the up-to-date
3259 * E.g. supplementary code points though allocated space are not defined in
3261 * Up-to-date Unicode implementation of java.lang.Character.isDefined()
3262 * @param ch code point to be determined if it is defined in the most
3263 * current version of Unicode
3264 * @return true if this code point is defined in unicode
3267 public static boolean isDefined(int ch)
3269 return getType(ch) != 0;
3273 * Determines if a code point is a Java digit.
3274 * <br>This method observes the semantics of
3275 * <code>java.lang.Character.isDigit()</code>. It returns true for decimal
3277 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this treated
3278 * numeric letters and other numbers as digits.
3279 * This has been changed to conform to the java semantics.
3280 * @param ch code point to query
3281 * @return true if this code point is a digit
3284 public static boolean isDigit(int ch)
3286 return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
3290 * Determines if the specified code point is an ISO control character.
3291 * A code point is considered to be an ISO control character if it is in
3292 * the range \u0000 through \u001F or in the range \u007F through
3294 * Up-to-date Unicode implementation of java.lang.Character.isISOControl()
3295 * @param ch code point to determine if it is an ISO control character
 * @return true if code point is an ISO control character
3299 public static boolean isISOControl(int ch)
3301 return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ &&
3302 ((ch <= UNIT_SEPARATOR_) || (ch >= DELETE_));
3306 * Determines if the specified code point is a letter.
3307 * Up-to-date Unicode implementation of java.lang.Character.isLetter()
3308 * @param ch code point to determine if it is a letter
3309 * @return true if code point is a letter
3312 public static boolean isLetter(int ch)
3314 // if props == 0, it will just fall through and return false
3315 return ((1 << getType(ch))
3316 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3317 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3318 | (1 << UCharacterCategory.TITLECASE_LETTER)
3319 | (1 << UCharacterCategory.MODIFIER_LETTER)
3320 | (1 << UCharacterCategory.OTHER_LETTER))) != 0;
3324 * Determines if the specified code point is a letter or digit.
3325 * Note this method, unlike java.lang.Character does not regard the ascii
3326 * characters 'A' - 'Z' and 'a' - 'z' as digits.
3327 * @param ch code point to determine if it is a letter or a digit
3328 * @return true if code point is a letter or a digit
3331 public static boolean isLetterOrDigit(int ch)
3333 return ((1 << getType(ch))
3334 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3335 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3336 | (1 << UCharacterCategory.TITLECASE_LETTER)
3337 | (1 << UCharacterCategory.MODIFIER_LETTER)
3338 | (1 << UCharacterCategory.OTHER_LETTER)
3339 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER))) != 0;
3343 * Compatibility override of Java deprecated method. This
3344 * method will always remain deprecated. Delegates to
3345 * java.lang.Character.isJavaIdentifierStart.
3346 * @param cp the code point
3347 * @return true if the code point can start a java identifier.
3348 * @deprecated ICU 3.4 (Java)
3350 public static boolean isJavaLetter(int cp) {
3351 return isJavaIdentifierStart(cp);
3355 * Compatibility override of Java deprecated method. This
3356 * method will always remain deprecated. Delegates to
3357 * java.lang.Character.isJavaIdentifierPart.
3358 * @param cp the code point
3359 * @return true if the code point can continue a java identifier.
3360 * @deprecated ICU 3.4 (Java)
3362 public static boolean isJavaLetterOrDigit(int cp) {
3363 return isJavaIdentifierPart(cp);
3367 * Compatibility override of Java method, delegates to
3368 * java.lang.Character.isJavaIdentifierStart.
3369 * @param cp the code point
3370 * @return true if the code point can start a java identifier.
3373 public static boolean isJavaIdentifierStart(int cp) {
3374 // note, downcast to char for jdk 1.4 compatibility
3375 return java.lang.Character.isJavaIdentifierStart((char)cp);
3379 * Compatibility override of Java method, delegates to
3380 * java.lang.Character.isJavaIdentifierPart.
3381 * @param cp the code point
3382 * @return true if the code point can continue a java identifier.
3385 public static boolean isJavaIdentifierPart(int cp) {
3386 // note, downcast to char for jdk 1.4 compatibility
3387 return java.lang.Character.isJavaIdentifierPart((char)cp);
3391 * Determines if the specified code point is a lowercase character.
3392 * UnicodeData only contains case mappings for code points where they are
3393 * one-to-one mappings; it also omits information about context-sensitive
3394 * case mappings.<br> For more information about Unicode case mapping
3395 * please refer to the
3396 * <a href=http://www.unicode.org/unicode/reports/tr21/>Technical report
3398 * Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
3399 * @param ch code point to determine if it is in lowercase
3400 * @return true if code point is a lowercase character
3403 public static boolean isLowerCase(int ch)
3405 // if props == 0, it will just fall through and return false
3406 return getType(ch) == UCharacterCategory.LOWERCASE_LETTER;
3410 * Determines if the specified code point is a white space character.
 * A code point is considered to be a whitespace character if and only
3412 * if it satisfies one of the following criteria:
3414 * <li> It is a Unicode space character (categories "Zs" or "Zl" or "Zp"), but is not
3415 * also a no-break space (\u00A0 or \u2007 or \u202F).
3416 * <li> It is \u0009, HORIZONTAL TABULATION.
3417 * <li> It is \u000A, LINE FEED.
3418 * <li> It is \u000B, VERTICAL TABULATION.
3419 * <li> It is \u000C, FORM FEED.
3420 * <li> It is \u000D, CARRIAGE RETURN.
3421 * <li> It is \u001C, FILE SEPARATOR.
3422 * <li> It is \u001D, GROUP SEPARATOR.
3423 * <li> It is \u001E, RECORD SEPARATOR.
3424 * <li> It is \u001F, UNIT SEPARATOR.
3427 * This API tries to synch to the semantics of the Java API,
3428 * java.lang.Character.isWhitespace(), but it may not return
3429 * the exactly same results because of the Unicode version
3431 * @param ch code point to determine if it is a white space
3432 * @return true if the specified code point is a white space character
3435 public static boolean isWhitespace(int ch)
3437 // exclude no-break spaces
3438 // if props == 0, it will just fall through and return false
3439 return ((1 << getType(ch)) &
3440 ((1 << UCharacterCategory.SPACE_SEPARATOR)
3441 | (1 << UCharacterCategory.LINE_SEPARATOR)
3442 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR))) != 0
3443 && (ch != NO_BREAK_SPACE_) && (ch != FIGURE_SPACE_) && (ch != NARROW_NO_BREAK_SPACE_)
3444 // TAB VT LF FF CR FS GS RS US NL are all control characters
3445 // that are white spaces.
3446 || (ch >= 0x9 && ch <= 0xd) || (ch >= 0x1c && ch <= 0x1f);
3450 * Determines if the specified code point is a Unicode specified space
3451 * character, i.e. if code point is in the category Zs, Zl and Zp.
3452 * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
3453 * @param ch code point to determine if it is a space
3454 * @return true if the specified code point is a space character
3457 public static boolean isSpaceChar(int ch)
3459 // if props == 0, it will just fall through and return false
3460 return ((1 << getType(ch)) & ((1 << UCharacterCategory.SPACE_SEPARATOR)
3461 | (1 << UCharacterCategory.LINE_SEPARATOR)
3462 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR)))
3467 * Determines if the specified code point is a titlecase character.
3468 * UnicodeData only contains case mappings for code points where they are
3469 * one-to-one mappings; it also omits information about context-sensitive
3470 * case mappings.<br>
3471 * For more information about Unicode case mapping please refer to the
3472 * <a href=http://www.unicode.org/unicode/reports/tr21/>
3473 * Technical report #21</a>.<br>
3474 * Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
3475 * @param ch code point to determine if it is in title case
3476 * @return true if the specified code point is a titlecase character
3479 public static boolean isTitleCase(int ch)
3481 // if props == 0, it will just fall through and return false
3482 return getType(ch) == UCharacterCategory.TITLECASE_LETTER;
3486 * Determines if the specified code point may be any part of a Unicode
3487 * identifier other than the starting character.
3488 * A code point may be part of a Unicode identifier if and only if it is
3489 * one of the following:
3491 * <li> Lu Uppercase letter
3492 * <li> Ll Lowercase letter
3493 * <li> Lt Titlecase letter
3494 * <li> Lm Modifier letter
3495 * <li> Lo Other letter
3496 * <li> Nl Letter number
3497 * <li> Pc Connecting punctuation character
3498 * <li> Nd decimal number
3499 * <li> Mc Spacing combining mark
3500 * <li> Mn Non-spacing mark
3501 * <li> Cf formatting code
3503 * Up-to-date Unicode implementation of
3504 * java.lang.Character.isUnicodeIdentifierPart().<br>
3505 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
 * @param ch code point to determine if it can be part of a Unicode
3508 * @return true if code point is any character belonging a unicode
3509 * identifier suffix after the first character
3512 public static boolean isUnicodeIdentifierPart(int ch)
3514 // if props == 0, it will just fall through and return false
3516 return ((1 << getType(ch))
3517 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3518 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3519 | (1 << UCharacterCategory.TITLECASE_LETTER)
3520 | (1 << UCharacterCategory.MODIFIER_LETTER)
3521 | (1 << UCharacterCategory.OTHER_LETTER)
3522 | (1 << UCharacterCategory.LETTER_NUMBER)
3523 | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
3524 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
3525 | (1 << UCharacterCategory.COMBINING_SPACING_MARK)
3526 | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
3527 || isIdentifierIgnorable(ch);
3531 * Determines if the specified code point is permissible as the first
3532 * character in a Unicode identifier.
3533 * A code point may start a Unicode identifier if it is of type either
3535 * <li> Lu Uppercase letter
3536 * <li> Ll Lowercase letter
3537 * <li> Lt Titlecase letter
3538 * <li> Lm Modifier letter
3539 * <li> Lo Other letter
3540 * <li> Nl Letter number
3542 * Up-to-date Unicode implementation of
3543 * java.lang.Character.isUnicodeIdentifierStart().<br>
3544 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3545 * @param ch code point to determine if it can start a Unicode identifier
3546 * @return true if code point is the first character belonging a unicode
3550 public static boolean isUnicodeIdentifierStart(int ch)
3552 /*int cat = getType(ch);*/
3553 // if props == 0, it will just fall through and return false
3554 return ((1 << getType(ch))
3555 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
3556 | (1 << UCharacterCategory.LOWERCASE_LETTER)
3557 | (1 << UCharacterCategory.TITLECASE_LETTER)
3558 | (1 << UCharacterCategory.MODIFIER_LETTER)
3559 | (1 << UCharacterCategory.OTHER_LETTER)
3560 | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
3564 * Determines if the specified code point should be regarded as an
3565 * ignorable character in a Unicode identifier.
3566 * A character is ignorable in the Unicode standard if it is of the type
3567 * Cf, Formatting code.<br>
3568 * Up-to-date Unicode implementation of
3569 * java.lang.Character.isIdentifierIgnorable().<br>
3570 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
3571 * @param ch code point to be determined if it can be ignored in a Unicode
3573 * @return true if the code point is ignorable
3576 public static boolean isIdentifierIgnorable(int ch)
3578 // see java.lang.Character.isIdentifierIgnorable() on range of
3579 // ignorable characters.
3581 return isISOControl(ch)
3582 && !((ch >= 0x9 && ch <= 0xd)
3583 || (ch >= 0x1c && ch <= 0x1f));
3585 return getType(ch) == UCharacterCategory.FORMAT;
3589 * Determines if the specified code point is an uppercase character.
3590 * UnicodeData only contains case mappings for code point where they are
3591 * one-to-one mappings; it also omits information about context-sensitive
3592 * case mappings.<br>
3593 * For language specific case conversion behavior, use
3594 * toUpperCase(locale, str). <br>
3595 * For example, the case conversion for dot-less i and dotted I in Turkish,
3596 * or for final sigma in Greek.
3597 * For more information about Unicode case mapping please refer to the
3598 * <a href=http://www.unicode.org/unicode/reports/tr21/>
3599 * Technical report #21</a>.<br>
3600 * Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
3601 * @param ch code point to determine if it is in uppercase
3602 * @return true if the code point is an uppercase character
3605 public static boolean isUpperCase(int ch)
3607 // if props == 0, it will just fall through and return false
3608 return getType(ch) == UCharacterCategory.UPPERCASE_LETTER;
3612 * The given code point is mapped to its lowercase equivalent; if the code
3613 * point has no lowercase equivalent, the code point itself is returned.
3614 * Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
3616 * <p>This function only returns the simple, single-code point case mapping.
3617 * Full case mappings should be used whenever possible because they produce
3618 * better results by working on whole strings.
3619 * They take into account the string context and the language and can map
3620 * to a result string with a different length as appropriate.
3621 * Full case mappings are applied by the case mapping functions
3622 * that take String parameters rather than code points (int).
3623 * See also the User Guide chapter on C/POSIX migration:
3624 * http://www.icu-project.org/userguide/posix.html#case_mappings
3626 * @param ch code point whose lowercase equivalent is to be retrieved
3627 * @return the lowercase equivalent code point
3630 public static int toLowerCase(int ch) {
3631 return gCsp.tolower(ch);
3635 * Converts argument code point and returns a String object representing
3636 * the code point's value in UTF16 format.
3637 * The result is a string whose length is 1 for non-supplementary code
3638 * points, 2 otherwise.<br>
 * com.ibm.icu.text.UTF16 can be used to parse Strings generated by this
3641 * Up-to-date Unicode implementation of java.lang.Character.toString()
3642 * @param ch code point
3643 * @return string representation of the code point, null if code point is not
3644 * defined in unicode
3647 public static String toString(int ch)
3649 if (ch < MIN_VALUE || ch > MAX_VALUE) {
3653 if (ch < SUPPLEMENTARY_MIN_VALUE) {
3654 return String.valueOf((char)ch);
3657 StringBuffer result = new StringBuffer();
3658 result.append(UTF16.getLeadSurrogate(ch));
3659 result.append(UTF16.getTrailSurrogate(ch));
3660 return result.toString();
3664 * Converts the code point argument to titlecase.
3665 * If no titlecase is available, the uppercase is returned. If no uppercase
3666 * is available, the code point itself is returned.
3667 * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
3669 * <p>This function only returns the simple, single-code point case mapping.
3670 * Full case mappings should be used whenever possible because they produce
3671 * better results by working on whole strings.
3672 * They take into account the string context and the language and can map
3673 * to a result string with a different length as appropriate.
3674 * Full case mappings are applied by the case mapping functions
3675 * that take String parameters rather than code points (int).
3676 * See also the User Guide chapter on C/POSIX migration:
3677 * http://www.icu-project.org/userguide/posix.html#case_mappings
3679 * @param ch code point whose title case is to be retrieved
3680 * @return titlecase code point
3683 public static int toTitleCase(int ch) {
3684 return gCsp.totitle(ch);
3688 * Converts the character argument to uppercase.
3689 * If no uppercase is available, the character itself is returned.
3690 * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
3692 * <p>This function only returns the simple, single-code point case mapping.
3693 * Full case mappings should be used whenever possible because they produce
3694 * better results by working on whole strings.
3695 * They take into account the string context and the language and can map
3696 * to a result string with a different length as appropriate.
3697 * Full case mappings are applied by the case mapping functions
3698 * that take String parameters rather than code points (int).
3699 * See also the User Guide chapter on C/POSIX migration:
3700 * http://www.icu-project.org/userguide/posix.html#case_mappings
3702 * @param ch code point whose uppercase is to be retrieved
3703 * @return uppercase code point
3706 public static int toUpperCase(int ch) {
3707 return gCsp.toupper(ch);
3710 // extra methods not in java.lang.Character --------------------------
3713 * Determines if the code point is a supplementary character.
 * A code point is a supplementary character if and only if it is greater
 * than or equal to <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
3716 * @param ch code point to be determined if it is in the supplementary
3718 * @return true if code point is a supplementary character
3721 public static boolean isSupplementary(int ch)
3723 return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE &&
3724 ch <= UCharacter.MAX_VALUE;
3728 * Determines if the code point is in the BMP plane.
3729 * @param ch code point to be determined if it is not a supplementary
3731 * @return true if code point is not a supplementary character
3734 public static boolean isBMP(int ch)
3736 return (ch >= 0 && ch <= LAST_CHAR_MASK_);
3740 * Determines whether the specified code point is a printable character
3741 * according to the Unicode standard.
3742 * @param ch code point to be determined if it is printable
3743 * @return true if the code point is a printable character
3746 public static boolean isPrintable(int ch)
3748 int cat = getType(ch);
3749 // if props == 0, it will just fall through and return false
3750 return (cat != UCharacterCategory.UNASSIGNED &&
3751 cat != UCharacterCategory.CONTROL &&
3752 cat != UCharacterCategory.FORMAT &&
3753 cat != UCharacterCategory.PRIVATE_USE &&
3754 cat != UCharacterCategory.SURROGATE &&
3755 cat != UCharacterCategory.GENERAL_OTHER_TYPES);
3759 * Determines whether the specified code point is of base form.
3760 * A code point of base form does not graphically combine with preceding
3761 * characters, and is neither a control nor a format character.
3762 * @param ch code point to be determined if it is of base form
3763 * @return true if the code point is of base form
3766 public static boolean isBaseForm(int ch)
3768 int cat = getType(ch);
3769 // if props == 0, it will just fall through and return false
3770 return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
3771 cat == UCharacterCategory.OTHER_NUMBER ||
3772 cat == UCharacterCategory.LETTER_NUMBER ||
3773 cat == UCharacterCategory.UPPERCASE_LETTER ||
3774 cat == UCharacterCategory.LOWERCASE_LETTER ||
3775 cat == UCharacterCategory.TITLECASE_LETTER ||
3776 cat == UCharacterCategory.MODIFIER_LETTER ||
3777 cat == UCharacterCategory.OTHER_LETTER ||
3778 cat == UCharacterCategory.NON_SPACING_MARK ||
3779 cat == UCharacterCategory.ENCLOSING_MARK ||
3780 cat == UCharacterCategory.COMBINING_SPACING_MARK;
3784 * Returns the Bidirection property of a code point.
3785 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
3787 * Result returned belongs to the interface
3788 * <a href=UCharacterDirection.html>UCharacterDirection</a>
3789 * @param ch the code point to be determined its direction
3790 * @return direction constant from UCharacterDirection.
3793 public static int getDirection(int ch)
3795 return gBdp.getClass(ch);
3799 * Determines whether the code point has the "mirrored" property.
3800 * This property is set for characters that are commonly used in
3801 * Right-To-Left contexts and need to be displayed with a "mirrored"
3803 * @param ch code point whose mirror is to be determined
3804 * @return true if the code point has the "mirrored" property
3807 public static boolean isMirrored(int ch)
3809 return gBdp.isMirrored(ch);
3813 * Maps the specified code point to a "mirror-image" code point.
3814 * For code points with the "mirrored" property, implementations sometimes
3815 * need a "poor man's" mapping to another code point such that the default
3816 * glyph may serve as the mirror-image of the default glyph of the
3817 * specified code point.<br>
3818 * This is useful for text conversion to and from codepages with visual
3819 * order, and for displays without glyph selection capabilities.
3820 * @param ch code point whose mirror is to be retrieved
3821 * @return another code point that may serve as a mirror-image substitute,
3822 * or ch itself if there is no such mapping or ch does not have the
3823 * "mirrored" property
3826 public static int getMirror(int ch)
3828 return gBdp.getMirror(ch);
3832 * Gets the combining class of the argument codepoint
3833 * @param ch code point whose combining is to be retrieved
3834 * @return the combining class of the codepoint
3837 public static int getCombiningClass(int ch)
3839 if (ch < MIN_VALUE || ch > MAX_VALUE) {
3840 throw new IllegalArgumentException("Codepoint out of bounds");
3842 return NormalizerImpl.getCombiningClass(ch);
3846 * A code point is illegal if and only if
3848 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
3849 * <li> A surrogate value, 0xD800 to 0xDFFF
3850 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
3852 * Note: legal does not mean that it is assigned in this version of Unicode.
3853 * @param ch code point to determine if it is a legal code point by itself
3854 * @return true if and only if legal.
3857 public static boolean isLegal(int ch)
3859 if (ch < MIN_VALUE) {
3862 if (ch < UTF16.SURROGATE_MIN_VALUE) {
3865 if (ch <= UTF16.SURROGATE_MAX_VALUE) {
3868 if (UCharacterUtility.isNonCharacter(ch)) {
3871 return (ch <= MAX_VALUE);
3875 * A string is legal iff all its code points are legal.
3876 * A code point is illegal if and only if
3878 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
3879 * <li> A surrogate value, 0xD800 to 0xDFFF
3880 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
3882 * Note: legal does not mean that it is assigned in this version of Unicode.
 * @param str containing code points to examine
3884 * @return true if and only if legal.
3887 public static boolean isLegal(String str)
3889 int size = str.length();
3891 for (int i = 0; i < size; i ++)
3893 codepoint = UTF16.charAt(str, i);
3894 if (!isLegal(codepoint)) {
3897 if (isSupplementary(codepoint)) {
3905 * Gets the version of Unicode data used.
3906 * @return the unicode version number used
3909 public static VersionInfo getUnicodeVersion()
3911 return PROPERTY_.m_unicodeVersion_;
3915 * Retrieve the most current Unicode name of the argument code point, or
3916 * null if the character is unassigned or outside the range
3917 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
3919 * Note calling any methods related to code point names, e.g. get*Name*()
3920 * incurs a one-time initialisation cost to construct the name tables.
3921 * @param ch the code point for which to get the name
3922 * @return most current Unicode name
3925 public static String getName(int ch)
3928 throw new MissingResourceException("Could not load unames.icu","","");
3930 return NAME_.getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
3934 * Gets the names for each of the characters in a string
3935 * @param s string to format
3936 * @param separator string to go between names
3937 * @return string of names
3940 public static String getName(String s, String separator) {
3941 if (s.length() == 1) { // handle common case
3942 return getName(s.charAt(0));
3945 StringBuffer sb = new StringBuffer();
3946 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
3947 cp = UTF16.charAt(s,i);
3948 if (i != 0) sb.append(separator);
3949 sb.append(UCharacter.getName(cp));
3951 return sb.toString();
3955 * Retrieve the earlier version 1.0 Unicode name of the argument code
3956 * point, or null if the character is unassigned or outside the range
3957 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
3959 * Note calling any methods related to code point names, e.g. get*Name*()
3960 * incurs a one-time initialisation cost to construct the name tables.
3961 * @param ch the code point for which to get the name
3962 * @return version 1.0 Unicode name
3965 public static String getName1_0(int ch)
3968 throw new MissingResourceException("Could not load unames.icu","","");
3970 return NAME_.getName(ch,
3971 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
3975 * <p>Retrieves a name for a valid codepoint. Unlike, getName(int) and
3976 * getName1_0(int), this method will return a name even for codepoints that
3977 * are not assigned a name in UnicodeData.txt.
3979 * The names are returned in the following order.
3981 * <li> Most current Unicode name if there is any
3982 * <li> Unicode 1.0 name if there is any
3983 * <li> Extended name in the form of
3984 * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-fffe>
3986 * Note calling any methods related to code point names, e.g. get*Name*()
3987 * incurs a one-time initialisation cost to construct the name tables.
3988 * @param ch the code point for which to get the name
3989 * @return a name for the argument codepoint
3992 public static String getExtendedName(int ch)
3995 throw new MissingResourceException("Could not load unames.icu","","");
3997 return NAME_.getName(ch, UCharacterNameChoice.EXTENDED_CHAR_NAME);
4001 * Get the ISO 10646 comment for a character.
4002 * The ISO 10646 comment is an informative field in the Unicode Character
4003 * Database (UnicodeData.txt field 11) and is from the ISO 10646 names list.
4004 * @param ch The code point for which to get the ISO comment.
 * It must be <code>0&lt;=ch&lt;=0x10ffff</code>.
4006 * @return The ISO comment, or null if there is no comment for this
4010 public static String getISOComment(int ch)
4012 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE) {
4016 throw new MissingResourceException("Could not load unames.icu","","");
4018 String result = NAME_.getGroupName(ch,
4019 UCharacterNameChoice.ISO_COMMENT_);
4024 * <p>Find a Unicode code point by its most current Unicode name and
4025 * return its code point value. All Unicode names are in uppercase.</p>
4026 * Note calling any methods related to code point names, e.g. get*Name*()
4027 * incurs a one-time initialisation cost to construct the name tables.
4028 * @param name most current Unicode character name whose code point is to
4030 * @return code point or -1 if name is not found
4033 public static int getCharFromName(String name)
4036 throw new MissingResourceException("Could not load unames.icu","","");
4038 return NAME_.getCharFromName(
4039 UCharacterNameChoice.UNICODE_CHAR_NAME, name);
4043 * <p>Find a Unicode character by its version 1.0 Unicode name and return
4044 * its code point value. All Unicode names are in uppercase.</p>
4045 * Note calling any methods related to code point names, e.g. get*Name*()
4046 * incurs a one-time initialisation cost to construct the name tables.
4047 * @param name Unicode 1.0 code point name whose code point is to
4049 * @return code point or -1 if name is not found
4052 public static int getCharFromName1_0(String name)
4055 throw new MissingResourceException("Could not load unames.icu","","");
4057 return NAME_.getCharFromName(
4058 UCharacterNameChoice.UNICODE_10_CHAR_NAME, name);
4062 * <p>Find a Unicode character by either its name and return its code
4063 * point value. All Unicode names are in uppercase.
4064 * Extended names are all lowercase except for numbers and are contained
4065 * within angle brackets.</p>
4066 * The names are searched in the following order
4068 * <li> Most current Unicode name if there is any
4069 * <li> Unicode 1.0 name if there is any
4070 * <li> Extended name in the form of
4071 * "<codepoint_type-codepoint_hex_digits>". E.g. <noncharacter-FFFE>
4073 * Note calling any methods related to code point names, e.g. get*Name*()
4074 * incurs a one-time initialisation cost to construct the name tables.
4075 * @param name codepoint name
4076 * @return code point associated with the name or -1 if the name is not
4080 public static int getCharFromExtendedName(String name)
4083 throw new MissingResourceException("Could not load unames.icu","","");
4085 return NAME_.getCharFromName(
4086 UCharacterNameChoice.EXTENDED_CHAR_NAME, name);
4090 * Return the Unicode name for a given property, as given in the
4091 * Unicode database file PropertyAliases.txt. Most properties
4092 * have more than one name. The nameChoice determines which one
4095 * In addition, this function maps the property
4096 * UProperty.GENERAL_CATEGORY_MASK to the synthetic names "gcm" /
4097 * "General_Category_Mask". These names are not in
4098 * PropertyAliases.txt.
4100 * @param property UProperty selector.
4102 * @param nameChoice UProperty.NameChoice selector for which name
4103 * to get. All properties have a long name. Most have a short
4104 * name, but some do not. Unicode allows for additional names; if
4105 * present these will be returned by UProperty.NameChoice.LONG + i,
4108 * @return a name, or null if Unicode explicitly defines no name
4109 * ("n/a") for a given property/nameChoice. If a given nameChoice
4110 * throws an exception, then all larger values of nameChoice will
4111 * throw an exception. If null is returned for a given
4112 * nameChoice, then other nameChoice values may return non-null
4115 * @exception IllegalArgumentException thrown if property or
4116 * nameChoice are invalid.
4119 * @see UProperty.NameChoice
4122 public static String getPropertyName(int property,
4124 return PNAMES_.getPropertyName(property, nameChoice);
4128 * Return the UProperty selector for a given property name, as
4129 * specified in the Unicode database file PropertyAliases.txt.
4130 * Short, long, and any other variants are recognized.
4132 * In addition, this function maps the synthetic names "gcm" /
4133 * "General_Category_Mask" to the property
4134 * UProperty.GENERAL_CATEGORY_MASK. These names are not in
4135 * PropertyAliases.txt.
4137 * @param propertyAlias the property name to be matched. The name
4138 * is compared using "loose matching" as described in
4139 * PropertyAliases.txt.
4141 * @return a UProperty enum.
4143 * @exception IllegalArgumentException thrown if propertyAlias
4144 * is not recognized.
4149 public static int getPropertyEnum(String propertyAlias) {
4150 return PNAMES_.getPropertyEnum(propertyAlias);
4154 * Return the Unicode name for a given property value, as given in
4155 * the Unicode database file PropertyValueAliases.txt. Most
4156 * values have more than one name. The nameChoice determines
4157 * which one is returned.
4159 * Note: Some of the names in PropertyValueAliases.txt can only be
4160 * retrieved using UProperty.GENERAL_CATEGORY_MASK, not
4161 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
4162 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
4163 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
4165 * @param property UProperty selector constant.
4166 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
4167 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
4168 * UProperty.MASK_START <= property < UProperty.MASK_LIMIT.
4169 * If out of range, null is returned.
4171 * @param value selector for a value for the given property. In
4172 * general, valid values range from 0 up to some maximum. There
4173 * are a few exceptions: (1.) UProperty.BLOCK values begin at the
4174 * non-zero value BASIC_LATIN.getID(). (2.)
4175 * UProperty.CANONICAL_COMBINING_CLASS values are not contiguous
4176 * and range from 0..240. (3.) UProperty.GENERAL_CATEGORY_MASK values
4177 * are mask values produced by left-shifting 1 by
4178 * UCharacter.getType(). This allows grouped categories such as
4179 * [:L:] to be represented. Mask values are non-contiguous.
4181 * @param nameChoice UProperty.NameChoice selector for which name
4182 * to get. All values have a long name. Most have a short name,
4183 * but some do not. Unicode allows for additional names; if
4184 * present these will be returned by UProperty.NameChoice.LONG + i,
4187 * @return a name, or null if Unicode explicitly defines no name
4188 * ("n/a") for a given property/value/nameChoice. If a given
4189 * nameChoice throws an exception, then all larger values of
4190 * nameChoice will throw an exception. If null is returned for a
4191 * given nameChoice, then other nameChoice values may return
4194 * @exception IllegalArgumentException thrown if property, value,
4195 * or nameChoice are invalid.
4198 * @see UProperty.NameChoice
4201 public static String getPropertyValueName(int property,
4205 if ((property == UProperty.CANONICAL_COMBINING_CLASS
4206 || property == UProperty.LEAD_CANONICAL_COMBINING_CLASS
4207 || property == UProperty.TRAIL_CANONICAL_COMBINING_CLASS)
4208 && value >= UCharacter.getIntPropertyMinValue(
4209 UProperty.CANONICAL_COMBINING_CLASS)
4210 && value <= UCharacter.getIntPropertyMaxValue(
4211 UProperty.CANONICAL_COMBINING_CLASS)
4212 && nameChoice >= 0 && nameChoice < UProperty.NameChoice.COUNT) {
4213 // this is hard coded for the valid cc
4214 // because PropertyValueAliases.txt does not contain all of them
4216 return PNAMES_.getPropertyValueName(property, value,
4219 catch (IllegalArgumentException e) {
4223 return PNAMES_.getPropertyValueName(property, value, nameChoice);
4227 * Return the property value integer for a given value name, as
4228 * specified in the Unicode database file PropertyValueAliases.txt.
4229 * Short, long, and any other variants are recognized.
4231 * Note: Some of the names in PropertyValueAliases.txt will only be
4232 * recognized with UProperty.GENERAL_CATEGORY_MASK, not
4233 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
4234 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
4235 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
4237 * @param property UProperty selector constant.
4238 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
4239 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
4240 * UProperty.MASK_START <= property < UProperty.MASK_LIMIT.
4241 * Only these properties can be enumerated.
4243 * @param valueAlias the value name to be matched. The name is
4244 * compared using "loose matching" as described in
4245 * PropertyValueAliases.txt.
4247 * @return a value integer. Note: UProperty.GENERAL_CATEGORY_MASK
4248 * values are mask values produced by left-shifting 1 by
4249 * UCharacter.getType(). This allows grouped categories such as
4250 * [:L:] to be represented.
4253 * @throws IllegalArgumentException if property is not a valid UProperty
4257 public static int getPropertyValueEnum(int property, String valueAlias) {
4258 return PNAMES_.getPropertyValueEnum(property, valueAlias);
4262 * Returns a code point corresponding to the two UTF16 characters.
4263 * @param lead the lead char
4264 * @param trail the trail char
4265 * @return code point if surrogate characters are valid.
4266 * @exception IllegalArgumentException thrown when argument characters do
4267 * not form a valid codepoint
4270 public static int getCodePoint(char lead, char trail)
4272 if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
4273 return UCharacterProperty.getRawSupplementary(lead, trail);
4275 throw new IllegalArgumentException("Illegal surrogate characters");
4279 * Returns the code point corresponding to the UTF16 character.
4280 * @param char16 the UTF16 character
4281 * @return code point if argument is a valid character.
4282 * @exception IllegalArgumentException thrown when char16 is not a valid
4286 public static int getCodePoint(char char16)
4288 if (UCharacter.isLegal(char16)) {
4291 throw new IllegalArgumentException("Illegal codepoint");
4295 * Implementation of UCaseProps.ContextIterator, iterates over a String.
4296 * See ustrcase.c/utf16_caseContextIterator().
4298 private static class StringContextIterator implements UCaseProps.ContextIterator {
4301 * @param s String to iterate over.
4303 StringContextIterator(String s) {
4306 cpStart=cpLimit=index=0;
4311 * Set the iteration limit for nextCaseMapCP() to an index within the string.
4312 * If the limit parameter is negative or past the string, then the
4313 * string length is restored as the iteration limit.
4315 * This limit does not affect the next() function which always
4316 * iterates to the very end of the string.
4318 * @param lim The iteration limit.
4320 public void setLimit(int lim) {
4321 if(0<=lim && lim<=s.length()) {
4329 * Move to the iteration limit without fetching code points up to there.
4331 public void moveToLimit() {
4332 cpStart=cpLimit=limit;
4336 * Iterate forward through the string to fetch the next code point
4337 * to be case-mapped, and set the context indexes for it.
4338 * Performance optimization, to save on function calls and redundant
4339 * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
4341 * When the iteration limit is reached (and -1 is returned),
4342 * getCPStart() will be at the iteration limit.
4344 * Iteration with next() does not affect the position for nextCaseMapCP().
4346 * @return The next code point to be case-mapped, or <0 when the iteration is done.
4348 public int nextCaseMapCP() {
4351 int c=s.charAt(cpLimit++);
4352 if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
4354 if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
4355 UTF16.TRAIL_SURROGATE_MIN_VALUE<=(c2=s.charAt(cpLimit)) && c2<=UTF16.TRAIL_SURROGATE_MAX_VALUE
4357 // supplementary code point
4359 c=UCharacterProperty.getRawSupplementary((char)c, c2);
4360 // else unpaired surrogate code point
4362 // else BMP code point
4371 * Get the start of the code point that was last returned
4372 * by nextCaseMapCP().
4374 public int getCPStart() {
4379 * Get the limit of the code point that was last returned
4380 * by nextCaseMapCP().
4382 public int getCPLimit() {
4386 // implement UCaseProps.ContextIterator
4387 public void reset(int direction) {
4389 /* reset for forward iteration */
4392 } else if(direction<0) {
4393 /* reset for backward iteration */
4397 // not a valid direction
4406 if(dir>0 && index<s.length()) {
4407 c=UTF16.charAt(s, index);
4408 index+=UTF16.getCharCount(c);
4410 } else if(dir<0 && index>0) {
4411 c=UTF16.charAt(s, index-1);
4412 index-=UTF16.getCharCount(c);
4420 protected int index, limit, cpStart, cpLimit;
4421 protected int dir; // 0=initial state >0=forward <0=backward
4425 * Gets uppercase version of the argument string.
4426 * Casing is dependent on the default locale and context-sensitive.
4427 * @param str source string to be performed on
4428 * @return uppercase version of the argument string
4431 public static String toUpperCase(String str)
4433 return toUpperCase(ULocale.getDefault(), str);
4437 * Gets lowercase version of the argument string.
4438 * Casing is dependent on the default locale and context-sensitive
4439 * @param str source string to be performed on
4440 * @return lowercase version of the argument string
4443 public static String toLowerCase(String str)
4445 return toLowerCase(ULocale.getDefault(), str);
4449 * <p>Gets the titlecase version of the argument string.</p>
4450 * <p>Position for titlecasing is determined by the argument break
4451 * iterator, hence the user can customize his break iterator for
4452 * a specialized titlecasing. In this case only the forward iteration
4453 * needs to be implemented.
4454 * If the break iterator passed in is null, the default Unicode algorithm
4455 * will be used to determine the titlecase positions.
4457 * <p>Only positions returned by the break iterator will be title cased,
4458 * character in between the positions will all be in lower case.</p>
4459 * <p>Casing is dependent on the default locale and context-sensitive</p>
4460 * @param str source string to be performed on
4461 * @param breakiter break iterator to determine the positions in which
4462 * the character should be title cased.
4463 * @return titlecase version of the argument string
4466 public static String toTitleCase(String str, BreakIterator breakiter)
4468 return toTitleCase(ULocale.getDefault(), str, breakiter);
4472 * Gets uppercase version of the argument string.
4473 * Casing is dependent on the argument locale and context-sensitive.
4474 * @param locale which string is to be converted in
4475 * @param str source string to be performed on
4476 * @return uppercase version of the argument string
4479 public static String toUpperCase(Locale locale, String str)
4481 return toUpperCase(ULocale.forLocale(locale), str);
4485 * Gets uppercase version of the argument string.
4486 * Casing is dependent on the argument locale and context-sensitive.
4487 * @param locale which string is to be converted in
4488 * @param str source string to be performed on
4489 * @return uppercase version of the argument string
4492 public static String toUpperCase(ULocale locale, String str) {
4493 StringContextIterator iter = new StringContextIterator(str);
4494 StringBuffer result = new StringBuffer(str.length());
4495 int[] locCache = new int[1];
4498 if (locale == null) {
4499 locale = ULocale.getDefault();
4503 while((c=iter.nextCaseMapCP())>=0) {
4504 c=gCsp.toFullUpper(c, iter, result, locale, locCache);
4506 /* decode the result */
4508 /* (not) original code point */
4510 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4511 /* mapping already appended to result */
4513 /* } else { append single-code point mapping */
4516 result.append((char)c);
4518 UTF16.append(result, c);
4521 return result.toString();
4525 * Gets lowercase version of the argument string.
4526 * Casing is dependent on the argument locale and context-sensitive
4527 * @param locale which string is to be converted in
4528 * @param str source string to be performed on
4529 * @return lowercase version of the argument string
4532 public static String toLowerCase(Locale locale, String str)
4534 return toLowerCase(ULocale.forLocale(locale), str);
4538 * Gets lowercase version of the argument string.
4539 * Casing is dependent on the argument locale and context-sensitive
4540 * @param locale which string is to be converted in
4541 * @param str source string to be performed on
4542 * @return lowercase version of the argument string
4545 public static String toLowerCase(ULocale locale, String str) {
4546 StringContextIterator iter = new StringContextIterator(str);
4547 StringBuffer result = new StringBuffer(str.length());
4548 int[] locCache = new int[1];
4551 if (locale == null) {
4552 locale = ULocale.getDefault();
4556 while((c=iter.nextCaseMapCP())>=0) {
4557 c=gCsp.toFullLower(c, iter, result, locale, locCache);
4559 /* decode the result */
4561 /* (not) original code point */
4563 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4564 /* mapping already appended to result */
4566 /* } else { append single-code point mapping */
4569 result.append((char)c);
4571 UTF16.append(result, c);
4574 return result.toString();
4578 * <p>Gets the titlecase version of the argument string.</p>
4579 * <p>Position for titlecasing is determined by the argument break
4580 * iterator, hence the user can customize his break iterator for
4581 * a specialized titlecasing. In this case only the forward iteration
4582 * needs to be implemented.
4583 * If the break iterator passed in is null, the default Unicode algorithm
4584 * will be used to determine the titlecase positions.
4586 * <p>Only positions returned by the break iterator will be title cased,
4587 * character in between the positions will all be in lower case.</p>
4588 * <p>Casing is dependent on the argument locale and context-sensitive</p>
4589 * @param locale which string is to be converted in
4590 * @param str source string to be performed on
4591 * @param breakiter break iterator to determine the positions in which
4592 * the character should be title cased.
4593 * @return titlecase version of the argument string
4596 public static String toTitleCase(Locale locale, String str,
4597 BreakIterator breakiter)
4599 return toTitleCase(ULocale.forLocale(locale), str, breakiter);
4603 * <p>Gets the titlecase version of the argument string.</p>
4604 * <p>Position for titlecasing is determined by the argument break
4605 * iterator, hence the user can customize his break iterator for
4606 * a specialized titlecasing. In this case only the forward iteration
4607 * needs to be implemented.
4608 * If the break iterator passed in is null, the default Unicode algorithm
4609 * will be used to determine the titlecase positions.
4611 * <p>Only positions returned by the break iterator will be title cased,
4612 * character in between the positions will all be in lower case.</p>
4613 * <p>Casing is dependent on the argument locale and context-sensitive</p>
4614 * @param locale which string is to be converted in
4615 * @param str source string to be performed on
4616 * @param titleIter break iterator to determine the positions in which
4617 * the character should be title cased.
4618 * @return titlecase version of the argument string
4621 public static String toTitleCase(ULocale locale, String str,
4622 BreakIterator titleIter) {
4623 return toTitleCase(locale, str, titleIter, 0);
4627 * <p>Gets the titlecase version of the argument string.</p>
4628 * <p>Position for titlecasing is determined by the argument break
4629 * iterator, hence the user can customize his break iterator for
4630 * a specialized titlecasing. In this case only the forward iteration
4631 * needs to be implemented.
4632 * If the break iterator passed in is null, the default Unicode algorithm
4633 * will be used to determine the titlecase positions.
4635 * <p>Only positions returned by the break iterator will be title cased,
4636 * character in between the positions will all be in lower case.</p>
4637 * <p>Casing is dependent on the argument locale and context-sensitive</p>
4638 * @param locale which string is to be converted in
4639 * @param str source string to be performed on
4640 * @param titleIter break iterator to determine the positions in which
4641 * the character should be title cased.
4642 * @param options bit set to modify the titlecasing operation
4643 * @return titlecase version of the argument string
4645 * @see #TITLECASE_NO_LOWERCASE
4646 * @see #TITLECASE_NO_BREAK_ADJUSTMENT
4648 public static String toTitleCase(ULocale locale, String str,
4649 BreakIterator titleIter,
4651 StringContextIterator iter = new StringContextIterator(str);
4652 StringBuffer result = new StringBuffer(str.length());
4653 int[] locCache = new int[1];
4654 int c, nc, srcLength = str.length();
4656 if (locale == null) {
4657 locale = ULocale.getDefault();
4661 if(titleIter == null) {
4662 titleIter = BreakIterator.getWordInstance(locale);
4664 titleIter.setText(str);
4666 int prev, titleStart, index;
4667 boolean isFirstIndex;
4668 boolean isDutch = locale.getLanguage().equals("nl");
4669 boolean FirstIJ = true;
4671 /* set up local variables */
4675 /* titlecasing loop */
4676 while(prev<srcLength) {
4677 /* find next index where to titlecase */
4680 index=titleIter.first();
4682 index=titleIter.next();
4684 if(index==BreakIterator.DONE || index>srcLength) {
4689 * Unicode 4 & 5 section 3.13 Default Case Operations:
4691 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
4692 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
4693 * cased character F. If F exists, map F to default_title(F); then map each
4694 * subsequent character C to default_lower(C).
4696 * In this implementation, segment [prev..index[ into 3 parts:
4697 * a) uncased characters (copy as-is) [prev..titleStart[
4698 * b) first case letter (titlecase) [titleStart..titleLimit[
4699 * c) subsequent characters (lowercase) [titleLimit..index[
4702 /* find and copy uncased characters [prev..titleStart[ */
4703 iter.setLimit(index);
4704 c=iter.nextCaseMapCP();
4705 if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCaseProps.NONE==gCsp.getType(c)) {
4706 while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
4707 titleStart=iter.getCPStart();
4708 if(prev<titleStart) {
4709 // TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
4710 result.append(str.substring(prev, titleStart));
4716 if(titleStart<index) {
4718 /* titlecase c which is from titleStart */
4719 c=gCsp.toFullTitle(c, iter, result, locale, locCache);
4721 /* decode the result and lowercase up to index */
4724 /* (not) original code point */
4727 result.append((char)c);
4729 UTF16.append(result, c);
4731 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4732 /* mapping already appended to result */
4734 /* append single-code point mapping */
4736 result.append((char)c);
4738 UTF16.append(result, c);
4742 if((options&TITLECASE_NO_LOWERCASE)!=0) {
4743 /* Optionally just copy the rest of the word unchanged. */
4745 int titleLimit=iter.getCPLimit();
4746 if(titleLimit<index) {
4747 // TODO: With Java 5, this would want to be result.append(str, titleLimit, index);
4748 String appendStr = str.substring(titleLimit,index);
4749 /* Special Case - Dutch IJ Titlecasing */
4750 if ( isDutch && c == 0x0049 && appendStr.startsWith("j")) {
4751 appendStr = "J" + appendStr.substring(1);
4753 result.append(appendStr);
4757 } else if((nc=iter.nextCaseMapCP())>=0) {
4758 if ( isDutch && ( nc == 0x004A || nc == 0x006A ) && ( c == 0x0049 ) && ( FirstIJ == true )) {
4762 /* Normal operation: Lowercase the rest of the word. */
4763 c=gCsp.toFullLower(nc, iter, result, locale, locCache);
4774 return result.toString();
4778 * The given character is mapped to its case folding equivalent according
4779 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
4780 * folding equivalent, the character itself is returned.
4782 * <p>This function only returns the simple, single-code point case mapping.
4783 * Full case mappings should be used whenever possible because they produce
4784 * better results by working on whole strings.
4785 * They can map to a result string with a different length as appropriate.
4786 * Full case mappings are applied by the case mapping functions
4787 * that take String parameters rather than code points (int).
4788 * See also the User Guide chapter on C/POSIX migration:
4789 * http://www.icu-project.org/userguide/posix.html#case_mappings
4791 * @param ch the character to be converted
4792 * @param defaultmapping Indicates if all mappings defined in
4793 * CaseFolding.txt is to be used, otherwise the
4794 * mappings for dotted I and dotless i marked with
4795 * 'I' in CaseFolding.txt will be skipped.
4796 * @return the case folding equivalent of the character, if
4797 * any; otherwise the character itself.
4798 * @see #foldCase(String, boolean)
4801 public static int foldCase(int ch, boolean defaultmapping) {
4802 return foldCase(ch, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
4806 * The given string is mapped to its case folding equivalent according to
4807 * UnicodeData.txt and CaseFolding.txt; if any character has no case
4808 * folding equivalent, the character itself is returned.
4809 * "Full", multiple-code point case folding mappings are returned here.
4810 * For "simple" single-code point mappings use the API
4811 * foldCase(int ch, boolean defaultmapping).
4812 * @param str the String to be converted
4813 * @param defaultmapping Indicates if all mappings defined in
4814 * CaseFolding.txt is to be used, otherwise the
4815 * mappings for dotted I and dotless i marked with
4816 * 'I' in CaseFolding.txt will be skipped.
4817 * @return the case folding equivalent of the string; characters
4818 * without a case folding equivalent are kept as they are.
4819 * @see #foldCase(int, boolean)
4822 public static String foldCase(String str, boolean defaultmapping) {
4823 return foldCase(str, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
4827 * Option value for case folding: use default mappings defined in CaseFolding.txt.
4830 public static final int FOLD_CASE_DEFAULT = 0x0000;
4832 * Option value for case folding: exclude the mappings for dotted I
4833 * and dotless i marked with 'I' in CaseFolding.txt.
4836 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0x0001;
4839 * The given character is mapped to its case folding equivalent according
4840 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
4841 * folding equivalent, the character itself is returned.
4843 * <p>This function only returns the simple, single-code point case mapping.
4844 * Full case mappings should be used whenever possible because they produce
4845 * better results by working on whole strings.
4846 * They can map to a result string with a different length as appropriate.
4847 * Full case mappings are applied by the case mapping functions
4848 * that take String parameters rather than code points (int).
4849 * See also the User Guide chapter on C/POSIX migration:
4850 * http://www.icu-project.org/userguide/posix.html#case_mappings
4852 * @param ch the character to be converted
4853 * @param options A bit set for special processing. Currently the recognised options are
4854 * FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
4855 * @return the case folding equivalent of the character, if
4856 * any; otherwise the character itself.
4857 * @see #foldCase(String, boolean)
4860 public static int foldCase(int ch, int options) {
4861 return gCsp.fold(ch, options);
4865 * The given string is mapped to its case folding equivalent according to
4866 * UnicodeData.txt and CaseFolding.txt; if any character has no case
4867 * folding equivalent, the character itself is returned.
4868 * "Full", multiple-code point case folding mappings are returned here.
4869 * For "simple" single-code point mappings use the API
4870 * foldCase(int ch, boolean defaultmapping).
4871 * @param str the String to be converted
4872 * @param options A bit set for special processing. Currently the recognised options are
4873 * FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
4874 * @return the case folding equivalent of the string; characters
4875 * without a case folding equivalent are kept as they are.
4876 * @see #foldCase(int, boolean)
4879 public static final String foldCase(String str, int options) {
4880 StringBuffer result = new StringBuffer(str.length());
4883 length = str.length();
4884 for(i=0; i<length;) {
4885 c=UTF16.charAt(str, i);
4886 i+=UTF16.getCharCount(c);
4887 c=gCsp.toFullFolding(c, result, options);
4889 /* decode the result */
4891 /* (not) original code point */
4893 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
4894 /* mapping already appended to result */
4896 /* } else { append single-code point mapping */
4899 result.append((char)c);
4901 UTF16.append(result, c);
4904 return result.toString();
4908 * Return numeric value of Han code points.
4909 * <br> This returns the value of Han 'numeric' code points,
4910 * including those for zero, ten, hundred, thousand, ten thousand,
4911 * and hundred million.
4912 * This includes both the standard and 'checkwriting'
4913 * characters, the 'big circle' zero character, and the standard
4915 * @param ch code point to query
4916 * @return value if it is a Han 'numeric character,' otherwise return -1.
4919 public static int getHanNumericValue(int ch)
4921 // TODO: Are these all covered by Unicode numeric value data?
4924 case IDEOGRAPHIC_NUMBER_ZERO_ :
4925 case CJK_IDEOGRAPH_COMPLEX_ZERO_ :
4926 return 0; // Han Zero
4927 case CJK_IDEOGRAPH_FIRST_ :
4928 case CJK_IDEOGRAPH_COMPLEX_ONE_ :
4929 return 1; // Han One
4930 case CJK_IDEOGRAPH_SECOND_ :
4931 case CJK_IDEOGRAPH_COMPLEX_TWO_ :
4932 return 2; // Han Two
4933 case CJK_IDEOGRAPH_THIRD_ :
4934 case CJK_IDEOGRAPH_COMPLEX_THREE_ :
4935 return 3; // Han Three
4936 case CJK_IDEOGRAPH_FOURTH_ :
4937 case CJK_IDEOGRAPH_COMPLEX_FOUR_ :
4938 return 4; // Han Four
4939 case CJK_IDEOGRAPH_FIFTH_ :
4940 case CJK_IDEOGRAPH_COMPLEX_FIVE_ :
4941 return 5; // Han Five
4942 case CJK_IDEOGRAPH_SIXTH_ :
4943 case CJK_IDEOGRAPH_COMPLEX_SIX_ :
4944 return 6; // Han Six
4945 case CJK_IDEOGRAPH_SEVENTH_ :
4946 case CJK_IDEOGRAPH_COMPLEX_SEVEN_ :
4947 return 7; // Han Seven
4948 case CJK_IDEOGRAPH_EIGHTH_ :
4949 case CJK_IDEOGRAPH_COMPLEX_EIGHT_ :
4950 return 8; // Han Eight
4951 case CJK_IDEOGRAPH_NINETH_ :
4952 case CJK_IDEOGRAPH_COMPLEX_NINE_ :
4953 return 9; // Han Nine
4954 case CJK_IDEOGRAPH_TEN_ :
4955 case CJK_IDEOGRAPH_COMPLEX_TEN_ :
4957 case CJK_IDEOGRAPH_HUNDRED_ :
4958 case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ :
4960 case CJK_IDEOGRAPH_THOUSAND_ :
4961 case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ :
4963 case CJK_IDEOGRAPH_TEN_THOUSAND_ :
4965 case CJK_IDEOGRAPH_HUNDRED_MILLION_ :
4968 return -1; // no value
4972 * <p>Gets an iterator for character types, iterating over codepoints.</p>
4973 * Example of use:<br>
4975 * RangeValueIterator iterator = UCharacter.getTypeIterator();
4976 * RangeValueIterator.Element element = new RangeValueIterator.Element();
4977 * while (iterator.next(element)) {
4978 * System.out.println("Codepoint \\u" +
4979 * Integer.toHexString(element.start) +
4980 * " to codepoint \\u" +
4981 * Integer.toHexString(element.limit - 1) +
4982 * " has the character type " +
4986 * @return an iterator
4989 public static RangeValueIterator getTypeIterator()
4991 return new UCharacterTypeIterator(PROPERTY_);
4995 * <p>Gets an iterator for character names, iterating over codepoints.</p>
4996 * <p>This API only gets the iterator for the modern, most up-to-date
4997 * Unicode names. For older 1.0 Unicode names use getName1_0Iterator() or
4998 * for extended names use getExtendedNameIterator().</p>
4999 * Example of use:<br>
5001 * ValueIterator iterator = UCharacter.getNameIterator();
5002 * ValueIterator.Element element = new ValueIterator.Element();
5003 * while (iterator.next(element)) {
5004 * System.out.println("Codepoint \\u" +
5005 * Integer.toHexString(element.codepoint) +
5006 * " has the name " + (String)element.value);
5009 * <p>The maximal range which the name iterator iterates is from
5010 * UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
5011 * @return an iterator
5014 public static ValueIterator getNameIterator()
5017 throw new RuntimeException("Could not load unames.icu");
5019 return new UCharacterNameIterator(NAME_,
5020 UCharacterNameChoice.UNICODE_CHAR_NAME);
5024 * <p>Gets an iterator for character names, iterating over codepoints.</p>
5025 * <p>This API only gets the iterator for the older 1.0 Unicode names.
5026 * For modern, most up-to-date Unicode names use getNameIterator() or
5027 * for extended names use getExtendedNameIterator().</p>
5028 * Example of use:<br>
5030 * ValueIterator iterator = UCharacter.getName1_0Iterator();
5031 * ValueIterator.Element element = new ValueIterator.Element();
5032 * while (iterator.next(element)) {
5033 * System.out.println("Codepoint \\u" +
5034 * Integer.toHexString(element.codepoint) +
5035 * " has the name " + (String)element.value);
5038 * <p>The maximal range which the name iterator iterates is from UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
5039 * @return an iterator
5042 public static ValueIterator getName1_0Iterator()
5045 throw new RuntimeException("Could not load unames.icu");
5047 return new UCharacterNameIterator(NAME_,
5048 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
5052 * <p>Gets an iterator for character names, iterating over codepoints.</p>
5053 * <p>This API only gets the iterator for the extended names.
5054 * For modern, most up-to-date Unicode names use getNameIterator() or
5055 * for older 1.0 Unicode names use getName1_0Iterator().</p>
5056 * Example of use:<br>
5058 * ValueIterator iterator = UCharacter.getExtendedNameIterator();
5059 * ValueIterator.Element element = new ValueIterator.Element();
5060 * while (iterator.next(element)) {
5061 * System.out.println("Codepoint \\u" +
5062 * Integer.toHexString(element.codepoint) +
5063 * " has the name " + (String)element.value);
5066 * <p>The maximal range which the name iterator iterates is from UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
5067 * @return an iterator
5070 public static ValueIterator getExtendedNameIterator()
5073 throw new MissingResourceException("Could not load unames.icu","","");
5075 return new UCharacterNameIterator(NAME_,
5076 UCharacterNameChoice.EXTENDED_CHAR_NAME);
5080 * <p>Get the "age" of the code point.</p>
5081 * <p>The "age" is the Unicode version when the code point was first
5082 * designated (as a non-character or for Private Use) or assigned a
5084 * <p>This can be useful to avoid emitting code points to receiving
5085 * processes that do not accept newer characters.</p>
5086 * <p>The data is from the UCD file DerivedAge.txt.</p>
5087 * @param ch The code point.
5088 * @return the Unicode version number
5091 public static VersionInfo getAge(int ch)
5093 if (ch < MIN_VALUE || ch > MAX_VALUE) {
5094 throw new IllegalArgumentException("Codepoint out of bounds");
5096 return PROPERTY_.getAge(ch);
5100 * <p>Check a binary Unicode property for a code point.</p>
5101 * <p>Unicode, especially in version 3.2, defines many more properties
5102 * than the original set in UnicodeData.txt.</p>
5103 * <p>This API is intended to reflect Unicode properties as defined in
5104 * the Unicode Character Database (UCD) and Unicode Technical Reports
5106 * <p>For details about the properties see
5107 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
5108 * <p>For names of Unicode properties see the UCD file
5109 * PropertyAliases.txt.</p>
5110 * <p>This API throws an IllegalArgumentException if the codepoint is outside the range UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
5111 * <p>Important: If ICU is built with UCD files from Unicode versions
5112 * below 3.2, then properties marked with "new" are not or
5113 * not fully available.</p>
5114 * @param ch code point to test.
5115 * @param property selector constant from com.ibm.icu.lang.UProperty,
5116 * identifies which binary property to check.
5117 * @return true or false according to the binary Unicode property value
5118 * for ch. Also false if property is out of bounds or if the
5119 * Unicode version does not have data for the property at all, or
5120 * not for this code point.
5121 * @see com.ibm.icu.lang.UProperty
5124 public static boolean hasBinaryProperty(int ch, int property)
5126 if (ch < MIN_VALUE || ch > MAX_VALUE) {
5127 throw new IllegalArgumentException("Codepoint out of bounds");
5129 return PROPERTY_.hasBinaryProperty(ch, property);
5133 * <p>Check if a code point has the Alphabetic Unicode property.</p>
5134 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).</p>
5135 * <p>Different from UCharacter.isLetter(ch)!</p>
5137 * @param ch codepoint to be tested
5139 public static boolean isUAlphabetic(int ch)
5141 return hasBinaryProperty(ch, UProperty.ALPHABETIC);
5145 * <p>Check if a code point has the Lowercase Unicode property.</p>
5146 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE).</p>
5147 * <p>This is different from UCharacter.isLowerCase(ch)!</p>
5148 * @param ch codepoint to be tested
5151 public static boolean isULowercase(int ch)
5153 return hasBinaryProperty(ch, UProperty.LOWERCASE);
5157 * <p>Check if a code point has the Uppercase Unicode property.</p>
5158 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE).</p>
5159 * <p>This is different from UCharacter.isUpperCase(ch)!</p>
5160 * @param ch codepoint to be tested
5163 public static boolean isUUppercase(int ch)
5165 return hasBinaryProperty(ch, UProperty.UPPERCASE);
5169 * <p>Check if a code point has the White_Space Unicode property.</p>
5170 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE).</p>
5171 * <p>This is different from both UCharacter.isSpace(ch) and
5172 * UCharacter.isWhitespace(ch)!</p>
5173 * @param ch codepoint to be tested
5176 public static boolean isUWhiteSpace(int ch)
5178 return hasBinaryProperty(ch, UProperty.WHITE_SPACE);
5183 * <p>Gets the property value for an Unicode property type of a code point.
5184 * Also returns binary and mask property values.</p>
5185 * <p>Unicode, especially in version 3.2, defines many more properties than
5186 * the original set in UnicodeData.txt.</p>
5187 * <p>The properties APIs are intended to reflect Unicode properties as
5188 * defined in the Unicode Character Database (UCD) and Unicode Technical
5189 * Reports (UTR). For details about the properties see
5190 * http://www.unicode.org/.</p>
5191 * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
5195 * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
5196 * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
5197 * boolean b = (ideo == 1) ? true : false;
5199 * @param ch code point to test.
5200 * @param type UProperty selector constant, identifies which binary
5201 * property to check. Must be
5202 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5203 * UProperty.INT_START <= type < UProperty.INT_LIMIT or
5204 * UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
5205 * @return numeric value that is directly the property value or,
5206 * for enumerated properties, corresponds to the numeric value of
5207 * the enumerated constant of the respective property value
5208 * enumeration type (cast to enum type if necessary).
5209 * Returns 0 or 1 (for false / true) for binary Unicode properties.
5210 * Returns a bit-mask for mask properties.
5211 * Returns 0 if 'type' is out of bounds or if the Unicode version
5212 * does not have data for the property at all, or not for this code
5215 * @see #hasBinaryProperty
5216 * @see #getIntPropertyMinValue
5217 * @see #getIntPropertyMaxValue
5218 * @see #getUnicodeVersion
// Dispatches on the range that the selector 'type' falls into: selectors
// below BINARY_START or between BINARY_LIMIT and INT_START yield 0, binary
// selectors yield 0/1, enumerated (INT) selectors are resolved by the cases
// below, and GENERAL_CATEGORY_MASK yields a bit mask.
5221 public static int getIntPropertyValue(int ch, int type)
5223 if (type < UProperty.BINARY_START) {
5224 return 0; // undefined
5226 else if (type < UProperty.BINARY_LIMIT) {
// binary property: map the boolean answer onto 1 / 0
5227 return hasBinaryProperty(ch, type) ? 1 : 0;
5229 else if (type < UProperty.INT_START) {
5230 return 0; // undefined
5232 else if (type < UProperty.INT_LIMIT) {
// enumerated property: one case per selector
5235 case UProperty.BIDI_CLASS:
5236 return getDirection(ch);
5237 case UProperty.BLOCK:
5238 return UnicodeBlock.idOf(ch);
5239 case UProperty.CANONICAL_COMBINING_CLASS:
5240 return getCombiningClass(ch);
5241 case UProperty.DECOMPOSITION_TYPE:
// value is read from additional-properties word 2, unshifted
5242 return PROPERTY_.getAdditional(ch, 2)
5243 & DECOMPOSITION_TYPE_MASK_;
5244 case UProperty.EAST_ASIAN_WIDTH:
// value is read from additional-properties word 0, masked then shifted
5245 return (PROPERTY_.getAdditional(ch, 0)
5246 & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
// NOTE(review): the statement returned for GENERAL_CATEGORY is not visible
// in this view.
5247 case UProperty.GENERAL_CATEGORY:
5249 case UProperty.JOINING_GROUP:
5250 return gBdp.getJoiningGroup(ch);
5251 case UProperty.JOINING_TYPE:
5252 return gBdp.getJoiningType(ch);
5253 case UProperty.LINE_BREAK:
5254 return (int)(PROPERTY_.getAdditional(ch, LB_VWORD)& LB_MASK)>>LB_SHIFT;
5255 case UProperty.NUMERIC_TYPE:
// the 'type' parameter is reused here as scratch storage for the result
5256 type=getNumericType(PROPERTY_.getProperty(ch));
5257 if(type>NumericType.NUMERIC) {
5258 /* keep internal variants of NumericType.NUMERIC from becoming visible */
5259 type=NumericType.NUMERIC;
5262 case UProperty.SCRIPT:
5263 return UScript.getScript(ch);
5264 case UProperty.HANGUL_SYLLABLE_TYPE:
5265 /* purely algorithmic; hardcode known characters, check for assigned new ones */
5266 if(ch<NormalizerImpl.JAMO_L_BASE) {
// code points from JAMO_L_BASE up to U+11FF are classified by sub-range
5268 } else if(ch<=0x11ff) {
5271 /* Jamo L range, HANGUL CHOSEONG ... */
5272 if(ch==0x115f || ch<=0x1159 || getType(ch)==UCharacterCategory.OTHER_LETTER) {
5273 return HangulSyllableType.LEADING_JAMO;
5275 } else if(ch<=0x11a7) {
5276 /* Jamo V range, HANGUL JUNGSEONG ... */
5277 if(ch<=0x11a2 || getType(ch)==UCharacterCategory.OTHER_LETTER) {
5278 return HangulSyllableType.VOWEL_JAMO;
5282 if(ch<=0x11f9 || getType(ch)==UCharacterCategory.OTHER_LETTER) {
5283 return HangulSyllableType.TRAILING_JAMO;
// from here on ch is rebased: it holds the offset from HANGUL_BASE
5286 } else if((ch-=NormalizerImpl.HANGUL_BASE)<0) {
5288 } else if(ch<NormalizerImpl.HANGUL_COUNT) {
5289 /* Hangul syllable */
// an offset that is a multiple of JAMO_T_COUNT is reported as LV, any other as LVT
5290 return ch%NormalizerImpl.JAMO_T_COUNT==0 ? HangulSyllableType.LV_SYLLABLE : HangulSyllableType.LVT_SYLLABLE;
5294 case UProperty.NFD_QUICK_CHECK:
5295 case UProperty.NFKD_QUICK_CHECK:
5296 case UProperty.NFC_QUICK_CHECK:
5297 case UProperty.NFKC_QUICK_CHECK:
// the selector's distance from NFD_QUICK_CHECK, plus 2, is passed as the mode argument
5298 return NormalizerImpl.quickCheck(ch, (type-UProperty.NFD_QUICK_CHECK)+2); // 2=UNORM_NFD
5299 case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
// upper bits of the FCD16 value
5300 return NormalizerImpl.getFCD16(ch)>>8;
5301 case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
// low byte of the FCD16 value
5302 return NormalizerImpl.getFCD16(ch)&0xff;
5303 case UProperty.GRAPHEME_CLUSTER_BREAK:
5304 return (int)(PROPERTY_.getAdditional(ch, 2)& GCB_MASK)>>GCB_SHIFT;
5305 case UProperty.SENTENCE_BREAK:
5306 return (int)(PROPERTY_.getAdditional(ch, 2)& SB_MASK)>>SB_SHIFT;
5307 case UProperty.WORD_BREAK:
5308 return (int)(PROPERTY_.getAdditional(ch, 2)& WB_MASK)>>WB_SHIFT;
5311 return 0; /* undefined */
5313 } else if (type == UProperty.GENERAL_CATEGORY_MASK) {
// mask property: convert the general category into its bit mask
5314 return UCharacterProperty.getMask(getType(ch));
5316 return 0; // undefined
5319 * Returns a string version of the property value.
5320 * @param propertyEnum
5323 * @return value as string
5325 * @deprecated This API is ICU internal only.
5327 public static String getStringPropertyValue(int propertyEnum, int codepoint, int nameChoice) {
5328 // TODO some of these are less efficient, since a string is forced!
5329 if ((propertyEnum >= UProperty.BINARY_START && propertyEnum < UProperty.BINARY_LIMIT) ||
5330 (propertyEnum >= UProperty.INT_START && propertyEnum < UProperty.INT_LIMIT)) {
5331 return getPropertyValueName(propertyEnum, getIntPropertyValue(codepoint, propertyEnum), nameChoice);
5333 if (propertyEnum == UProperty.NUMERIC_VALUE) {
5334 return String.valueOf(getUnicodeNumericValue(codepoint));
5336 // otherwise must be string property
5337 switch (propertyEnum) {
5338 case UProperty.AGE: return getAge(codepoint).toString();
5339 case UProperty.ISO_COMMENT: return getISOComment(codepoint);
5340 case UProperty.BIDI_MIRRORING_GLYPH: return UTF16.valueOf(getMirror(codepoint));
5341 case UProperty.CASE_FOLDING: return foldCase(UTF16.valueOf(codepoint), true);
5342 case UProperty.LOWERCASE_MAPPING: return toLowerCase(UTF16.valueOf(codepoint));
5343 case UProperty.NAME: return getName(codepoint);
5344 case UProperty.SIMPLE_CASE_FOLDING: return UTF16.valueOf(foldCase(codepoint,true));
5345 case UProperty.SIMPLE_LOWERCASE_MAPPING: return UTF16.valueOf(toLowerCase(codepoint));
5346 case UProperty.SIMPLE_TITLECASE_MAPPING: return UTF16.valueOf(toTitleCase(codepoint));
5347 case UProperty.SIMPLE_UPPERCASE_MAPPING: return UTF16.valueOf(toUpperCase(codepoint));
5348 case UProperty.TITLECASE_MAPPING: return toTitleCase(UTF16.valueOf(codepoint),null);
5349 case UProperty.UNICODE_1_NAME: return getName1_0(codepoint);
5350 case UProperty.UPPERCASE_MAPPING: return toUpperCase(UTF16.valueOf(codepoint));
5352 throw new IllegalArgumentException("Illegal Property Enum");
5356 * Get the minimum value for an integer/binary Unicode property type.
5357 * Can be used together with UCharacter.getIntPropertyMaxValue(int)
5358 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
5359 * @param type UProperty selector constant, identifies which binary
5360 * property to check. Must be
5361 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5362 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
5363 * @return Minimum value returned by UCharacter.getIntPropertyValue(int)
5364 * for a Unicode property. 0 if the property
5365 * selector 'type' is out of range.
5367 * @see #hasBinaryProperty
5368 * @see #getUnicodeVersion
5369 * @see #getIntPropertyMaxValue
5370 * @see #getIntPropertyValue
5373 public static int getIntPropertyMinValue(int type)
5376 return 0; // undefined; and: all other properties have a minimum value
5382 * Get the maximum value for an integer/binary Unicode property.
5383 * Can be used together with UCharacter.getIntPropertyMinValue(int)
5384 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
5385 * Examples for min/max values (for Unicode 3.2):
5387 * <li> UProperty.BIDI_CLASS: 0/18 (UCharacterDirection.LEFT_TO_RIGHT/UCharacterDirection.BOUNDARY_NEUTRAL)
5388 * <li> UProperty.SCRIPT: 0/45 (UScript.COMMON/UScript.TAGBANWA)
5389 * <li> UProperty.IDEOGRAPHIC: 0/1 (false/true)
5391 * For undefined UProperty constant values, min/max values will be 0/-1.
5392 * @param type UProperty selector constant, identifies which binary
5393 * property to check. Must be
5394 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
5395 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
5396 * @return Maximum value returned by UCharacter.getIntPropertyValue(int, int)
5397 * for a Unicode property. <= 0 if the property selector 'type' is out of range.
5399 * @see #hasBinaryProperty
5400 * @see #getUnicodeVersion
5401 * @see #getIntPropertyMinValue
5402 * @see #getIntPropertyValue
// Same range dispatch as getIntPropertyValue, but the answer is the largest
// value the property can take; selectors outside the binary and enumerated
// ranges report -1.
5405 public static int getIntPropertyMaxValue(int type)
5407 if (type < UProperty.BINARY_START) {
5408 return -1; // undefined
5410 else if (type < UProperty.BINARY_LIMIT) {
5411 return 1; // maximum TRUE for all binary properties
5413 else if (type < UProperty.INT_START) {
5414 return -1; // undefined
5416 else if (type < UProperty.INT_LIMIT) {
// enumerated property: one case (or case group) per selector
5418 case UProperty.BIDI_CLASS:
5419 case UProperty.JOINING_GROUP:
5420 case UProperty.JOINING_TYPE:
// the bidi/shaping properties object is asked for these maxima
5421 return gBdp.getMaxValue(type);
5422 case UProperty.BLOCK:
// read from max-values word 0, masked then shifted
5423 return (PROPERTY_.getMaxValues(0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
5424 case UProperty.CANONICAL_COMBINING_CLASS:
5425 case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
5426 case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
5427 return 0xff; // TODO do we need to be more precise,
5428 // getting the actual maximum?
5429 case UProperty.DECOMPOSITION_TYPE:
5430 return PROPERTY_.getMaxValues(2) & DECOMPOSITION_TYPE_MASK_;
5431 case UProperty.EAST_ASIAN_WIDTH:
5432 return (PROPERTY_.getMaxValues(0) & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
5433 case UProperty.GENERAL_CATEGORY:
5434 return UCharacterCategory.CHAR_CATEGORY_COUNT - 1;
5435 case UProperty.LINE_BREAK:
5436 return (PROPERTY_.getMaxValues(LB_VWORD) & LB_MASK) >> LB_SHIFT;
5437 case UProperty.NUMERIC_TYPE:
5438 return NumericType.COUNT - 1;
5439 case UProperty.SCRIPT:
5440 return PROPERTY_.getMaxValues(0) & SCRIPT_MASK_;
5441 case UProperty.HANGUL_SYLLABLE_TYPE:
5442 return HangulSyllableType.COUNT-1;
5443 case UProperty.NFD_QUICK_CHECK:
5444 case UProperty.NFKD_QUICK_CHECK:
5445 return 1; // YES -- these are never "maybe", only "no" or "yes"
// NOTE(review): the value returned for the NFC/NFKC quick checks is not
// visible in this view -- presumably it is larger than 1 so that "maybe"
// is covered; confirm.
5446 case UProperty.NFC_QUICK_CHECK:
5447 case UProperty.NFKC_QUICK_CHECK:
5449 case UProperty.GRAPHEME_CLUSTER_BREAK:
5450 return (PROPERTY_.getMaxValues(2) & GCB_MASK) >> GCB_SHIFT;
5451 case UProperty.SENTENCE_BREAK:
5452 return (PROPERTY_.getMaxValues(2) & SB_MASK) >> SB_SHIFT;
5453 case UProperty.WORD_BREAK:
5454 return (PROPERTY_.getMaxValues(2) & WB_MASK) >> WB_SHIFT;
5456 return -1; // undefined
5460 return -1; // undefined
5464 * Provide the java.lang.Character forDigit API, for convenience.
5467 public static char forDigit(int digit, int radix) {
5468 return java.lang.Character.forDigit(digit, radix);
5471 // JDK 1.5 API coverage
5474 * Cover the JDK 1.5 API, for convenience.
5475 * @see UTF16#LEAD_SURROGATE_MIN_VALUE
5478 public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;
5481 * Cover the JDK 1.5 API, for convenience.
5482 * @see UTF16#LEAD_SURROGATE_MAX_VALUE
5485 public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;
5488 * Cover the JDK 1.5 API, for convenience.
5489 * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
5492 public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;
5495 * Cover the JDK 1.5 API, for convenience.
5496 * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
5499 public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;
5502 * Cover the JDK 1.5 API, for convenience.
5503 * @see UTF16#SURROGATE_MIN_VALUE
5506 public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;
5509 * Cover the JDK 1.5 API, for convenience.
5510 * @see UTF16#SURROGATE_MAX_VALUE
5513 public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;
5516 * Cover the JDK 1.5 API, for convenience.
5517 * @see UTF16#SUPPLEMENTARY_MIN_VALUE
5520 public static final int MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;
5523 * Cover the JDK 1.5 API, for convenience.
5524 * @see UTF16#CODEPOINT_MAX_VALUE
5527 public static final int MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;
5530 * Cover the JDK 1.5 API, for convenience.
5531 * @see UTF16#CODEPOINT_MIN_VALUE
5534 public static final int MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
5537 * Cover the JDK 1.5 API, for convenience.
5538 * @param cp the code point to check
5539 * @return true if cp is a valid code point
5542 public static final boolean isValidCodePoint(int cp) {
5543 return cp >= 0 && cp <= MAX_CODE_POINT;
5547 * Cover the JDK 1.5 API, for convenience.
5548 * @param cp the code point to check
5549 * @return true if cp is a supplementary code point
5552 public static final boolean isSupplementaryCodePoint(int cp) {
5553 return cp >= UTF16.SUPPLEMENTARY_MIN_VALUE
5554 && cp <= UTF16.CODEPOINT_MAX_VALUE;
5558 * Cover the JDK 1.5 API, for convenience.
5559 * @param ch the char to check
5560 * @return true if ch is a high (lead) surrogate
5563 public static boolean isHighSurrogate(char ch) {
5564 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
5568 * Cover the JDK 1.5 API, for convenience.
5569 * @param ch the char to check
5570 * @return true if ch is a low (trail) surrogate
5573 public static boolean isLowSurrogate(char ch) {
5574 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
5578 * Cover the JDK 1.5 API, for convenience. Return true if the chars
5579 * form a valid surrogate pair.
5580 * @param high the high (lead) char
5581 * @param low the low (trail) char
5582 * @return true if high, low form a surrogate pair
5585 public static final boolean isSurrogatePair(char high, char low) {
5586 return isHighSurrogate(high) && isLowSurrogate(low);
5590 * Cover the JDK 1.5 API, for convenience. Return the number of chars needed
5591 * to represent the code point. This does not check the
5592 * code point for validity.
5593 * @param cp the code point to check
5594 * @return the number of chars needed to represent the code point
5595 * @see UTF16#getCharCount
5598 public static int charCount(int cp) {
5599 return UTF16.getCharCount(cp);
5603 * Cover the JDK 1.5 API, for convenience. Return the code point represented by
5604 * the characters. This does not check the surrogate pair for validity.
5605 * @param high the high (lead) surrogate
5606 * @param low the low (trail) surrogate
5607 * @return the code point formed by the surrogate pair
5610 public static final int toCodePoint(char high, char low) {
5611 return UCharacterProperty.getRawSupplementary(high, low);
5615 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5616 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5617 * API. This examines only the characters at index and index+1.
5618 * @param seq the characters to check
5619 * @param index the index of the first or only char forming the code point
5620 * @return the code point at the index
5623 //#if defined(FOUNDATION10) || defined(J2SE13)
5624 //## public static final int codePointAt(String seq, int index) {
5625 //## char c1 = seq.charAt(index++);
5626 //## if (isHighSurrogate(c1)) {
5627 //## if (index < seq.length()) {
5628 //## char c2 = seq.charAt(index);
5629 //## if (isLowSurrogate(c2)) {
5630 //## return toCodePoint(c1, c2);
5636 //## public static final int codePointAt(StringBuffer seq, int index) {
5637 //## return codePointAt(seq.toString(), index);
5640 //#if defined(ECLIPSE_FRAGMENT)
5641 //## public static final int codePointAt(String seq, int index) {
5642 //## return codePointAt((CharSequence)seq, index);
5644 //## public static final int codePointAt(StringBuffer seq, int index) {
5645 //## return codePointAt((CharSequence)seq, index);
5648 public static final int codePointAt(CharSequence seq, int index) {
5649 char c1 = seq.charAt(index++);
5650 if (isHighSurrogate(c1)) {
5651 if (index < seq.length()) {
5652 char c2 = seq.charAt(index);
5653 if (isLowSurrogate(c2)) {
5654 return toCodePoint(c1, c2);
5663 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5664 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5665 * API. This examines only the characters at index and index+1.
5666 * @param text the characters to check
5667 * @param index the index of the first or only char forming the code point
5668 * @return the code point at the index
5671 public static final int codePointAt(char[] text, int index) {
5672 char c1 = text[index++];
5673 if (isHighSurrogate(c1)) {
5674 if (index < text.length) {
5675 char c2 = text[index];
5676 if (isLowSurrogate(c2)) {
5677 return toCodePoint(c1, c2);
5685 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
5686 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5687 * API. This examines only the characters at index and index+1.
5688 * @param text the characters to check
5689 * @param index the index of the first or only char forming the code point
5690 * @param limit the limit of the valid text
5691 * @return the code point at the index
5694 public static final int codePointAt(char[] text, int index, int limit) {
5695 if (index >= limit || limit > text.length) {
5696 throw new IndexOutOfBoundsException();
5698 char c1 = text[index++];
5699 if (isHighSurrogate(c1)) {
5700 if (index < limit) {
5701 char c2 = text[index];
5702 if (isLowSurrogate(c2)) {
5703 return toCodePoint(c1, c2);
5711 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5712 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5713 * API. This examines only the characters at index-1 and index-2.
5714 * @param seq the characters to check
5715 * @param index the index after the last or only char forming the code point
5716 * @return the code point before the index
5719 //#if defined(FOUNDATION10) || defined(J2SE13)
5720 //## public static final int codePointBefore(String seq, int index) {
5721 //## char c2 = seq.charAt(--index);
5722 //## if (isLowSurrogate(c2)) {
5723 //## if (index > 0) {
5724 //## char c1 = seq.charAt(--index);
5725 //## if (isHighSurrogate(c1)) {
5726 //## return toCodePoint(c1, c2);
5732 //## public static final int codePointBefore(StringBuffer seq, int index) {
5733 //## return codePointBefore(seq.toString(), index);
5736 //#if defined(ECLIPSE_FRAGMENT)
5737 //## public static final int codePointBefore(String seq, int index) {
5738 //## return codePointBefore((CharSequence)seq, index);
5740 //## public static final int codePointBefore(StringBuffer seq, int index) {
5741 //## return codePointBefore((CharSequence)seq, index);
5744 public static final int codePointBefore(CharSequence seq, int index) {
5745 char c2 = seq.charAt(--index);
5746 if (isLowSurrogate(c2)) {
5748 char c1 = seq.charAt(--index);
5749 if (isHighSurrogate(c1)) {
5750 return toCodePoint(c1, c2);
5759 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5760 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5761 * API. This examines only the characters at index-1 and index-2.
5762 * @param text the characters to check
5763 * @param index the index after the last or only char forming the code point
5764 * @return the code point before the index
5767 public static final int codePointBefore(char[] text, int index) {
5768 char c2 = text[--index];
5769 if (isLowSurrogate(c2)) {
5771 char c1 = text[--index];
5772 if (isHighSurrogate(c1)) {
5773 return toCodePoint(c1, c2);
5781 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
5782 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
5783 * API. This examines only the characters at index-1 and index-2.
5784 * @param text the characters to check
5785 * @param index the index after the last or only char forming the code point
5786 * @param limit the start of the valid text
5787 * @return the code point before the index
5790 public static final int codePointBefore(char[] text, int index, int limit) {
5791 if (index <= limit || limit < 0) {
5792 throw new IndexOutOfBoundsException();
5794 char c2 = text[--index];
5795 if (isLowSurrogate(c2)) {
5796 if (index > limit) {
5797 char c1 = text[--index];
5798 if (isHighSurrogate(c1)) {
5799 return toCodePoint(c1, c2);
5807 * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
5808 * code point into the destination at the given index.
5809 * @param cp the code point to convert
5810 * @param dst the destination array into which to put the char(s) representing the code point
5811 * @param dstIndex the index at which to put the first (or only) char
5812 * @return the count of the number of chars written (1 or 2)
5813 * @throws IllegalArgumentException if cp is not a valid code point
5816 public static final int toChars(int cp, char[] dst, int dstIndex) {
5818 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
5819 dst[dstIndex] = (char)cp;
5822 if (cp <= MAX_CODE_POINT) {
5823 dst[dstIndex] = UTF16.getLeadSurrogate(cp);
5824 dst[dstIndex+1] = UTF16.getTrailSurrogate(cp);
5828 throw new IllegalArgumentException();
5832 * Cover the JDK 1.5 API, for convenience. Returns a char array
5833 * representing the code point.
5834 * @param cp the code point to convert
5835 * @return an array containing the char(s) representing the code point
5836 * @throws IllegalArgumentException if cp is not a valid code point
5839 public static final char[] toChars(int cp) {
5841 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
5842 return new char[] { (char)cp };
5844 if (cp <= MAX_CODE_POINT) {
5846 UTF16.getLeadSurrogate(cp),
5847 UTF16.getTrailSurrogate(cp)
5851 throw new IllegalArgumentException();
5855 * Cover the JDK API, for convenience. Return a byte representing the directionality of
5857 * <br/><b>Note</b>: Unlike the JDK, this returns DIRECTIONALITY_LEFT_TO_RIGHT for undefined or
5858 * out-of-bounds characters. <br/><b>Note</b>: The return value must be
5859 * tested using the constants defined in {@link UCharacterEnums.ECharacterDirection}
5860 * since the values are different from the ones defined by <code>java.lang.Character</code>.
5861 * @param cp the code point to check
5862 * @return the directionality of the code point
5863 * @see #getDirection
5866 public static byte getDirectionality(int cp)
5868 return (byte)getDirection(cp);
5872 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
5873 * @param text the characters to check
5874 * @param start the start of the range
5875 * @param limit the limit of the range
5876 * @return the number of code points in the range
5879 //#if defined(FOUNDATION10) || defined(J2SE13)
5880 //## public static int codePointCount(String text, int start, int limit) {
5881 //## if (start < 0 || limit < start || limit > text.length()) {
5882 //## throw new IndexOutOfBoundsException("start (" + start +
5883 //## ") or limit (" + limit +
5884 //## ") invalid or out of range 0, " + text.length());
5887 //## int len = limit - start;
5888 //## while (limit > start) {
5889 //## char ch = text.charAt(--limit);
5890 //## while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
5891 //## ch = text.charAt(--limit);
5892 //## if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
5900 //## public static int codePointCount(StringBuffer text, int start, int limit) {
5901 //## return codePointCount(text.toString(), start, limit);
5904 //#if defined(ECLIPSE_FRAGMENT)
5905 //## public static int codePointCount(String text, int start, int limit) {
5906 //## return codePointCount((CharSequence)text, start, limit);
5908 //## public static int codePointCount(StringBuffer text, int start, int limit) {
5909 //## return codePointCount((CharSequence)text, start, limit);
5912 public static int codePointCount(CharSequence text, int start, int limit) {
5913 if (start < 0 || limit < start || limit > text.length()) {
5914 throw new IndexOutOfBoundsException("start (" + start +
5915 ") or limit (" + limit +
5916 ") invalid or out of range 0, " + text.length());
5919 int len = limit - start;
5920 while (limit > start) {
5921 char ch = text.charAt(--limit);
5922 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
5923 ch = text.charAt(--limit);
5924 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
5935 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
5936 * @param text the characters to check
5937 * @param start the start of the range
5938 * @param limit the limit of the range
5939 * @return the number of code points in the range
5942 public static int codePointCount(char[] text, int start, int limit) {
5943 if (start < 0 || limit < start || limit > text.length) {
5944 throw new IndexOutOfBoundsException("start (" + start +
5945 ") or limit (" + limit +
5946 ") invalid or out of range 0, " + text.length);
5949 int len = limit - start;
5950 while (limit > start) {
5951 char ch = text[--limit];
5952 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
5954 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
5964 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
5965 * @param text the characters to check
5966 * @param index the index to adjust
5967 * @param codePointOffset the number of code points by which to offset the index
5968 * @return the adjusted index
5971 //#if defined(FOUNDATION10) || defined(J2SE13)
5972 //## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
5973 //## if (index < 0 || index > text.length()) {
5974 //## throw new IndexOutOfBoundsException("index ( " + index +
5975 //## ") out of range 0, " + text.length());
5978 //## if (codePointOffset < 0) {
5979 //## while (++codePointOffset <= 0) {
5980 //## char ch = text.charAt(--index);
5981 //## while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
5982 //## ch = text.charAt(--index);
5983 //## if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
5984 //## if (++codePointOffset > 0) {
5985 //## return index+1;
5991 //## int limit = text.length();
5992 //## while (--codePointOffset >= 0) {
5993 //## char ch = text.charAt(index++);
5994 //## while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
5995 //## ch = text.charAt(index++);
5996 //## if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
5997 //## if (--codePointOffset < 0) {
5998 //## return index-1;
6007 //## public static int offsetByCodePoints(StringBuffer text, int index, int codePointOffset) {
6008 //## return offsetByCodePoints(text.toString(), index, codePointOffset);
6011 //#if defined(ECLIPSE_FRAGMENT)
6012 //## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
6013 //## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
6015 //## public static int offsetByCodePoints(StringBuffer text, int index, int codePointOffset) {
6016 //## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
6019 public static int offsetByCodePoints(CharSequence text, int index, int codePointOffset) {
6020 if (index < 0 || index > text.length()) {
6021 throw new IndexOutOfBoundsException("index ( " + index +
6022 ") out of range 0, " + text.length());
6025 if (codePointOffset < 0) {
6026 while (++codePointOffset <= 0) {
6027 char ch = text.charAt(--index);
6028 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
6029 ch = text.charAt(--index);
6030 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
6031 if (++codePointOffset > 0) {
6038 int limit = text.length();
6039 while (--codePointOffset >= 0) {
6040 char ch = text.charAt(index++);
6041 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
6042 ch = text.charAt(index++);
6043 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
6044 if (--codePointOffset < 0) {
6057 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
6058 * @param text the characters to check
6059 * @param start the start of the range to check
6060 * @param count the length of the range to check
6061 * @param index the index to adjust
6062 * @param codePointOffset the number of code points by which to offset the index
6063 * @return the adjusted index
6066 public static int offsetByCodePoints(char[] text, int start, int count, int index, int codePointOffset) {
6067 int limit = start + count;
6068 if (start < 0 || limit < start || limit > text.length || index < start || index > limit) {
6069 throw new IndexOutOfBoundsException("index ( " + index +
6070 ") out of range " + start +
6072 " in array 0, " + text.length);
6075 if (codePointOffset < 0) {
6076 while (++codePointOffset <= 0) {
6077 char ch = text[--index];
6078 if (index < start) {
6079 throw new IndexOutOfBoundsException("index ( " + index +
6080 ") < start (" + start +
6083 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > start) {
6085 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
6086 if (++codePointOffset > 0) {
6093 while (--codePointOffset >= 0) {
6094 char ch = text[index++];
6095 if (index > limit) {
6096 throw new IndexOutOfBoundsException("index ( " + index +
6097 ") > limit (" + limit +
6100 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
6102 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
6103 if (--codePointOffset < 0) {
6114 // protected data members --------------------------------------------
6117 * Database storing the sets of character name
6119 static UCharacterName NAME_ = null;
6122 * Singleton object encapsulating the imported pnames.icu property aliases
6124 static UPropertyAliases PNAMES_ = null;
6126 // block to initialise name database and unicode 1.0 data
6129 PNAMES_ = new UPropertyAliases();
6130 NAME_ = UCharacterName.getInstance();
6131 } catch (IOException e) {
6132 // e.printStackTrace();
6133 throw new MissingResourceException(e.getMessage(),"","");
6134 //throw new RuntimeException(e.getMessage());
6135 // DONOT throw an exception
6136 // we might be building ICU modularly wothout names.icu and
// private variables -------------------------------------------------

/**
 * Database storing the sets of character property
 */
private static final UCharacterProperty PROPERTY_;

/**
 * Trie index array, trie data array and trie initial value copied out of
 * PROPERTY_ once during class initialisation.
 */
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int PROPERTY_INITIAL_VALUE_;

/**
 * Case mapping properties: the UCaseProps singleton, or its dummy if loading failed.
 */
private static final UCaseProps gCsp;
/**
 * Bidi/shaping properties: the UBiDiProps singleton, or its dummy if loading failed.
 */
private static final UBiDiProps gBdp;

// block to initialise character property database
static {
    try {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
    } catch (Exception e) {
        // MissingResourceException has no constructor taking a cause, so the
        // original exception is attached with initCause to keep its stack trace.
        MissingResourceException missing =
            new MissingResourceException(e.getMessage(), "", "");
        missing.initCause(e);
        throw missing;
    }

    /*
     * In ICU4J 3.2, most Unicode properties were loaded from uprops.icu.
     * ICU4J 3.4 adds ucase.icu for case mapping properties and
     * ubidi.icu for bidi/shaping properties and
     * removes case/bidi/shaping properties from uprops.icu.
     *
     * Loading of uprops.icu was always done during class loading of UCharacter.class.
     * In order to maintain performance for all such properties,
     * ucase.icu and ubidi.icu are also loaded during class loading of UCharacter.class.
     * It will not fail if they are missing.
     * These data items are loaded early to avoid having to synchronize access to them,
     * for thread safety and performance.
     *
     * We try to load these data items at most once.
     * If it works, we use the resulting singleton object.
     * If it fails, then we get a dummy object, which always works unless
     * we are seriously out of memory.
     * After UCharacter.class loading, we have a never-changing reference to either the
     * real singleton or the dummy.
     *
     * These references are used in Unicode properties APIs that
     * do not have a service object and also do not have an error code parameter.
     * Other API implementations get the singleton themselves
     * (synchronized), store it in the service object, and report errors.
     */
    UCaseProps caseProps;
    try {
        caseProps = UCaseProps.getSingleton();
    } catch (IOException e) {
        // Deliberate best effort: fall back to the dummy instead of failing.
        caseProps = UCaseProps.getDummy();
    }
    gCsp = caseProps;

    UBiDiProps bidiProps;
    try {
        bidiProps = UBiDiProps.getSingleton();
    } catch (IOException e) {
        // Deliberate best effort: fall back to the dummy instead of failing.
        bidiProps = UBiDiProps.getDummy();
    }
    gBdp = bidiProps;
}
6215 * To get the last character out from a data type
6217 private static final int LAST_CHAR_MASK_ = 0xFFFF;
6220 // * To get the last byte out from a data type
6222 // private static final int LAST_BYTE_MASK_ = 0xFF;
6227 // private static final int SHIFT_16_ = 16;
6232 // private static final int SHIFT_24_ = 24;
6237 // private static final int DECIMAL_RADIX_ = 10;
6240 * No break space code point
6242 private static final int NO_BREAK_SPACE_ = 0xA0;
6245 * Figure space code point
6247 private static final int FIGURE_SPACE_ = 0x2007;
6250 * Narrow no break space code point
6252 private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;
6255 * Ideographic number zero code point
6257 private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
6260 * CJK Ideograph, First code point
6262 private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
6265 * CJK Ideograph, Second code point
6267 private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
6270 * CJK Ideograph, Third code point
6272 private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
6275 * CJK Ideograph, Fourth code point
6277 private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8;
6280 * CJK Ideograph, Fifth code point
6282 private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
6285 * CJK Ideograph, Sixth code point
6287 private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
6290 * CJK Ideograph, Seventh code point
6292 private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
6295 * CJK Ideograph, Eighth code point
6297 private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
6300 * CJK Ideograph, Ninth code point
6302 private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
6305 * Application Program command code point
6307 private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;
6310 * Unit separator code point
6312 private static final int UNIT_SEPARATOR_ = 0x001F;
6317 private static final int DELETE_ = 0x007F;
6319 * ISO control character first range upper limit 0x0 - 0x1F
6321 //private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F;
6323 * Shift to get numeric type
6325 private static final int NUMERIC_TYPE_SHIFT_ = 5;
6327 * Mask to get numeric type
6329 private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
6331 /* encoding of fractional and large numbers */
6332 //private static final int MAX_SMALL_NUMBER=0xff;
6334 private static final int FRACTION_NUM_SHIFT=3; /* numerator: bits 7..3 */
6335 private static final int FRACTION_DEN_MASK=7; /* denominator: bits 2..0 */
6337 //private static final int FRACTION_MAX_NUM=31;
6338 private static final int FRACTION_DEN_OFFSET=2; /* denominator values are 2..9 */
6340 //private static final int FRACTION_MIN_DEN=FRACTION_DEN_OFFSET;
6341 //private static final int FRACTION_MAX_DEN=FRACTION_MIN_DEN+FRACTION_DEN_MASK;
6343 private static final int LARGE_MANT_SHIFT=4; /* mantissa: bits 7..4 */
6344 private static final int LARGE_EXP_MASK=0xf; /* exponent: bits 3..0 */
6345 private static final int LARGE_EXP_OFFSET=2; /* regular exponents 2..17 */
6346 private static final int LARGE_EXP_OFFSET_EXTRA=18; /* extra large exponents 18..33 */
6348 //private static final int LARGE_MIN_EXP=LARGE_EXP_OFFSET;
6349 //private static final int LARGE_MAX_EXP=LARGE_MIN_EXP+LARGE_EXP_MASK;
6350 //private static final int LARGE_MAX_EXP_EXTRA=LARGE_EXP_OFFSET_EXTRA+LARGE_EXP_MASK;
6353 * Han digit characters
6355 private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
6356 private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
6357 private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
6358 private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
6359 private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
6360 private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
6361 private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
6362 private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
6363 private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
6364 private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
6365 private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
6366 private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
6367 private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
6368 private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
6369 private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
6370 private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
6371 private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x824c;
6372 private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
6375 // * Zero Width Non Joiner.
6376 // * Equivalent to icu4c ZWNJ.
6378 // private static final int ZERO_WIDTH_NON_JOINER_ = 0x200c;
6380 // * Zero Width Joiner
6381 // * Equivalent to icu4c ZWJ.
6383 // private static final int ZERO_WIDTH_JOINER_ = 0x200d;
6386 * Properties in vector word 2
6390 * 19..15 Sentence Break
6392 * 9.. 5 Grapheme Cluster Break
6393 * 4.. 0 Decomposition Type
6395 private static final int LB_MASK = 0x03f00000;
6396 private static final int LB_SHIFT = 20;
6397 private static final int LB_VWORD = 2;
6399 private static final int SB_MASK = 0x000f8000;
6400 private static final int SB_SHIFT = 15;
6402 private static final int WB_MASK = 0x00007c00;
6403 private static final int WB_SHIFT = 10;
6405 private static final int GCB_MASK = 0x000003e0;
6406 private static final int GCB_SHIFT = 5;
6409 * Integer properties mask for decomposition type.
6410 * Equivalent to icu4c UPROPS_DT_MASK.
6412 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
6415 * Properties in vector word 0
6417 * 31..24 DerivedAge version major/minor one nibble each
6419 * 19..17 East Asian Width
6425 * Integer properties mask and shift values for East Asian cell width.
6426 * Equivalent to icu4c UPROPS_EA_MASK
6428 private static final int EAST_ASIAN_MASK_ = 0x000e0000;
6430 * Integer properties mask and shift values for East Asian cell width.
6431 * Equivalent to icu4c UPROPS_EA_SHIFT
6433 private static final int EAST_ASIAN_SHIFT_ = 17;
6435 * Integer properties mask and shift values for blocks.
6436 * Equivalent to icu4c UPROPS_BLOCK_MASK
6438 private static final int BLOCK_MASK_ = 0x0001ff00;
6440 * Integer properties mask and shift values for blocks.
6441 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
6443 private static final int BLOCK_SHIFT_ = 8;
6445 * Integer properties mask for scripts.
6446 * Equivalent to icu4c UPROPS_SCRIPT_MASK
6448 private static final int SCRIPT_MASK_ = 0x000000ff;
6450 // private constructor -----------------------------------------------
/**
 * Private and empty: keeps code outside this class from creating instances.
 */
private UCharacter() {
}
6459 // private methods ---------------------------------------------------
6462 * Getting the digit values of characters like 'A' - 'Z', normal,
6463 * half-width and full-width. This method assumes that the other digit
6464 * characters are checked by the calling method.
6465 * @param ch character to test
6466 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
6467 * its corresponding digit will be returned.
6469 private static int getEuropeanDigit(int ch) {
6470 if ((ch > 0x7a && ch < 0xff21)
6471 || ch < 0x41 || (ch > 0x5a && ch < 0x61)
6472 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
6476 // ch >= 0x41 or ch < 0x61
6477 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
6481 return ch + 10 - 0xff21;
6483 // ch >= 0xff41 && ch <= 0xff5a
6484 return ch + 10 - 0xff41;
6488 * Gets the numeric type of the property argument
6489 * @param props 32 bit property
6490 * @return the numeric type
6492 private static int getNumericType(int props)
6494 return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
6498 * Gets the property value at the index.
6499 * This is optimized.
6500 * Note this is a little different from CharTrie: the index into m_trieData_
6501 * is never negative.
6502 * This is a duplicate of UCharacterProperty.getProperty. For optimization
6503 * purposes, this method calls the trie data directly instead of through
6504 * UCharacterProperty.getProperty.
6505 * @param ch code point whose property value is to be retrieved
6506 * @return property value of code point
6509 private static final int getProperty(int ch)
6511 if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
6512 || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
6513 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
6514 // BMP codepoint 0000..D7FF or DC00..FFFF
6515 try { // using try for ch < 0 is faster than using an if statement
6516 return PROPERTY_TRIE_DATA_[
6517 (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
6519 } catch (ArrayIndexOutOfBoundsException e) {
6520 return PROPERTY_INITIAL_VALUE_;
6523 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
6524 // lead surrogate D800..DBFF
6525 return PROPERTY_TRIE_DATA_[
6526 (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
6530 if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
6531 // supplementary code point 10000..10FFFF
6532 // look at the construction of supplementary characters
6533 // trail forms the ends of it.
6534 return PROPERTY_.m_trie_.getSurrogateValue(
6535 UTF16.getLeadSurrogate(ch),
6536 (char)(ch & 0x3ff));
6538 // return m_dataOffset_ if there is an error, in this case we return
6539 // the default value: m_initialValue_
6540 // we cannot assume that m_initialValue_ is at offset 0
6541 // this is for optimization.
6542 return PROPERTY_INITIAL_VALUE_;