3 *******************************************************************************
\r
4 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
5 * others. All Rights Reserved. *
\r
6 *******************************************************************************
\r
9 package com.ibm.icu.lang;
\r
11 import java.io.IOException;
\r
12 import java.lang.ref.SoftReference;
\r
13 import java.util.HashMap;
\r
14 import java.util.Locale;
\r
15 import java.util.Map;
\r
16 import java.util.MissingResourceException;
\r
18 import com.ibm.icu.impl.UBiDiProps;
\r
19 import com.ibm.icu.impl.UCaseProps;
\r
20 import com.ibm.icu.impl.NormalizerImpl;
\r
21 import com.ibm.icu.impl.UCharacterUtility;
\r
22 import com.ibm.icu.impl.UCharacterName;
\r
23 import com.ibm.icu.impl.UCharacterNameChoice;
\r
24 import com.ibm.icu.impl.UPropertyAliases;
\r
25 import com.ibm.icu.lang.UCharacterEnums.*;
\r
26 import com.ibm.icu.text.BreakIterator;
\r
27 import com.ibm.icu.text.UTF16;
\r
28 import com.ibm.icu.impl.UCharacterProperty;
\r
29 import com.ibm.icu.util.RangeValueIterator;
\r
30 import com.ibm.icu.util.ULocale;
\r
31 import com.ibm.icu.util.ValueIterator;
\r
32 import com.ibm.icu.util.VersionInfo;
\r
36 * The UCharacter class provides extensions to the
\r
37 * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
\r
38 * java.lang.Character</a> class. These extensions provide support for
\r
39 * more Unicode properties and together with the <a href="../text/UTF16.html">UTF16</a>
\r
40 * class, provide support for supplementary characters (those with code
\r
41 * points above U+FFFF).
\r
42 * Each ICU release supports the latest version of Unicode available at that time.
\r
45 * Code points are represented in this API using ints. While it would be
\r
46 * more convenient in Java to have a separate primitive datatype for them,
\r
47 * ints suffice in the meantime.
\r
50 * To use this class please add the jar file name icu4j.jar to the
\r
51 * class path, since it contains data files which supply the information used
\r
53 * E.g. In Windows <br>
\r
54 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar</code>.<br>
\r
55 * Otherwise, another method would be to copy the files uprops.dat and
\r
56 * unames.icu from the icu4j source subdirectory
\r
57 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
\r
58 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
\r
61 * Aside from the additions for UTF-16 support, and the updated Unicode
\r
62 * properties, the main differences between UCharacter and Character are:
\r
64 * <li> UCharacter is not designed to be a char wrapper and does not have
\r
65 * APIs that involve management of that single char.<br>
\r
68 * <li> char charValue(),
\r
69 * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
\r
71 * <li> UCharacter does not include Character APIs that are deprecated, nor
\r
72 * does it include the Java-specific character information, such as
\r
73 * boolean isJavaIdentifierPart(char ch).
\r
74 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
\r
75 * values '10' - '35'. UCharacter also does this in digit and
\r
76 * getNumericValue, to adhere to the java semantics of these
\r
77 * methods. New methods unicodeDigit, and
\r
78 * getUnicodeNumericValue do not treat the above code points
\r
79 * as having numeric values. This is a semantic change from ICU4J 1.3.1.
\r
82 * Further detailed differences can be determined from the program
\r
83 * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
\r
84 * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
\r
87 * In addition to Java compatibility functions, which calculate derived properties,
\r
88 * this API provides low-level access to the Unicode Character Database.
\r
91 * Unicode assigns each code point (not just assigned character) values for
\r
93 * Most of them are simple boolean flags, or constants from a small enumerated list.
\r
94 * For some properties, values are strings or other relatively more complex types.
\r
97 * For more information see
\r
98 * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
\r
99 * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
\r
102 * There are also functions that provide easy migration from C/POSIX functions
\r
103 * like isblank(). Their use is generally discouraged because the C/POSIX
\r
104 * standards do not define their semantics beyond the ASCII range, which means
\r
105 * that different implementations exhibit very different behavior.
\r
106 * Instead, Unicode properties should be used directly.
\r
109 * There are also only a few, broad C/POSIX character classes, and they tend
\r
110 * to be used for conflicting purposes. For example, the "isalpha()" class
\r
111 * is sometimes used to determine word boundaries, while a more sophisticated
\r
112 * approach would at least distinguish initial letters from continuation
\r
113 * characters (the latter including combining marks).
\r
114 * (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
\r
115 * Another example: There is no "istitle()" class for titlecase characters.
\r
118 * ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
\r
119 * ICU implements them according to the Standard Recommendations in
\r
120 * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
\r
121 * (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
\r
124 * API access for C/POSIX character classes is as follows:
\r
125 * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
\r
126 * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
\r
127 * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
\r
128 * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
\r
129 * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
\r
130 * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
\r
131 * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
\r
132 * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
\r
133 * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
\r
134 * - cntrl: getType(c)==CONTROL
\r
135 * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
\r
136 * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
\r
139 * The C/POSIX character classes are also available in UnicodeSet patterns,
\r
140 * using patterns like [:graph:] or \p{graph}.
\r
143 * Note: There are several ICU (and Java) whitespace functions.
\r
145 * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
\r
146 * most of general categories "Z" (separators) + most whitespace ISO controls
\r
147 * (including no-break spaces, but excluding IS1..IS4 and ZWSP)
\r
148 * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
\r
149 * - isSpaceChar: just Z (including no-break spaces)
\r
152 * This class is not subclassable
\r
154 * @author Syn Wee Quek
\r
156 * @see com.ibm.icu.lang.UCharacterEnums
\r
159 public final class UCharacter implements ECharacterCategory, ECharacterDirection
\r
161 // public inner classes ----------------------------------------------
\r
164 * A family of character subsets representing the character blocks in the
\r
165 * Unicode specification, generated from Unicode Data file Blocks.txt.
\r
166 * Character blocks generally define characters used for a specific script
\r
167 * or purpose. A character is contained by at most one Unicode block.
\r
170 public static final class UnicodeBlock extends Character.Subset
\r
172 // block id corresponding to icu4c -----------------------------------
\r
177 public static final int INVALID_CODE_ID = -1;
\r
181 public static final int BASIC_LATIN_ID = 1;
\r
185 public static final int LATIN_1_SUPPLEMENT_ID = 2;
\r
189 public static final int LATIN_EXTENDED_A_ID = 3;
\r
193 public static final int LATIN_EXTENDED_B_ID = 4;
\r
197 public static final int IPA_EXTENSIONS_ID = 5;
\r
201 public static final int SPACING_MODIFIER_LETTERS_ID = 6;
\r
205 public static final int COMBINING_DIACRITICAL_MARKS_ID = 7;
\r
207 * Unicode 3.2 renames this block to "Greek and Coptic".
\r
210 public static final int GREEK_ID = 8;
\r
214 public static final int CYRILLIC_ID = 9;
\r
218 public static final int ARMENIAN_ID = 10;
\r
222 public static final int HEBREW_ID = 11;
\r
226 public static final int ARABIC_ID = 12;
\r
230 public static final int SYRIAC_ID = 13;
\r
234 public static final int THAANA_ID = 14;
\r
238 public static final int DEVANAGARI_ID = 15;
\r
242 public static final int BENGALI_ID = 16;
\r
246 public static final int GURMUKHI_ID = 17;
\r
250 public static final int GUJARATI_ID = 18;
\r
254 public static final int ORIYA_ID = 19;
\r
258 public static final int TAMIL_ID = 20;
\r
262 public static final int TELUGU_ID = 21;
\r
266 public static final int KANNADA_ID = 22;
\r
270 public static final int MALAYALAM_ID = 23;
\r
274 public static final int SINHALA_ID = 24;
\r
278 public static final int THAI_ID = 25;
\r
282 public static final int LAO_ID = 26;
\r
286 public static final int TIBETAN_ID = 27;
\r
290 public static final int MYANMAR_ID = 28;
\r
294 public static final int GEORGIAN_ID = 29;
\r
298 public static final int HANGUL_JAMO_ID = 30;
\r
302 public static final int ETHIOPIC_ID = 31;
\r
306 public static final int CHEROKEE_ID = 32;
\r
310 public static final int UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID = 33;
\r
314 public static final int OGHAM_ID = 34;
\r
318 public static final int RUNIC_ID = 35;
\r
322 public static final int KHMER_ID = 36;
\r
326 public static final int MONGOLIAN_ID = 37;
\r
330 public static final int LATIN_EXTENDED_ADDITIONAL_ID = 38;
\r
334 public static final int GREEK_EXTENDED_ID = 39;
\r
338 public static final int GENERAL_PUNCTUATION_ID = 40;
\r
342 public static final int SUPERSCRIPTS_AND_SUBSCRIPTS_ID = 41;
\r
346 public static final int CURRENCY_SYMBOLS_ID = 42;
\r
348 * Unicode 3.2 renames this block to "Combining Diacritical Marks for
\r
352 public static final int COMBINING_MARKS_FOR_SYMBOLS_ID = 43;
\r
356 public static final int LETTERLIKE_SYMBOLS_ID = 44;
\r
360 public static final int NUMBER_FORMS_ID = 45;
\r
364 public static final int ARROWS_ID = 46;
\r
368 public static final int MATHEMATICAL_OPERATORS_ID = 47;
\r
372 public static final int MISCELLANEOUS_TECHNICAL_ID = 48;
\r
376 public static final int CONTROL_PICTURES_ID = 49;
\r
380 public static final int OPTICAL_CHARACTER_RECOGNITION_ID = 50;
\r
384 public static final int ENCLOSED_ALPHANUMERICS_ID = 51;
\r
388 public static final int BOX_DRAWING_ID = 52;
\r
392 public static final int BLOCK_ELEMENTS_ID = 53;
\r
396 public static final int GEOMETRIC_SHAPES_ID = 54;
\r
400 public static final int MISCELLANEOUS_SYMBOLS_ID = 55;
\r
404 public static final int DINGBATS_ID = 56;
\r
408 public static final int BRAILLE_PATTERNS_ID = 57;
\r
412 public static final int CJK_RADICALS_SUPPLEMENT_ID = 58;
\r
416 public static final int KANGXI_RADICALS_ID = 59;
\r
420 public static final int IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID = 60;
\r
424 public static final int CJK_SYMBOLS_AND_PUNCTUATION_ID = 61;
\r
428 public static final int HIRAGANA_ID = 62;
\r
432 public static final int KATAKANA_ID = 63;
\r
436 public static final int BOPOMOFO_ID = 64;
\r
440 public static final int HANGUL_COMPATIBILITY_JAMO_ID = 65;
\r
444 public static final int KANBUN_ID = 66;
\r
448 public static final int BOPOMOFO_EXTENDED_ID = 67;
\r
452 public static final int ENCLOSED_CJK_LETTERS_AND_MONTHS_ID = 68;
\r
456 public static final int CJK_COMPATIBILITY_ID = 69;
\r
460 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID = 70;
\r
464 public static final int CJK_UNIFIED_IDEOGRAPHS_ID = 71;
\r
468 public static final int YI_SYLLABLES_ID = 72;
\r
472 public static final int YI_RADICALS_ID = 73;
\r
476 public static final int HANGUL_SYLLABLES_ID = 74;
\r
480 public static final int HIGH_SURROGATES_ID = 75;
\r
484 public static final int HIGH_PRIVATE_USE_SURROGATES_ID = 76;
\r
488 public static final int LOW_SURROGATES_ID = 77;
\r
490 * Same as public static final int PRIVATE_USE_ID.
\r
491 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
\r
492 * and multiple code point ranges had this block.
\r
493 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
\r
494 * and adds separate blocks for the supplementary PUAs.
\r
497 public static final int PRIVATE_USE_AREA_ID = 78;
\r
499 * Same as public static final int PRIVATE_USE_AREA_ID.
\r
500 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
\r
501 * and multiple code point ranges had this block.
\r
502 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
\r
503 * and adds separate blocks for the supplementary PUAs.
\r
506 public static final int PRIVATE_USE_ID = PRIVATE_USE_AREA_ID;
\r
510 public static final int CJK_COMPATIBILITY_IDEOGRAPHS_ID = 79;
\r
514 public static final int ALPHABETIC_PRESENTATION_FORMS_ID = 80;
\r
518 public static final int ARABIC_PRESENTATION_FORMS_A_ID = 81;
\r
522 public static final int COMBINING_HALF_MARKS_ID = 82;
\r
526 public static final int CJK_COMPATIBILITY_FORMS_ID = 83;
\r
530 public static final int SMALL_FORM_VARIANTS_ID = 84;
\r
534 public static final int ARABIC_PRESENTATION_FORMS_B_ID = 85;
\r
538 public static final int SPECIALS_ID = 86;
\r
542 public static final int HALFWIDTH_AND_FULLWIDTH_FORMS_ID = 87;
\r
546 public static final int OLD_ITALIC_ID = 88;
\r
550 public static final int GOTHIC_ID = 89;
\r
554 public static final int DESERET_ID = 90;
\r
558 public static final int BYZANTINE_MUSICAL_SYMBOLS_ID = 91;
\r
562 public static final int MUSICAL_SYMBOLS_ID = 92;
\r
566 public static final int MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID = 93;
\r
570 public static final int CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID = 94;
\r
574 public static final int
\r
575 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID = 95;
\r
579 public static final int TAGS_ID = 96;
\r
581 // New blocks in Unicode 3.2
\r
584 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
\r
587 public static final int CYRILLIC_SUPPLEMENTARY_ID = 97;
\r
589 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
\r
593 public static final int CYRILLIC_SUPPLEMENT_ID = 97;
\r
597 public static final int TAGALOG_ID = 98;
\r
601 public static final int HANUNOO_ID = 99;
\r
605 public static final int BUHID_ID = 100;
\r
609 public static final int TAGBANWA_ID = 101;
\r
613 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID = 102;
\r
617 public static final int SUPPLEMENTAL_ARROWS_A_ID = 103;
\r
621 public static final int SUPPLEMENTAL_ARROWS_B_ID = 104;
\r
625 public static final int MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID = 105;
\r
629 public static final int SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID = 106;
\r
633 public static final int KATAKANA_PHONETIC_EXTENSIONS_ID = 107;
\r
637 public static final int VARIATION_SELECTORS_ID = 108;
\r
641 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID = 109;
\r
645 public static final int SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID = 110;
\r
650 public static final int LIMBU_ID = 111; /*[1900]*/
\r
654 public static final int TAI_LE_ID = 112; /*[1950]*/
\r
658 public static final int KHMER_SYMBOLS_ID = 113; /*[19E0]*/
\r
662 public static final int PHONETIC_EXTENSIONS_ID = 114; /*[1D00]*/
\r
666 public static final int MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID = 115; /*[2B00]*/
\r
670 public static final int YIJING_HEXAGRAM_SYMBOLS_ID = 116; /*[4DC0]*/
\r
674 public static final int LINEAR_B_SYLLABARY_ID = 117; /*[10000]*/
\r
678 public static final int LINEAR_B_IDEOGRAMS_ID = 118; /*[10080]*/
\r
682 public static final int AEGEAN_NUMBERS_ID = 119; /*[10100]*/
\r
686 public static final int UGARITIC_ID = 120; /*[10380]*/
\r
690 public static final int SHAVIAN_ID = 121; /*[10450]*/
\r
694 public static final int OSMANYA_ID = 122; /*[10480]*/
\r
698 public static final int CYPRIOT_SYLLABARY_ID = 123; /*[10800]*/
\r
702 public static final int TAI_XUAN_JING_SYMBOLS_ID = 124; /*[1D300]*/
\r
706 public static final int VARIATION_SELECTORS_SUPPLEMENT_ID = 125; /*[E0100]*/
\r
708 /* New blocks in Unicode 4.1 */
\r
713 public static final int ANCIENT_GREEK_MUSICAL_NOTATION_ID = 126; /*[1D200]*/
\r
718 public static final int ANCIENT_GREEK_NUMBERS_ID = 127; /*[10140]*/
\r
723 public static final int ARABIC_SUPPLEMENT_ID = 128; /*[0750]*/
\r
728 public static final int BUGINESE_ID = 129; /*[1A00]*/
\r
733 public static final int CJK_STROKES_ID = 130; /*[31C0]*/
\r
738 public static final int COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID = 131; /*[1DC0]*/
\r
743 public static final int COPTIC_ID = 132; /*[2C80]*/
\r
748 public static final int ETHIOPIC_EXTENDED_ID = 133; /*[2D80]*/
\r
753 public static final int ETHIOPIC_SUPPLEMENT_ID = 134; /*[1380]*/
\r
758 public static final int GEORGIAN_SUPPLEMENT_ID = 135; /*[2D00]*/
\r
763 public static final int GLAGOLITIC_ID = 136; /*[2C00]*/
\r
768 public static final int KHAROSHTHI_ID = 137; /*[10A00]*/
\r
773 public static final int MODIFIER_TONE_LETTERS_ID = 138; /*[A700]*/
\r
778 public static final int NEW_TAI_LUE_ID = 139; /*[1980]*/
\r
783 public static final int OLD_PERSIAN_ID = 140; /*[103A0]*/
\r
788 public static final int PHONETIC_EXTENSIONS_SUPPLEMENT_ID = 141; /*[1D80]*/
\r
793 public static final int SUPPLEMENTAL_PUNCTUATION_ID = 142; /*[2E00]*/
\r
798 public static final int SYLOTI_NAGRI_ID = 143; /*[A800]*/
\r
803 public static final int TIFINAGH_ID = 144; /*[2D30]*/
\r
808 public static final int VERTICAL_FORMS_ID = 145; /*[FE10]*/
\r
810 /* New blocks in Unicode 5.0 */
\r
815 public static final int NKO_ID = 146; /*[07C0]*/
\r
819 public static final int BALINESE_ID = 147; /*[1B00]*/
\r
823 public static final int LATIN_EXTENDED_C_ID = 148; /*[2C60]*/
\r
827 public static final int LATIN_EXTENDED_D_ID = 149; /*[A720]*/
\r
831 public static final int PHAGS_PA_ID = 150; /*[A840]*/
\r
835 public static final int PHOENICIAN_ID = 151; /*[10900]*/
\r
839 public static final int CUNEIFORM_ID = 152; /*[12000]*/
\r
843 public static final int CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID = 153; /*[12400]*/
\r
847 public static final int COUNTING_ROD_NUMERALS_ID = 154; /*[1D360]*/
\r
852 public static final int SUNDANESE_ID = 155; /* [1B80] */
\r
857 public static final int LEPCHA_ID = 156; /* [1C00] */
\r
862 public static final int OL_CHIKI_ID = 157; /* [1C50] */
\r
867 public static final int CYRILLIC_EXTENDED_A_ID = 158; /* [2DE0] */
\r
872 public static final int VAI_ID = 159; /* [A500] */
\r
877 public static final int CYRILLIC_EXTENDED_B_ID = 160; /* [A640] */
\r
882 public static final int SAURASHTRA_ID = 161; /* [A880] */
\r
887 public static final int KAYAH_LI_ID = 162; /* [A900] */
\r
892 public static final int REJANG_ID = 163; /* [A930] */
\r
897 public static final int CHAM_ID = 164; /* [AA00] */
\r
902 public static final int ANCIENT_SYMBOLS_ID = 165; /* [10190] */
\r
907 public static final int PHAISTOS_DISC_ID = 166; /* [101D0] */
\r
912 public static final int LYCIAN_ID = 167; /* [10280] */
\r
917 public static final int CARIAN_ID = 168; /* [102A0] */
\r
922 public static final int LYDIAN_ID = 169; /* [10920] */
\r
927 public static final int MAHJONG_TILES_ID = 170; /* [1F000] */
\r
932 public static final int DOMINO_TILES_ID = 171; /* [1F030] */
\r
937 public static final int COUNT = 172;
\r
939 // blocks objects ---------------------------------------------------
\r
944 public static final UnicodeBlock NO_BLOCK
\r
945 = new UnicodeBlock("NO_BLOCK", 0);
\r
950 public static final UnicodeBlock BASIC_LATIN
\r
951 = new UnicodeBlock("BASIC_LATIN", BASIC_LATIN_ID);
\r
955 public static final UnicodeBlock LATIN_1_SUPPLEMENT
\r
956 = new UnicodeBlock("LATIN_1_SUPPLEMENT", LATIN_1_SUPPLEMENT_ID);
\r
960 public static final UnicodeBlock LATIN_EXTENDED_A
\r
961 = new UnicodeBlock("LATIN_EXTENDED_A", LATIN_EXTENDED_A_ID);
\r
965 public static final UnicodeBlock LATIN_EXTENDED_B
\r
966 = new UnicodeBlock("LATIN_EXTENDED_B", LATIN_EXTENDED_B_ID);
\r
970 public static final UnicodeBlock IPA_EXTENSIONS
\r
971 = new UnicodeBlock("IPA_EXTENSIONS", IPA_EXTENSIONS_ID);
\r
975 public static final UnicodeBlock SPACING_MODIFIER_LETTERS
\r
976 = new UnicodeBlock("SPACING_MODIFIER_LETTERS", SPACING_MODIFIER_LETTERS_ID);
\r
980 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS
\r
981 = new UnicodeBlock("COMBINING_DIACRITICAL_MARKS", COMBINING_DIACRITICAL_MARKS_ID);
\r
983 * Unicode 3.2 renames this block to "Greek and Coptic".
\r
986 public static final UnicodeBlock GREEK
\r
987 = new UnicodeBlock("GREEK", GREEK_ID);
\r
991 public static final UnicodeBlock CYRILLIC
\r
992 = new UnicodeBlock("CYRILLIC", CYRILLIC_ID);
\r
996 public static final UnicodeBlock ARMENIAN
\r
997 = new UnicodeBlock("ARMENIAN", ARMENIAN_ID);
\r
1001 public static final UnicodeBlock HEBREW
\r
1002 = new UnicodeBlock("HEBREW", HEBREW_ID);
\r
1004 * @stable ICU 2.4
\r
1006 public static final UnicodeBlock ARABIC
\r
1007 = new UnicodeBlock("ARABIC", ARABIC_ID);
\r
1009 * @stable ICU 2.4
\r
1011 public static final UnicodeBlock SYRIAC
\r
1012 = new UnicodeBlock("SYRIAC", SYRIAC_ID);
\r
1014 * @stable ICU 2.4
\r
1016 public static final UnicodeBlock THAANA
\r
1017 = new UnicodeBlock("THAANA", THAANA_ID);
\r
1019 * @stable ICU 2.4
\r
1021 public static final UnicodeBlock DEVANAGARI
\r
1022 = new UnicodeBlock("DEVANAGARI", DEVANAGARI_ID);
\r
1024 * @stable ICU 2.4
\r
1026 public static final UnicodeBlock BENGALI
\r
1027 = new UnicodeBlock("BENGALI", BENGALI_ID);
\r
1029 * @stable ICU 2.4
\r
1031 public static final UnicodeBlock GURMUKHI
\r
1032 = new UnicodeBlock("GURMUKHI", GURMUKHI_ID);
\r
1034 * @stable ICU 2.4
\r
1036 public static final UnicodeBlock GUJARATI
\r
1037 = new UnicodeBlock("GUJARATI", GUJARATI_ID);
\r
1039 * @stable ICU 2.4
\r
1041 public static final UnicodeBlock ORIYA
\r
1042 = new UnicodeBlock("ORIYA", ORIYA_ID);
\r
1044 * @stable ICU 2.4
\r
1046 public static final UnicodeBlock TAMIL
\r
1047 = new UnicodeBlock("TAMIL", TAMIL_ID);
\r
1049 * @stable ICU 2.4
\r
1051 public static final UnicodeBlock TELUGU
\r
1052 = new UnicodeBlock("TELUGU", TELUGU_ID);
\r
1054 * @stable ICU 2.4
\r
1056 public static final UnicodeBlock KANNADA
\r
1057 = new UnicodeBlock("KANNADA", KANNADA_ID);
\r
1059 * @stable ICU 2.4
\r
1061 public static final UnicodeBlock MALAYALAM
\r
1062 = new UnicodeBlock("MALAYALAM", MALAYALAM_ID);
\r
1064 * @stable ICU 2.4
\r
1066 public static final UnicodeBlock SINHALA
\r
1067 = new UnicodeBlock("SINHALA", SINHALA_ID);
\r
1069 * @stable ICU 2.4
\r
1071 public static final UnicodeBlock THAI
\r
1072 = new UnicodeBlock("THAI", THAI_ID);
\r
1074 * @stable ICU 2.4
\r
1076 public static final UnicodeBlock LAO
\r
1077 = new UnicodeBlock("LAO", LAO_ID);
\r
1079 * @stable ICU 2.4
\r
1081 public static final UnicodeBlock TIBETAN
\r
1082 = new UnicodeBlock("TIBETAN", TIBETAN_ID);
\r
1084 * @stable ICU 2.4
\r
1086 public static final UnicodeBlock MYANMAR
\r
1087 = new UnicodeBlock("MYANMAR", MYANMAR_ID);
\r
1089 * @stable ICU 2.4
\r
1091 public static final UnicodeBlock GEORGIAN
\r
1092 = new UnicodeBlock("GEORGIAN", GEORGIAN_ID);
\r
1094 * @stable ICU 2.4
\r
1096 public static final UnicodeBlock HANGUL_JAMO
\r
1097 = new UnicodeBlock("HANGUL_JAMO", HANGUL_JAMO_ID);
\r
1099 * @stable ICU 2.4
\r
1101 public static final UnicodeBlock ETHIOPIC
\r
1102 = new UnicodeBlock("ETHIOPIC", ETHIOPIC_ID);
\r
1104 * @stable ICU 2.4
\r
1106 public static final UnicodeBlock CHEROKEE
\r
1107 = new UnicodeBlock("CHEROKEE", CHEROKEE_ID);
\r
1109 * @stable ICU 2.4
\r
1111 public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
\r
1112 = new UnicodeBlock("UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_ID);
\r
1114 * @stable ICU 2.4
\r
1116 public static final UnicodeBlock OGHAM
\r
1117 = new UnicodeBlock("OGHAM", OGHAM_ID);
\r
1119 * @stable ICU 2.4
\r
1121 public static final UnicodeBlock RUNIC
\r
1122 = new UnicodeBlock("RUNIC", RUNIC_ID);
\r
1124 * @stable ICU 2.4
\r
1126 public static final UnicodeBlock KHMER
\r
1127 = new UnicodeBlock("KHMER", KHMER_ID);
\r
1129 * @stable ICU 2.4
\r
1131 public static final UnicodeBlock MONGOLIAN
\r
1132 = new UnicodeBlock("MONGOLIAN", MONGOLIAN_ID);
\r
1134 * @stable ICU 2.4
\r
1136 public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL
\r
1137 = new UnicodeBlock("LATIN_EXTENDED_ADDITIONAL", LATIN_EXTENDED_ADDITIONAL_ID);
\r
1139 * @stable ICU 2.4
\r
1141 public static final UnicodeBlock GREEK_EXTENDED
\r
1142 = new UnicodeBlock("GREEK_EXTENDED", GREEK_EXTENDED_ID);
\r
1144 * @stable ICU 2.4
\r
1146 public static final UnicodeBlock GENERAL_PUNCTUATION
\r
1147 = new UnicodeBlock("GENERAL_PUNCTUATION", GENERAL_PUNCTUATION_ID);
\r
1149 * @stable ICU 2.4
\r
1151 public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS
\r
1152 = new UnicodeBlock("SUPERSCRIPTS_AND_SUBSCRIPTS", SUPERSCRIPTS_AND_SUBSCRIPTS_ID);
\r
1154 * @stable ICU 2.4
\r
1156 public static final UnicodeBlock CURRENCY_SYMBOLS
\r
1157 = new UnicodeBlock("CURRENCY_SYMBOLS", CURRENCY_SYMBOLS_ID);
\r
1159 * Unicode 3.2 renames this block to "Combining Diacritical Marks for
\r
1163 public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS
\r
1164 = new UnicodeBlock("COMBINING_MARKS_FOR_SYMBOLS", COMBINING_MARKS_FOR_SYMBOLS_ID);
\r
1166 * @stable ICU 2.4
\r
1168 public static final UnicodeBlock LETTERLIKE_SYMBOLS
\r
1169 = new UnicodeBlock("LETTERLIKE_SYMBOLS", LETTERLIKE_SYMBOLS_ID);
\r
1171 * @stable ICU 2.4
\r
1173 public static final UnicodeBlock NUMBER_FORMS
\r
1174 = new UnicodeBlock("NUMBER_FORMS", NUMBER_FORMS_ID);
\r
1176 * @stable ICU 2.4
\r
1178 public static final UnicodeBlock ARROWS
\r
1179 = new UnicodeBlock("ARROWS", ARROWS_ID);
\r
1181 * @stable ICU 2.4
\r
1183 public static final UnicodeBlock MATHEMATICAL_OPERATORS
\r
1184 = new UnicodeBlock("MATHEMATICAL_OPERATORS", MATHEMATICAL_OPERATORS_ID);
\r
1186 * @stable ICU 2.4
\r
1188 public static final UnicodeBlock MISCELLANEOUS_TECHNICAL
\r
1189 = new UnicodeBlock("MISCELLANEOUS_TECHNICAL", MISCELLANEOUS_TECHNICAL_ID);
\r
1191 * @stable ICU 2.4
\r
1193 public static final UnicodeBlock CONTROL_PICTURES
\r
1194 = new UnicodeBlock("CONTROL_PICTURES", CONTROL_PICTURES_ID);
\r
1196 * @stable ICU 2.4
\r
1198 public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION
\r
1199 = new UnicodeBlock("OPTICAL_CHARACTER_RECOGNITION", OPTICAL_CHARACTER_RECOGNITION_ID);
\r
1201 * @stable ICU 2.4
\r
1203 public static final UnicodeBlock ENCLOSED_ALPHANUMERICS
\r
1204 = new UnicodeBlock("ENCLOSED_ALPHANUMERICS", ENCLOSED_ALPHANUMERICS_ID);
\r
1206 * @stable ICU 2.4
\r
1208 public static final UnicodeBlock BOX_DRAWING
\r
1209 = new UnicodeBlock("BOX_DRAWING", BOX_DRAWING_ID);
\r
1211 * @stable ICU 2.4
\r
1213 public static final UnicodeBlock BLOCK_ELEMENTS
\r
1214 = new UnicodeBlock("BLOCK_ELEMENTS", BLOCK_ELEMENTS_ID);
\r
1216 * @stable ICU 2.4
\r
1218 public static final UnicodeBlock GEOMETRIC_SHAPES
\r
1219 = new UnicodeBlock("GEOMETRIC_SHAPES", GEOMETRIC_SHAPES_ID);
\r
1221 * @stable ICU 2.4
\r
1223 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS
\r
1224 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS", MISCELLANEOUS_SYMBOLS_ID);
\r
1226 * @stable ICU 2.4
\r
1228 public static final UnicodeBlock DINGBATS
\r
1229 = new UnicodeBlock("DINGBATS", DINGBATS_ID);
\r
1231 * @stable ICU 2.4
\r
1233 public static final UnicodeBlock BRAILLE_PATTERNS
\r
1234 = new UnicodeBlock("BRAILLE_PATTERNS", BRAILLE_PATTERNS_ID);
\r
1236 * @stable ICU 2.4
\r
1238 public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT
\r
1239 = new UnicodeBlock("CJK_RADICALS_SUPPLEMENT", CJK_RADICALS_SUPPLEMENT_ID);
\r
1241 * @stable ICU 2.4
\r
1243 public static final UnicodeBlock KANGXI_RADICALS
\r
1244 = new UnicodeBlock("KANGXI_RADICALS", KANGXI_RADICALS_ID);
\r
1246 * @stable ICU 2.4
\r
1248 public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS
\r
1249 = new UnicodeBlock("IDEOGRAPHIC_DESCRIPTION_CHARACTERS", IDEOGRAPHIC_DESCRIPTION_CHARACTERS_ID);
\r
1251 * @stable ICU 2.4
\r
1253 public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION
\r
1254 = new UnicodeBlock("CJK_SYMBOLS_AND_PUNCTUATION", CJK_SYMBOLS_AND_PUNCTUATION_ID);
\r
1256 * @stable ICU 2.4
\r
1258 public static final UnicodeBlock HIRAGANA
\r
1259 = new UnicodeBlock("HIRAGANA", HIRAGANA_ID);
\r
1261 * @stable ICU 2.4
\r
1263 public static final UnicodeBlock KATAKANA
\r
1264 = new UnicodeBlock("KATAKANA", KATAKANA_ID);
\r
1266 * @stable ICU 2.4
\r
1268 public static final UnicodeBlock BOPOMOFO
\r
1269 = new UnicodeBlock("BOPOMOFO", BOPOMOFO_ID);
\r
1271 * @stable ICU 2.4
\r
1273 public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO
\r
1274 = new UnicodeBlock("HANGUL_COMPATIBILITY_JAMO", HANGUL_COMPATIBILITY_JAMO_ID);
\r
1276 * @stable ICU 2.4
\r
1278 public static final UnicodeBlock KANBUN
\r
1279 = new UnicodeBlock("KANBUN", KANBUN_ID);
\r
1281 * @stable ICU 2.4
\r
1283 public static final UnicodeBlock BOPOMOFO_EXTENDED
\r
1284 = new UnicodeBlock("BOPOMOFO_EXTENDED", BOPOMOFO_EXTENDED_ID);
\r
1286 * @stable ICU 2.4
\r
1288 public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS
\r
1289 = new UnicodeBlock("ENCLOSED_CJK_LETTERS_AND_MONTHS", ENCLOSED_CJK_LETTERS_AND_MONTHS_ID);
\r
1291 * @stable ICU 2.4
\r
1293 public static final UnicodeBlock CJK_COMPATIBILITY
\r
1294 = new UnicodeBlock("CJK_COMPATIBILITY", CJK_COMPATIBILITY_ID);
\r
1296 * @stable ICU 2.4
\r
1298 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
\r
1299 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A_ID);
\r
1301 * @stable ICU 2.4
\r
1303 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS
\r
1304 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS", CJK_UNIFIED_IDEOGRAPHS_ID);
\r
1306 * @stable ICU 2.4
\r
1308 public static final UnicodeBlock YI_SYLLABLES
\r
1309 = new UnicodeBlock("YI_SYLLABLES", YI_SYLLABLES_ID);
\r
1311 * @stable ICU 2.4
\r
1313 public static final UnicodeBlock YI_RADICALS
\r
1314 = new UnicodeBlock("YI_RADICALS", YI_RADICALS_ID);
\r
1316 * @stable ICU 2.4
\r
1318 public static final UnicodeBlock HANGUL_SYLLABLES
\r
1319 = new UnicodeBlock("HANGUL_SYLLABLES", HANGUL_SYLLABLES_ID);
\r
1321 * @stable ICU 2.4
\r
1323 public static final UnicodeBlock HIGH_SURROGATES
\r
1324 = new UnicodeBlock("HIGH_SURROGATES", HIGH_SURROGATES_ID);
\r
1326 * @stable ICU 2.4
\r
1328 public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES
\r
1329 = new UnicodeBlock("HIGH_PRIVATE_USE_SURROGATES", HIGH_PRIVATE_USE_SURROGATES_ID);
\r
1331 * @stable ICU 2.4
\r
1333 public static final UnicodeBlock LOW_SURROGATES
\r
1334 = new UnicodeBlock("LOW_SURROGATES", LOW_SURROGATES_ID);
\r
1336 * Same as public static final int PRIVATE_USE.
\r
1337 * Until Unicode 3.1.1; the corresponding block name was "Private Use";
\r
1338 * and multiple code point ranges had this block.
\r
1339 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
\r
1340 * and adds separate blocks for the supplementary PUAs.
\r
1343 public static final UnicodeBlock PRIVATE_USE_AREA
\r
1344 = new UnicodeBlock("PRIVATE_USE_AREA", 78);
\r
1346 * Same as public static final UnicodeBlock PRIVATE_USE_AREA.
\r
1347 * Until Unicode 3.1.1, the corresponding block name was "Private Use",
\r
1348 * and multiple code point ranges had this block.
\r
1349 * Unicode 3.2 renames the block for the BMP PUA to "Private Use Area"
\r
1350 * and adds separate blocks for the supplementary PUAs.
\r
1353 public static final UnicodeBlock PRIVATE_USE
\r
1354 = PRIVATE_USE_AREA;
\r
1356 * @stable ICU 2.4
\r
1358 public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS
\r
1359 = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS", CJK_COMPATIBILITY_IDEOGRAPHS_ID);
\r
1361 * @stable ICU 2.4
\r
1363 public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS
\r
1364 = new UnicodeBlock("ALPHABETIC_PRESENTATION_FORMS", ALPHABETIC_PRESENTATION_FORMS_ID);
\r
1366 * @stable ICU 2.4
\r
1368 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A
\r
1369 = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_A", ARABIC_PRESENTATION_FORMS_A_ID);
\r
1371 * @stable ICU 2.4
\r
1373 public static final UnicodeBlock COMBINING_HALF_MARKS
\r
1374 = new UnicodeBlock("COMBINING_HALF_MARKS", COMBINING_HALF_MARKS_ID);
\r
1376 * @stable ICU 2.4
\r
1378 public static final UnicodeBlock CJK_COMPATIBILITY_FORMS
\r
1379 = new UnicodeBlock("CJK_COMPATIBILITY_FORMS", CJK_COMPATIBILITY_FORMS_ID);
\r
1381 * @stable ICU 2.4
\r
1383 public static final UnicodeBlock SMALL_FORM_VARIANTS
\r
1384 = new UnicodeBlock("SMALL_FORM_VARIANTS", SMALL_FORM_VARIANTS_ID);
\r
1386 * @stable ICU 2.4
\r
1388 public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B
\r
1389 = new UnicodeBlock("ARABIC_PRESENTATION_FORMS_B", ARABIC_PRESENTATION_FORMS_B_ID);
\r
1391 * @stable ICU 2.4
\r
1393 public static final UnicodeBlock SPECIALS
\r
1394 = new UnicodeBlock("SPECIALS", SPECIALS_ID);
\r
1396 * @stable ICU 2.4
\r
1398 public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS
\r
1399 = new UnicodeBlock("HALFWIDTH_AND_FULLWIDTH_FORMS", HALFWIDTH_AND_FULLWIDTH_FORMS_ID);
\r
1401 * @stable ICU 2.4
\r
1403 public static final UnicodeBlock OLD_ITALIC
\r
1404 = new UnicodeBlock("OLD_ITALIC", OLD_ITALIC_ID);
\r
1406 * @stable ICU 2.4
\r
1408 public static final UnicodeBlock GOTHIC
\r
1409 = new UnicodeBlock("GOTHIC", GOTHIC_ID);
\r
1411 * @stable ICU 2.4
\r
1413 public static final UnicodeBlock DESERET
\r
1414 = new UnicodeBlock("DESERET", DESERET_ID);
\r
1416 * @stable ICU 2.4
\r
1418 public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS
\r
1419 = new UnicodeBlock("BYZANTINE_MUSICAL_SYMBOLS", BYZANTINE_MUSICAL_SYMBOLS_ID);
\r
1421 * @stable ICU 2.4
\r
1423 public static final UnicodeBlock MUSICAL_SYMBOLS
\r
1424 = new UnicodeBlock("MUSICAL_SYMBOLS", MUSICAL_SYMBOLS_ID);
\r
1426 * @stable ICU 2.4
\r
1428 public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS
\r
1429 = new UnicodeBlock("MATHEMATICAL_ALPHANUMERIC_SYMBOLS", MATHEMATICAL_ALPHANUMERIC_SYMBOLS_ID);
\r
1431 * @stable ICU 2.4
\r
1433 public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
\r
1434 = new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B_ID);
\r
1436 * @stable ICU 2.4
\r
1438 public static final UnicodeBlock
\r
1439 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT
\r
1440 = new UnicodeBlock("CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT_ID);
\r
1442 * @stable ICU 2.4
\r
1444 public static final UnicodeBlock TAGS
\r
1445 = new UnicodeBlock("TAGS", TAGS_ID);
\r
1447 // New blocks in Unicode 3.2
\r
1450 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
\r
1451 * @stable ICU 2.4
\r
1453 public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY
\r
1454 = new UnicodeBlock("CYRILLIC_SUPPLEMENTARY", CYRILLIC_SUPPLEMENTARY_ID);
\r
1456 * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement".
\r
1459 public static final UnicodeBlock CYRILLIC_SUPPLEMENT
\r
1460 = new UnicodeBlock("CYRILLIC_SUPPLEMENT", CYRILLIC_SUPPLEMENT_ID);
\r
1462 * @stable ICU 2.4
\r
1464 public static final UnicodeBlock TAGALOG
\r
1465 = new UnicodeBlock("TAGALOG", TAGALOG_ID);
\r
1467 * @stable ICU 2.4
\r
1469 public static final UnicodeBlock HANUNOO
\r
1470 = new UnicodeBlock("HANUNOO", HANUNOO_ID);
\r
1472 * @stable ICU 2.4
\r
1474 public static final UnicodeBlock BUHID
\r
1475 = new UnicodeBlock("BUHID", BUHID_ID);
\r
1477 * @stable ICU 2.4
\r
1479 public static final UnicodeBlock TAGBANWA
\r
1480 = new UnicodeBlock("TAGBANWA", TAGBANWA_ID);
\r
1482 * @stable ICU 2.4
\r
1484 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A
\r
1485 = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A_ID);
\r
1487 * @stable ICU 2.4
\r
1489 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A
\r
1490 = new UnicodeBlock("SUPPLEMENTAL_ARROWS_A", SUPPLEMENTAL_ARROWS_A_ID);
\r
1492 * @stable ICU 2.4
\r
1494 public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B
\r
1495 = new UnicodeBlock("SUPPLEMENTAL_ARROWS_B", SUPPLEMENTAL_ARROWS_B_ID);
\r
1497 * @stable ICU 2.4
\r
1499 public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B
\r
1500 = new UnicodeBlock("MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B_ID);
\r
1502 * @stable ICU 2.4
\r
1504 public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS
\r
1505 = new UnicodeBlock("SUPPLEMENTAL_MATHEMATICAL_OPERATORS", SUPPLEMENTAL_MATHEMATICAL_OPERATORS_ID);
\r
1507 * @stable ICU 2.4
\r
1509 public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS
\r
1510 = new UnicodeBlock("KATAKANA_PHONETIC_EXTENSIONS", KATAKANA_PHONETIC_EXTENSIONS_ID);
\r
1512 * @stable ICU 2.4
\r
1514 public static final UnicodeBlock VARIATION_SELECTORS
\r
1515 = new UnicodeBlock("VARIATION_SELECTORS", VARIATION_SELECTORS_ID);
\r
1517 * @stable ICU 2.4
\r
1519 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A
\r
1520 = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_A", SUPPLEMENTARY_PRIVATE_USE_AREA_A_ID);
\r
1522 * @stable ICU 2.4
\r
1524 public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B
\r
1525 = new UnicodeBlock("SUPPLEMENTARY_PRIVATE_USE_AREA_B", SUPPLEMENTARY_PRIVATE_USE_AREA_B_ID);
\r
1528 * @stable ICU 2.6
\r
1530 public static final UnicodeBlock LIMBU
\r
1531 = new UnicodeBlock("LIMBU", LIMBU_ID);
\r
1533 * @stable ICU 2.6
\r
1535 public static final UnicodeBlock TAI_LE
\r
1536 = new UnicodeBlock("TAI_LE", TAI_LE_ID);
\r
1538 * @stable ICU 2.6
\r
1540 public static final UnicodeBlock KHMER_SYMBOLS
\r
1541 = new UnicodeBlock("KHMER_SYMBOLS", KHMER_SYMBOLS_ID);
\r
1544 * @stable ICU 2.6
\r
1546 public static final UnicodeBlock PHONETIC_EXTENSIONS
\r
1547 = new UnicodeBlock("PHONETIC_EXTENSIONS", PHONETIC_EXTENSIONS_ID);
\r
1550 * @stable ICU 2.6
\r
1552 public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS
\r
1553 = new UnicodeBlock("MISCELLANEOUS_SYMBOLS_AND_ARROWS", MISCELLANEOUS_SYMBOLS_AND_ARROWS_ID);
\r
1555 * @stable ICU 2.6
\r
1557 public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS
\r
1558 = new UnicodeBlock("YIJING_HEXAGRAM_SYMBOLS", YIJING_HEXAGRAM_SYMBOLS_ID);
\r
1560 * @stable ICU 2.6
\r
1562 public static final UnicodeBlock LINEAR_B_SYLLABARY
\r
1563 = new UnicodeBlock("LINEAR_B_SYLLABARY", LINEAR_B_SYLLABARY_ID);
\r
1565 * @stable ICU 2.6
\r
1567 public static final UnicodeBlock LINEAR_B_IDEOGRAMS
\r
1568 = new UnicodeBlock("LINEAR_B_IDEOGRAMS", LINEAR_B_IDEOGRAMS_ID);
\r
1570 * @stable ICU 2.6
\r
1572 public static final UnicodeBlock AEGEAN_NUMBERS
\r
1573 = new UnicodeBlock("AEGEAN_NUMBERS", AEGEAN_NUMBERS_ID);
\r
1575 * @stable ICU 2.6
\r
1577 public static final UnicodeBlock UGARITIC
\r
1578 = new UnicodeBlock("UGARITIC", UGARITIC_ID);
\r
1580 * @stable ICU 2.6
\r
1582 public static final UnicodeBlock SHAVIAN
\r
1583 = new UnicodeBlock("SHAVIAN", SHAVIAN_ID);
\r
1585 * @stable ICU 2.6
\r
1587 public static final UnicodeBlock OSMANYA
\r
1588 = new UnicodeBlock("OSMANYA", OSMANYA_ID);
\r
1590 * @stable ICU 2.6
\r
1592 public static final UnicodeBlock CYPRIOT_SYLLABARY
\r
1593 = new UnicodeBlock("CYPRIOT_SYLLABARY", CYPRIOT_SYLLABARY_ID);
\r
1595 * @stable ICU 2.6
\r
1597 public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS
\r
1598 = new UnicodeBlock("TAI_XUAN_JING_SYMBOLS", TAI_XUAN_JING_SYMBOLS_ID);
\r
1601 * @stable ICU 2.6
\r
1603 public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT
\r
1604 = new UnicodeBlock("VARIATION_SELECTORS_SUPPLEMENT", VARIATION_SELECTORS_SUPPLEMENT_ID);
\r
1606 /* New blocks in Unicode 4.1 */
\r
1611 public static final UnicodeBlock ANCIENT_GREEK_MUSICAL_NOTATION = new UnicodeBlock("ANCIENT_GREEK_MUSICAL_NOTATION", ANCIENT_GREEK_MUSICAL_NOTATION_ID); /*[1D200]*/
\r
1616 public static final UnicodeBlock ANCIENT_GREEK_NUMBERS = new UnicodeBlock("ANCIENT_GREEK_NUMBERS", ANCIENT_GREEK_NUMBERS_ID); /*[10140]*/
\r
1621 public static final UnicodeBlock ARABIC_SUPPLEMENT = new UnicodeBlock("ARABIC_SUPPLEMENT", ARABIC_SUPPLEMENT_ID); /*[0750]*/
\r
1626 public static final UnicodeBlock BUGINESE = new UnicodeBlock("BUGINESE", BUGINESE_ID); /*[1A00]*/
\r
1631 public static final UnicodeBlock CJK_STROKES = new UnicodeBlock("CJK_STROKES", CJK_STROKES_ID); /*[31C0]*/
\r
1636 public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_SUPPLEMENT = new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_SUPPLEMENT", COMBINING_DIACRITICAL_MARKS_SUPPLEMENT_ID); /*[1DC0]*/
\r
1641 public static final UnicodeBlock COPTIC = new UnicodeBlock("COPTIC", COPTIC_ID); /*[2C80]*/
\r
1646 public static final UnicodeBlock ETHIOPIC_EXTENDED = new UnicodeBlock("ETHIOPIC_EXTENDED", ETHIOPIC_EXTENDED_ID); /*[2D80]*/
\r
1651 public static final UnicodeBlock ETHIOPIC_SUPPLEMENT = new UnicodeBlock("ETHIOPIC_SUPPLEMENT", ETHIOPIC_SUPPLEMENT_ID); /*[1380]*/
\r
1656 public static final UnicodeBlock GEORGIAN_SUPPLEMENT = new UnicodeBlock("GEORGIAN_SUPPLEMENT", GEORGIAN_SUPPLEMENT_ID); /*[2D00]*/
\r
1661 public static final UnicodeBlock GLAGOLITIC = new UnicodeBlock("GLAGOLITIC", GLAGOLITIC_ID); /*[2C00]*/
\r
1666 public static final UnicodeBlock KHAROSHTHI = new UnicodeBlock("KHAROSHTHI", KHAROSHTHI_ID); /*[10A00]*/
\r
1671 public static final UnicodeBlock MODIFIER_TONE_LETTERS = new UnicodeBlock("MODIFIER_TONE_LETTERS", MODIFIER_TONE_LETTERS_ID); /*[A700]*/
\r
1676 public static final UnicodeBlock NEW_TAI_LUE = new UnicodeBlock("NEW_TAI_LUE", NEW_TAI_LUE_ID); /*[1980]*/
\r
1681 public static final UnicodeBlock OLD_PERSIAN = new UnicodeBlock("OLD_PERSIAN", OLD_PERSIAN_ID); /*[103A0]*/
\r
1686 public static final UnicodeBlock PHONETIC_EXTENSIONS_SUPPLEMENT = new UnicodeBlock("PHONETIC_EXTENSIONS_SUPPLEMENT", PHONETIC_EXTENSIONS_SUPPLEMENT_ID); /*[1D80]*/
\r
1691 public static final UnicodeBlock SUPPLEMENTAL_PUNCTUATION = new UnicodeBlock("SUPPLEMENTAL_PUNCTUATION", SUPPLEMENTAL_PUNCTUATION_ID); /*[2E00]*/
\r
1696 public static final UnicodeBlock SYLOTI_NAGRI = new UnicodeBlock("SYLOTI_NAGRI", SYLOTI_NAGRI_ID); /*[A800]*/
\r
1701 public static final UnicodeBlock TIFINAGH = new UnicodeBlock("TIFINAGH", TIFINAGH_ID); /*[2D30]*/
\r
1706 public static final UnicodeBlock VERTICAL_FORMS = new UnicodeBlock("VERTICAL_FORMS", VERTICAL_FORMS_ID); /*[FE10]*/
\r
1711 public static final UnicodeBlock NKO = new UnicodeBlock("NKO", NKO_ID); /*[07C0]*/
\r
1715 public static final UnicodeBlock BALINESE = new UnicodeBlock("BALINESE", BALINESE_ID); /*[1B00]*/
\r
1719 public static final UnicodeBlock LATIN_EXTENDED_C = new UnicodeBlock("LATIN_EXTENDED_C", LATIN_EXTENDED_C_ID); /*[2C60]*/
\r
1721 * @stable ICU 3.6
\r
1723 public static final UnicodeBlock LATIN_EXTENDED_D = new UnicodeBlock("LATIN_EXTENDED_D", LATIN_EXTENDED_D_ID); /*[A720]*/
\r
1727 public static final UnicodeBlock PHAGS_PA = new UnicodeBlock("PHAGS_PA", PHAGS_PA_ID); /*[A840]*/
\r
1731 public static final UnicodeBlock PHOENICIAN = new UnicodeBlock("PHOENICIAN", PHOENICIAN_ID); /*[10900]*/
\r
1735 public static final UnicodeBlock CUNEIFORM = new UnicodeBlock("CUNEIFORM", CUNEIFORM_ID); /*[12000]*/
\r
1739 public static final UnicodeBlock CUNEIFORM_NUMBERS_AND_PUNCTUATION = new UnicodeBlock("CUNEIFORM_NUMBERS_AND_PUNCTUATION", CUNEIFORM_NUMBERS_AND_PUNCTUATION_ID); /*[12400]*/
\r
1743 public static final UnicodeBlock COUNTING_ROD_NUMERALS = new UnicodeBlock("COUNTING_ROD_NUMERALS", COUNTING_ROD_NUMERALS_ID); /*[1D360]*/
\r
1748 public static final UnicodeBlock SUNDANESE = new UnicodeBlock("SUNDANESE", SUNDANESE_ID); /* [1B80] */
\r
1753 public static final UnicodeBlock LEPCHA = new UnicodeBlock("LEPCHA", LEPCHA_ID); /* [1C00] */
\r
1758 public static final UnicodeBlock OL_CHIKI = new UnicodeBlock("OL_CHIKI", OL_CHIKI_ID); /* [1C50] */
\r
1763 public static final UnicodeBlock CYRILLIC_EXTENDED_A = new UnicodeBlock("CYRILLIC_EXTENDED_A", CYRILLIC_EXTENDED_A_ID); /* [2DE0] */
\r
1768 public static final UnicodeBlock VAI = new UnicodeBlock("VAI", VAI_ID); /* [A500] */
\r
1773 public static final UnicodeBlock CYRILLIC_EXTENDED_B = new UnicodeBlock("CYRILLIC_EXTENDED_B", CYRILLIC_EXTENDED_B_ID); /* [A640] */
\r
1778 public static final UnicodeBlock SAURASHTRA = new UnicodeBlock("SAURASHTRA", SAURASHTRA_ID); /* [A880] */
\r
1783 public static final UnicodeBlock KAYAH_LI = new UnicodeBlock("KAYAH_LI", KAYAH_LI_ID); /* [A900] */
\r
1788 public static final UnicodeBlock REJANG = new UnicodeBlock("REJANG", REJANG_ID); /* [A930] */
\r
1793 public static final UnicodeBlock CHAM = new UnicodeBlock("CHAM", CHAM_ID); /* [AA00] */
\r
1798 public static final UnicodeBlock ANCIENT_SYMBOLS = new UnicodeBlock("ANCIENT_SYMBOLS", ANCIENT_SYMBOLS_ID); /* [10190] */
\r
1803 public static final UnicodeBlock PHAISTOS_DISC = new UnicodeBlock("PHAISTOS_DISC", PHAISTOS_DISC_ID); /* [101D0] */
\r
1808 public static final UnicodeBlock LYCIAN = new UnicodeBlock("LYCIAN", LYCIAN_ID); /* [10280] */
\r
1813 public static final UnicodeBlock CARIAN = new UnicodeBlock("CARIAN", CARIAN_ID); /* [102A0] */
\r
1818 public static final UnicodeBlock LYDIAN = new UnicodeBlock("LYDIAN", LYDIAN_ID); /* [10920] */
\r
1823 public static final UnicodeBlock MAHJONG_TILES = new UnicodeBlock("MAHJONG_TILES", MAHJONG_TILES_ID); /* [1F000] */
\r
1828 public static final UnicodeBlock DOMINO_TILES = new UnicodeBlock("DOMINO_TILES", DOMINO_TILES_ID); /* [1F030] */
\r
1830 * @stable ICU 2.4
\r
1832 public static final UnicodeBlock INVALID_CODE
\r
1833 = new UnicodeBlock("INVALID_CODE", INVALID_CODE_ID);
\r
1835 // public methods --------------------------------------------------
\r
1838 * Gets the only instance of the UnicodeBlock with the argument ID.
\r
1839 * If no such ID exists, an INVALID_CODE UnicodeBlock will be returned.
\r
1840 * @param id UnicodeBlock ID
\r
1841 * @return the only instance of the UnicodeBlock with the argument ID
\r
1842 * if it exists, otherwise an INVALID_CODE UnicodeBlock will be
\r
1846 public static UnicodeBlock getInstance(int id)
\r
1848 if (id >= 0 && id < BLOCKS_.length) {
\r
1849 return BLOCKS_[id];
\r
1851 return INVALID_CODE;
\r
1855 * Returns the Unicode allocation block that contains the code point,
\r
1856 * or INVALID_CODE if the code point is above MAX_VALUE or has no known block ID.
\r
1857 * @param ch code point to be tested
\r
1858 * @return the Unicode allocation block that contains the code point
\r
1861 public static UnicodeBlock of(int ch)
\r
1863 if (ch > MAX_VALUE) {
\r
1864 return INVALID_CODE;
\r
1867 return UnicodeBlock.getInstance((PROPERTY_.getAdditional(ch, 0)
\r
1868 & BLOCK_MASK_) >> BLOCK_SHIFT_);
\r
1872 * Internal function returning of(ch).getID().
\r
1875 * @return numeric block value
\r
// Package-private helper that computes the numeric block ID of a code point
// directly from the property data, without resolving a UnicodeBlock instance.
1878 static int idOf(int ch) {
\r
// Guard: code points outside [0, MAX_VALUE] are handled here, before any
// property lookup is attempted.
1879 if (ch < 0 || ch > MAX_VALUE) {
\r
// Select the block bits of the value returned by getAdditional(ch, 0) with
// BLOCK_MASK_ and shift them down by BLOCK_SHIFT_ to obtain the ID.
1883 return (PROPERTY_.getAdditional(ch, 0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
\r
1887 * Cover the JDK 1.5 API. Return the Unicode block with the
\r
1888 * given name. <br/><b>Note</b>: Unlike JDK 1.5, this only matches
\r
1889 * against the official UCD name and the Java block name
\r
1890 * (ignoring case).
\r
1891 * @param blockName the name of the block to match
\r
1892 * @return the UnicodeBlock with that name
\r
1893 * @throws IllegalArgumentException if the blockName could not be matched
\r
// Resolves a block by name through a name-to-block map that is cached behind
// the static SoftReference mref.
1896 public static final UnicodeBlock forName(String blockName) {
\r
// Try the cached map first. SoftReference.get() returns null once the
// referent has been cleared by the garbage collector.
1898 if (mref != null) {
\r
1899 m = (Map)mref.get();
\r
// Build a fresh map, sized for one entry per element of BLOCKS_.
1902 m = new HashMap(BLOCKS_.length);
\r
1903 for (int i = 0; i < BLOCKS_.length; ++i) {
\r
1904 UnicodeBlock b = BLOCKS_[i];
\r
// Normalized form (see trimBlockName) of the block's long property-value name.
1905 String name = trimBlockName(getPropertyValueName(UProperty.BLOCK, b.getID(), UProperty.NameChoice.LONG));
\r
// Cache the map softly so it can be reclaimed under memory pressure.
// NOTE(review): mref is read and written without synchronization - confirm
// that concurrent rebuilds of the map are acceptable.
1908 mref = new SoftReference(m);
\r
// The requested name goes through the same normalization as the stored names.
1910 UnicodeBlock b = (UnicodeBlock)m.get(trimBlockName(blockName));
\r
// NOTE(review): the exception carries no message; including blockName would
// make failures easier to diagnose.
1912 throw new IllegalArgumentException();
\r
1916 private static SoftReference mref;
\r
1918 private static String trimBlockName(String name) {
\r
1919 String upper = name.toUpperCase();
\r
1920 StringBuffer result = new StringBuffer(upper.length());
\r
1921 for (int i = 0; i < upper.length(); i++) {
\r
1922 char c = upper.charAt(i);
\r
1923 if (c != ' ' && c != '_' && c != '-') {
\r
1927 return result.toString();
\r
1931 * Returns the type ID of this Unicode block
\r
1932 * @return integer type ID of this Unicode block
\r
1935 public int getID()
\r
1940 // private data members ---------------------------------------------
\r
1943 * Array of UnicodeBlocks, for easy access in getInstance(int)
\r
1945 private final static UnicodeBlock BLOCKS_[] = {
\r
1946 NO_BLOCK, BASIC_LATIN,
\r
1947 LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A,
\r
1948 LATIN_EXTENDED_B, IPA_EXTENSIONS,
\r
1949 SPACING_MODIFIER_LETTERS, COMBINING_DIACRITICAL_MARKS,
\r
1953 THAANA, DEVANAGARI,
\r
1954 BENGALI, GURMUKHI,
\r
1957 KANNADA, MALAYALAM,
\r
1960 MYANMAR, GEORGIAN,
\r
1961 HANGUL_JAMO, ETHIOPIC,
\r
1962 CHEROKEE, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
\r
1964 KHMER, MONGOLIAN,
\r
1965 LATIN_EXTENDED_ADDITIONAL, GREEK_EXTENDED,
\r
1966 GENERAL_PUNCTUATION, SUPERSCRIPTS_AND_SUBSCRIPTS,
\r
1967 CURRENCY_SYMBOLS, COMBINING_MARKS_FOR_SYMBOLS,
\r
1968 LETTERLIKE_SYMBOLS, NUMBER_FORMS,
\r
1969 ARROWS, MATHEMATICAL_OPERATORS,
\r
1970 MISCELLANEOUS_TECHNICAL, CONTROL_PICTURES,
\r
1971 OPTICAL_CHARACTER_RECOGNITION, ENCLOSED_ALPHANUMERICS,
\r
1972 BOX_DRAWING, BLOCK_ELEMENTS,
\r
1973 GEOMETRIC_SHAPES, MISCELLANEOUS_SYMBOLS,
\r
1974 DINGBATS, BRAILLE_PATTERNS,
\r
1975 CJK_RADICALS_SUPPLEMENT, KANGXI_RADICALS,
\r
1976 IDEOGRAPHIC_DESCRIPTION_CHARACTERS, CJK_SYMBOLS_AND_PUNCTUATION,
\r
1977 HIRAGANA, KATAKANA,
\r
1978 BOPOMOFO, HANGUL_COMPATIBILITY_JAMO,
\r
1979 KANBUN, BOPOMOFO_EXTENDED,
\r
1980 ENCLOSED_CJK_LETTERS_AND_MONTHS, CJK_COMPATIBILITY,
\r
1981 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, CJK_UNIFIED_IDEOGRAPHS,
\r
1982 YI_SYLLABLES, YI_RADICALS,
\r
1983 HANGUL_SYLLABLES, HIGH_SURROGATES,
\r
1984 HIGH_PRIVATE_USE_SURROGATES, LOW_SURROGATES,
\r
1985 PRIVATE_USE_AREA, CJK_COMPATIBILITY_IDEOGRAPHS,
\r
1986 ALPHABETIC_PRESENTATION_FORMS, ARABIC_PRESENTATION_FORMS_A,
\r
1987 COMBINING_HALF_MARKS, CJK_COMPATIBILITY_FORMS,
\r
1988 SMALL_FORM_VARIANTS, ARABIC_PRESENTATION_FORMS_B,
\r
1989 SPECIALS, HALFWIDTH_AND_FULLWIDTH_FORMS,
\r
1990 OLD_ITALIC, GOTHIC,
\r
1991 DESERET, BYZANTINE_MUSICAL_SYMBOLS,
\r
1992 MUSICAL_SYMBOLS, MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
\r
1993 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
\r
1994 CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
\r
1995 TAGS, CYRILLIC_SUPPLEMENT,
\r
1996 TAGALOG, HANUNOO,
\r
1998 MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, SUPPLEMENTAL_ARROWS_A,
\r
1999 SUPPLEMENTAL_ARROWS_B, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
\r
2000 SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
\r
2001 KATAKANA_PHONETIC_EXTENSIONS,
\r
2002 VARIATION_SELECTORS, SUPPLEMENTARY_PRIVATE_USE_AREA_A,
\r
2003 SUPPLEMENTARY_PRIVATE_USE_AREA_B,
\r
2004 LIMBU, TAI_LE, KHMER_SYMBOLS, PHONETIC_EXTENSIONS,
\r
2005 MISCELLANEOUS_SYMBOLS_AND_ARROWS, YIJING_HEXAGRAM_SYMBOLS,
\r
2006 LINEAR_B_SYLLABARY, LINEAR_B_IDEOGRAMS, AEGEAN_NUMBERS,
\r
2007 UGARITIC, SHAVIAN, OSMANYA, CYPRIOT_SYLLABARY,
\r
2008 TAI_XUAN_JING_SYMBOLS, VARIATION_SELECTORS_SUPPLEMENT,
\r
2010 /* New blocks in Unicode 4.1 */
\r
2011 ANCIENT_GREEK_MUSICAL_NOTATION,
\r
2012 ANCIENT_GREEK_NUMBERS,
\r
2013 ARABIC_SUPPLEMENT,
\r
2016 COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
\r
2018 ETHIOPIC_EXTENDED,
\r
2019 ETHIOPIC_SUPPLEMENT,
\r
2020 GEORGIAN_SUPPLEMENT,
\r
2023 MODIFIER_TONE_LETTERS,
\r
2026 PHONETIC_EXTENSIONS_SUPPLEMENT,
\r
2027 SUPPLEMENTAL_PUNCTUATION,
\r
2038 CUNEIFORM_NUMBERS_AND_PUNCTUATION,
\r
2039 COUNTING_ROD_NUMERALS,
\r
2041 /* New blocks in Unicode 5.1 */
\r
2045 CYRILLIC_EXTENDED_A,
\r
2047 CYRILLIC_EXTENDED_B,
\r
2062 if (COUNT!=BLOCKS_.length) {
\r
2063 throw new java.lang.IllegalStateException("UnicodeBlock fields are inconsistent!");
\r
2067 * Identification code for this UnicodeBlock
\r
2069 private int m_id_;
\r
2071 // private constructor ----------------------------------------------
\r
2074 * UnicodeBlock constructor
\r
2075 * @param name name of this UnicodeBlock
\r
2076 * @param id unique id of this UnicodeBlock
\r
2077 * @exception NullPointerException if name is <code>null</code>
\r
2079 private UnicodeBlock(String name, int id)
\r
2087 * East Asian Width constants.
\r
2088 * @see UProperty#EAST_ASIAN_WIDTH
\r
2089 * @see UCharacter#getIntPropertyValue
\r
public interface EastAsianWidth
{
    // Integer codes for the East Asian Width property. A member interface
    // is implicitly static and its fields are implicitly public, static
    // and final, so the modifiers are not repeated.
    int NEUTRAL   = 0;
    int AMBIGUOUS = 1;
    int HALFWIDTH = 2;
    int FULLWIDTH = 3;
    int NARROW    = 4;
    int WIDE      = 5;

    // One more than the largest code declared above.
    int COUNT     = 6;
}
\r
2125 * Decomposition Type constants.
\r
2126 * @see UProperty#DECOMPOSITION_TYPE
\r
public interface DecompositionType
{
    // Integer codes for the Decomposition Type property. Fields of an
    // interface are implicitly public, static and final.
    int NONE      = 0;
    int CANONICAL = 1;
    int COMPAT    = 2;
    int CIRCLE    = 3;
    int FINAL     = 4;
    int FONT      = 5;
    int FRACTION  = 6;
    int INITIAL   = 7;
    int ISOLATED  = 8;
    int MEDIAL    = 9;
    int NARROW    = 10;
    int NOBREAK   = 11;
    int SMALL     = 12;
    int SQUARE    = 13;
    int SUB       = 14;
    int SUPER     = 15;
    int VERTICAL  = 16;
    int WIDE      = 17;

    // One more than the largest code declared above.
    int COUNT     = 18;
}
\r
2210 * Joining Type constants.
\r
2211 * @see UProperty#JOINING_TYPE
\r
public interface JoiningType
{
    // Integer codes for the Joining Type property. Fields of an interface
    // are implicitly public, static and final.
    int NON_JOINING   = 0;
    int JOIN_CAUSING  = 1;
    int DUAL_JOINING  = 2;
    int LEFT_JOINING  = 3;
    int RIGHT_JOINING = 4;
    int TRANSPARENT   = 5;

    // One more than the largest code declared above.
    int COUNT         = 6;
}
\r
2247 * Joining Group constants.
\r
2248 * @see UProperty#JOINING_GROUP
\r
public interface JoiningGroup
{
    // Integer codes for the Joining Group property. Fields of an interface
    // are implicitly public, static and final.
    int NO_JOINING_GROUP  = 0;
    int AIN               = 1;
    int ALAPH             = 2;
    int ALEF              = 3;
    int BEH               = 4;
    int BETH              = 5;
    int DAL               = 6;
    int DALATH_RISH       = 7;
    int E                 = 8;
    int FEH               = 9;
    int FINAL_SEMKATH     = 10;
    int GAF               = 11;
    int GAMAL             = 12;
    int HAH               = 13;
    int HAMZA_ON_HEH_GOAL = 14;
    int HE                = 15;
    int HEH               = 16;
    int HEH_GOAL          = 17;
    int HETH              = 18;
    int KAF               = 19;
    int KAPH              = 20;
    int KNOTTED_HEH       = 21;
    int LAM               = 22;
    int LAMADH            = 23;
    int MEEM              = 24;
    int MIM               = 25;
    int NOON              = 26;
    int NUN               = 27;
    int PE                = 28;
    int QAF               = 29;
    int QAPH              = 30;
    int REH               = 31;
    int REVERSED_PE       = 32;
    int SAD               = 33;
    int SADHE             = 34;
    int SEEN              = 35;
    int SEMKATH           = 36;
    int SHIN              = 37;
    int SWASH_KAF         = 38;
    int SYRIAC_WAW        = 39;
    int TAH               = 40;
    int TAW               = 41;
    int TEH_MARBUTA       = 42;
    int TETH              = 43;
    int WAW               = 44;
    int YEH               = 45;
    int YEH_BARREE        = 46;
    int YEH_WITH_TAIL     = 47;
    int YUDH              = 48;
    int YUDH_HE           = 49;
    int ZAIN              = 50;
    /**
     * @stable ICU 2.6
     */
    int FE                = 51;
    /**
     * @stable ICU 2.6
     */
    int KHAPH             = 52;
    /**
     * @stable ICU 2.6
     */
    int ZHAIN             = 53;
    /**
     * @stable ICU 4.0
     */
    int BURUSHASKI_YEH_BARREE = 54;

    // One more than the largest code declared above.
    int COUNT             = 55;
}
\r
2480 * Grapheme Cluster Break constants.
\r
2481 * @see UProperty#GRAPHEME_CLUSTER_BREAK
\r
public interface GraphemeClusterBreak {
    // Integer codes for the Grapheme Cluster Break property. Fields of an
    // interface are implicitly public, static and final.
    int OTHER        = 0;
    int CONTROL      = 1;
    int CR           = 2;
    int EXTEND       = 3;
    int L            = 4;
    int LF           = 5;
    int LV           = 6;
    int LVT          = 7;
    int T            = 8;
    int V            = 9;
    int SPACING_MARK = 10;
    int PREPEND      = 11;

    // One more than the largest code declared above.
    int COUNT        = 12;
}
\r
2540 * Word Break constants.
\r
2541 * @see UProperty#WORD_BREAK
\r
public interface WordBreak {
    // Integer codes for the Word Break property. Fields of an interface
    // are implicitly public, static and final.
    int OTHER        = 0;
    int ALETTER      = 1;
    int FORMAT       = 2;
    int KATAKANA     = 3;
    int MIDLETTER    = 4;
    int MIDNUM       = 5;
    int NUMERIC      = 6;
    int EXTENDNUMLET = 7;
    int CR           = 8;
    int EXTEND       = 9;
    int LF           = 10;
    int MIDNUMLET    = 11;
    int NEWLINE      = 12;

    // One more than the largest code declared above.
    int COUNT        = 13;
}
\r
2604 * Sentence Break constants.
\r
2605 * @see UProperty#SENTENCE_BREAK
\r
public interface SentenceBreak {
    // Integer codes for the Sentence Break property. Fields of an interface
    // are implicitly public, static and final.
    int OTHER     = 0;
    int ATERM     = 1;
    int CLOSE     = 2;
    int FORMAT    = 3;
    int LOWER     = 4;
    int NUMERIC   = 5;
    int OLETTER   = 6;
    int SEP       = 7;
    int SP        = 8;
    int STERM     = 9;
    int UPPER     = 10;
    int CR        = 11;
    int EXTEND    = 12;
    int LF        = 13;
    int SCONTINUE = 14;

    // One more than the largest code declared above.
    int COUNT     = 15;
}
\r
2676 * Line Break constants.
\r
2677 * @see UProperty#LINE_BREAK
\r
public interface LineBreak
{
    // Integer codes for the Line Break property. Fields of an interface
    // are implicitly public, static and final.
    int UNKNOWN           = 0;
    int AMBIGUOUS         = 1;
    int ALPHABETIC        = 2;
    int BREAK_BOTH        = 3;
    int BREAK_AFTER       = 4;
    int BREAK_BEFORE      = 5;
    int MANDATORY_BREAK   = 6;
    int CONTINGENT_BREAK  = 7;
    int CLOSE_PUNCTUATION = 8;
    int COMBINING_MARK    = 9;
    int CARRIAGE_RETURN   = 10;
    int EXCLAMATION       = 11;
    int GLUE              = 12;
    int HYPHEN            = 13;
    int IDEOGRAPHIC       = 14;
    /**
     * @see #INSEPARABLE
     */
    int INSEPERABLE       = 15;
    /**
     * Renamed from the misspelled "inseperable" in Unicode 4.0.1.
     */
    int INSEPARABLE       = 15;
    int INFIX_NUMERIC     = 16;
    int LINE_FEED         = 17;
    int NONSTARTER        = 18;
    int NUMERIC           = 19;
    int OPEN_PUNCTUATION  = 20;
    int POSTFIX_NUMERIC   = 21;
    int PREFIX_NUMERIC    = 22;
    int QUOTATION         = 23;
    int COMPLEX_CONTEXT   = 24;
    int SURROGATE         = 25;
    int SPACE             = 26;
    int BREAK_SYMBOLS     = 27;
    int ZWSPACE           = 28;

    /* from here on: new in Unicode 4/ICU 2.6 */
    int NEXT_LINE         = 29; /*[NL]*/
    int WORD_JOINER       = 30; /*[WJ]*/

    /* from here on: new in Unicode 4.1/ICU 3.4 */
    int H2                = 31;
    int H3                = 32;
    int JL                = 33;
    int JT                = 34;
    int JV                = 35;

    // One more than the largest code declared above.
    int COUNT             = 36;
}
\r
2845 * Numeric Type constants.
\r
2846 * @see UProperty#NUMERIC_TYPE
\r
public interface NumericType
{
    // Integer codes for the Numeric Type property. Fields of an interface
    // are implicitly public, static and final.
    int NONE    = 0;
    int DECIMAL = 1;
    int DIGIT   = 2;
    int NUMERIC = 3;

    // One more than the largest code declared above.
    int COUNT   = 4;
}
\r
2874 * Hangul Syllable Type constants.
\r
2876 * @see UProperty#HANGUL_SYLLABLE_TYPE
\r
public interface HangulSyllableType
{
    // Integer codes for the Hangul Syllable Type property; the bracketed
    // tags are the short labels carried over from the original comments.
    // Fields of an interface are implicitly public, static and final.
    int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
    int LEADING_JAMO   = 1; /*[L]*/
    int VOWEL_JAMO     = 2; /*[V]*/
    int TRAILING_JAMO  = 3; /*[T]*/
    int LV_SYLLABLE    = 4; /*[LV]*/
    int LVT_SYLLABLE   = 5; /*[LVT]*/

    // One more than the largest code declared above.
    int COUNT          = 6;
}
\r
2911 // public data members -----------------------------------------------
\r
2914 * The lowest Unicode code point value.
\r
2917 public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
\r
2920 * The highest Unicode code point value (scalar value) according to the
\r
2921 * Unicode Standard.
\r
2922 * This is a 21-bit value (21 bits, rounded up).<br>
\r
2923 * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
\r
2926 public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
\r
2929 * The minimum value for Supplementary code points
\r
2932 public static final int SUPPLEMENTARY_MIN_VALUE =
\r
2933 UTF16.SUPPLEMENTARY_MIN_VALUE;
\r
2936 * Unicode value used when translating into Unicode encoding form and there
\r
2937 * is no existing character.
\r
2940 public static final int REPLACEMENT_CHAR = '\uFFFD';
\r
2943 * Special value that is returned by getUnicodeNumericValue(int) when no
\r
2944 * numeric value is defined for a code point.
\r
2946 * @see #getUnicodeNumericValue
\r
2948 public static final double NO_NUMERIC_VALUE = -123456789;
\r
2951 * Compatibility constant for Java Character's MIN_RADIX.
\r
2954 public static final int MIN_RADIX = java.lang.Character.MIN_RADIX;
\r
2957 * Compatibility constant for Java Character's MAX_RADIX.
\r
2960 public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
\r
2963 * Do not lowercase non-initial parts of words when titlecasing.
\r
2964 * Option bit for titlecasing APIs that take an options bit set.
\r
2966 * By default, titlecasing will titlecase the first cased character
\r
2967 * of a word and lowercase all other characters.
\r
2968 * With this option, the other characters will not be modified.
\r
2970 * @see #toTitleCase
\r
2973 public static final int TITLECASE_NO_LOWERCASE = 0x100;
\r
2976 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
\r
2977 * titlecase exactly the characters at breaks from the iterator.
\r
2978 * Option bit for titlecasing APIs that take an options bit set.
\r
2980 * By default, titlecasing will take each break iterator index,
\r
2981 * adjust it by looking for the next cased character, and titlecase that one.
\r
2982 * Other characters are lowercased.
\r
2984 * This follows Unicode 4 & 5 section 3.13 Default Case Operations:
\r
2986 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
\r
2987 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
\r
2988 * cased character F. If F exists, map F to default_title(F); then map each
\r
2989 * subsequent character C to default_lower(C).
\r
2991 * @see #toTitleCase
\r
2992 * @see #TITLECASE_NO_LOWERCASE
\r
2995 public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
\r
2997 // public methods ----------------------------------------------------
\r
3000 * Retrieves the numeric value of a decimal digit code point.
\r
3001 * <br>This method observes the semantics of
\r
3002 * <code>java.lang.Character.digit()</code>. Note that this
\r
3003 * will return positive values for code points for which isDigit
\r
3004 * returns false, just like java.lang.Character.
\r
3005 * <br><em>Semantic Change:</em> In release 1.3.1 and
\r
3006 * prior, this did not treat the European letters as having a
\r
3007 * digit value, and also treated numeric letters and other numbers as
\r
3009 * This has been changed to conform to the java semantics.
\r
3010 * <br>A code point is a valid digit if and only if:
\r
3012 * <li>ch is a decimal digit or one of the european letters, and
\r
3013 * <li>the value of ch is less than the specified radix.
\r
3015 * @param ch the code point to query
\r
3016 * @param radix the radix
\r
3017 * @return the numeric value represented by the code point in the
\r
3018 * specified radix, or -1 if the code point is not a decimal digit
\r
3019 * or if its value is too large for the radix
\r
3022 public static int digit(int ch, int radix)
\r
3024 // when ch is out of bounds getProperty == 0
\r
3025 int props = getProperty(ch);
\r
3027 if (getNumericType(props) == NumericType.DECIMAL) {
\r
3028 value = UCharacterProperty.getUnsignedValue(props);
\r
3030 value = getEuropeanDigit(ch);
\r
3032 return (0 <= value && value < radix) ? value : -1;
\r
3036 * Retrieves the numeric value of a decimal digit code point.
\r
3037 * <br>This is a convenience overload of <code>digit(int, int)</code>
\r
3038 * that provides a decimal radix.
\r
3039 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
\r
3040 * treated numeric letters and other numbers as digits. This has
\r
3041 * been changed to conform to the java semantics.
\r
3042 * @param ch the code point to query
\r
3043 * @return the numeric value represented by the code point,
\r
3044 * or -1 if the code point is not a decimal digit or if its
\r
3045 * value is too large for a decimal radix
\r
3048 public static int digit(int ch)
\r
3050 int props = getProperty(ch);
\r
3051 if (getNumericType(props) == NumericType.DECIMAL) {
\r
3052 return UCharacterProperty.getUnsignedValue(props);
\r
3059 * Returns the numeric value of the code point as a nonnegative
\r
3061 * <br>If the code point does not have a numeric value, then -1 is returned.
\r
3063 * If the code point has a numeric value that cannot be represented as a
\r
3064 * nonnegative integer (for example, a fractional value), then -2 is
\r
3066 * @param ch the code point to query
\r
3067 * @return the numeric value of the code point, or -1 if it has no numeric
\r
3068 * value, or -2 if it has a numeric value that cannot be represented as a
\r
3069 * nonnegative integer
\r
3072 public static int getNumericValue(int ch)
\r
3074 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
\r
3075 int props = PROPERTY_.getProperty(ch);
\r
3076 int numericType = getNumericType(props);
\r
3078 if(numericType==0) {
\r
3079 return getEuropeanDigit(ch);
\r
3081 if(numericType==UCharacterProperty.NT_FRACTION || numericType>=UCharacterProperty.NT_COUNT) {
\r
3085 int numericValue = UCharacterProperty.getUnsignedValue(props);
\r
3087 if(numericType<NumericType.COUNT) {
\r
3088 /* normal type, the value is stored directly */
\r
3089 return numericValue;
\r
3090 } else /* numericType==NT_LARGE */ {
\r
3091 /* large value with exponent */
\r
3095 mant=numericValue>>LARGE_MANT_SHIFT;
\r
3096 exp=numericValue&LARGE_EXP_MASK;
\r
3099 exp+=LARGE_EXP_OFFSET_EXTRA;
\r
3100 } else if(mant>9) {
\r
3101 return -2; /* reserved mantissa value */
\r
3103 exp+=LARGE_EXP_OFFSET;
\r
3111 /* multiply by 10^exp without math.h */
\r
3130 if(numValue<=Integer.MAX_VALUE) {
\r
3131 return (int)numValue;
\r
3139 * <p>Get the numeric value for a Unicode code point as defined in the
\r
3140 * Unicode Character Database.</p>
\r
3141 * <p>A "double" return type is necessary because some numeric values are
\r
3142 * fractions, negative, or too large for int.</p>
\r
3143 * <p>For characters without any numeric values in the Unicode Character
\r
3144 * Database, this function will return NO_NUMERIC_VALUE.</p>
\r
3145 * <p><em>API Change:</em> In release 2.2 and prior, this API has a
\r
3146 * return type int and returns -1 when the argument ch does not have a
\r
3147 * corresponding numeric value. This has been changed to synch with ICU4C
\r
3149 * This corresponds to the ICU4C function u_getNumericValue.
\r
3150 * @param ch Code point to get the numeric value for.
\r
3151 * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
\r
3154 public static double getUnicodeNumericValue(int ch)
\r
3156 // equivalent to c version double u_getNumericValue(UChar32 c)
\r
3157 int props = PROPERTY_.getProperty(ch);
\r
3158 int numericType = getNumericType(props);
\r
3160 if(numericType==0 || numericType>=UCharacterProperty.NT_COUNT) {
\r
3161 return NO_NUMERIC_VALUE;
\r
3164 int numericValue = UCharacterProperty.getUnsignedValue(props);
\r
3166 if(numericType<NumericType.COUNT) {
\r
3167 /* normal type, the value is stored directly */
\r
3168 return numericValue;
\r
3169 } else if(numericType==UCharacterProperty.NT_FRACTION) {
\r
3170 /* fraction value */
\r
3171 int numerator, denominator;
\r
3173 numerator=numericValue>>FRACTION_NUM_SHIFT;
\r
3174 denominator=(numericValue&FRACTION_DEN_MASK)+FRACTION_DEN_OFFSET;
\r
3176 if(numerator==0) {
\r
3179 return (double)numerator/(double)denominator;
\r
3180 } else /* numericType==NT_LARGE */ {
\r
3181 /* large value with exponent */
\r
3185 mant=numericValue>>LARGE_MANT_SHIFT;
\r
3186 exp=numericValue&LARGE_EXP_MASK;
\r
3189 exp+=LARGE_EXP_OFFSET_EXTRA;
\r
3190 } else if(mant>9) {
\r
3191 return NO_NUMERIC_VALUE; /* reserved mantissa value */
\r
3193 exp+=LARGE_EXP_OFFSET;
\r
3198 /* multiply by 10^exp without math.h */
\r
3223 * Compatibility override of Java deprecated method. This
\r
3224 * method will always remain deprecated. Delegates to
\r
3225 * java.lang.Character.isSpace.
\r
3226 * @param ch the code point
\r
3227 * @return true if the code point is a space character as
\r
3228 * defined by java.lang.Character.isSpace.
\r
3229 * @deprecated ICU 3.4 (Java)
\r
3231 public static boolean isSpace(int ch) {
\r
3232 return ch <= 0x20 &&
\r
3233 (ch == 0x20 || ch == 0x09 || ch == 0x0a || ch == 0x0c || ch == 0x0d);
\r
3237 * Returns a value indicating a code point's Unicode category.
\r
3238 * Up-to-date Unicode implementation of java.lang.Character.getType()
\r
3239 * except for the above mentioned code points that had their category
\r
3241 * Return results are constants from the interface
\r
3242 * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
\r
3243 * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
\r
3244 * those returned by java.lang.Character.getType. UCharacterCategory values
\r
3245 * match the ones used in ICU4C, while java.lang.Character type
\r
3246 * values, though similar, skip the value 17.</p>
\r
3247 * @param ch code point whose type is to be determined
\r
3248 * @return category which is a value of UCharacterCategory
\r
3251 public static int getType(int ch)
\r
3253 return getProperty(ch) & UCharacterProperty.TYPE_MASK;
\r
3257 * Determines if a code point has a defined meaning in the up-to-date
\r
3258 * Unicode standard.
\r
3259 * E.g. supplementary code points though allocated space are not defined in
\r
3260 * Unicode yet.<br>
\r
3261 * Up-to-date Unicode implementation of java.lang.Character.isDefined()
\r
3262 * @param ch code point to be determined if it is defined in the most
\r
3263 * current version of Unicode
\r
3264 * @return true if this code point is defined in unicode
\r
3267 public static boolean isDefined(int ch)
\r
3269 return getType(ch) != 0;
\r
3273 * Determines if a code point is a Java digit.
\r
3274 * <br>This method observes the semantics of
\r
3275 * <code>java.lang.Character.isDigit()</code>. It returns true for decimal
\r
3277 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this treated
\r
3278 * numeric letters and other numbers as digits.
\r
3279 * This has been changed to conform to the java semantics.
\r
3280 * @param ch code point to query
\r
3281 * @return true if this code point is a digit
\r
3284 public static boolean isDigit(int ch)
\r
3286 return getType(ch) == UCharacterCategory.DECIMAL_DIGIT_NUMBER;
\r
3290 * Determines if the specified code point is an ISO control character.
\r
3291 * A code point is considered to be an ISO control character if it is in
\r
3292 * the range \u0000 through \u001F or in the range \u007F through
\r
3294 * Up-to-date Unicode implementation of java.lang.Character.isISOControl()
\r
3295 * @param ch code point to determine if it is an ISO control character
\r
3296 * @return true if code point is an ISO control character
\r
3299 public static boolean isISOControl(int ch)
\r
3301 return ch >= 0 && ch <= APPLICATION_PROGRAM_COMMAND_ &&
\r
3302 ((ch <= UNIT_SEPARATOR_) || (ch >= DELETE_));
\r
3306 * Determines if the specified code point is a letter.
\r
3307 * Up-to-date Unicode implementation of java.lang.Character.isLetter()
\r
3308 * @param ch code point to determine if it is a letter
\r
3309 * @return true if code point is a letter
\r
3312 public static boolean isLetter(int ch)
\r
3314 // if props == 0, it will just fall through and return false
\r
3315 return ((1 << getType(ch))
\r
3316 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3317 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3318 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3319 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3320 | (1 << UCharacterCategory.OTHER_LETTER))) != 0;
\r
3324 * Determines if the specified code point is a letter or digit.
\r
3325 * Note this method, unlike java.lang.Character does not regard the ascii
\r
3326 * characters 'A' - 'Z' and 'a' - 'z' as digits.
\r
3327 * @param ch code point to determine if it is a letter or a digit
\r
3328 * @return true if code point is a letter or a digit
\r
3331 public static boolean isLetterOrDigit(int ch)
\r
3333 return ((1 << getType(ch))
\r
3334 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3335 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3336 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3337 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3338 | (1 << UCharacterCategory.OTHER_LETTER)
\r
3339 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER))) != 0;
\r
3343 * Compatibility override of Java deprecated method. This
\r
3344 * method will always remain deprecated. Delegates to
\r
3345 * java.lang.Character.isJavaIdentifierStart.
\r
3346 * @param cp the code point
\r
3347 * @return true if the code point can start a java identifier.
\r
3348 * @deprecated ICU 3.4 (Java)
\r
3350 public static boolean isJavaLetter(int cp) {
\r
3351 return isJavaIdentifierStart(cp);
\r
3355 * Compatibility override of Java deprecated method. This
\r
3356 * method will always remain deprecated. Delegates to
\r
3357 * java.lang.Character.isJavaIdentifierPart.
\r
3358 * @param cp the code point
\r
3359 * @return true if the code point can continue a java identifier.
\r
3360 * @deprecated ICU 3.4 (Java)
\r
3362 public static boolean isJavaLetterOrDigit(int cp) {
\r
3363 return isJavaIdentifierPart(cp);
\r
3367 * Compatibility override of Java method, delegates to
\r
3368 * java.lang.Character.isJavaIdentifierStart.
\r
3369 * @param cp the code point
\r
3370 * @return true if the code point can start a java identifier.
\r
3373 public static boolean isJavaIdentifierStart(int cp) {
\r
3374 // note, downcast to char for jdk 1.4 compatibility
\r
3375 return java.lang.Character.isJavaIdentifierStart((char)cp);
\r
3379 * Compatibility override of Java method, delegates to
\r
3380 * java.lang.Character.isJavaIdentifierPart.
\r
3381 * @param cp the code point
\r
3382 * @return true if the code point can continue a java identifier.
\r
3385 public static boolean isJavaIdentifierPart(int cp) {
\r
3386 // note, downcast to char for jdk 1.4 compatibility
\r
3387 return java.lang.Character.isJavaIdentifierPart((char)cp);
\r
3391 * Determines if the specified code point is a lowercase character.
\r
3392 * UnicodeData only contains case mappings for code points where they are
\r
3393 * one-to-one mappings; it also omits information about context-sensitive
\r
3394 * case mappings.<br> For more information about Unicode case mapping
\r
3395 * please refer to the
\r
3396 * <a href=http://www.unicode.org/unicode/reports/tr21/>Technical report
\r
3398 * Up-to-date Unicode implementation of java.lang.Character.isLowerCase()
\r
3399 * @param ch code point to determine if it is in lowercase
\r
3400 * @return true if code point is a lowercase character
\r
3403 public static boolean isLowerCase(int ch)
\r
3405 // if props == 0, it will just fall through and return false
\r
3406 return getType(ch) == UCharacterCategory.LOWERCASE_LETTER;
\r
3410 * Determines if the specified code point is a white space character.
\r
3411 * A code point is considered to be a whitespace character if and only
\r
3412 * if it satisfies one of the following criteria:
\r
3414 * <li> It is a Unicode space character (categories "Zs" or "Zl" or "Zp"), but is not
\r
3415 * also a no-break space (\u00A0 or \u2007 or \u202F).
\r
3416 * <li> It is \u0009, HORIZONTAL TABULATION.
\r
3417 * <li> It is \u000A, LINE FEED.
\r
3418 * <li> It is \u000B, VERTICAL TABULATION.
\r
3419 * <li> It is \u000C, FORM FEED.
\r
3420 * <li> It is \u000D, CARRIAGE RETURN.
\r
3421 * <li> It is \u001C, FILE SEPARATOR.
\r
3422 * <li> It is \u001D, GROUP SEPARATOR.
\r
3423 * <li> It is \u001E, RECORD SEPARATOR.
\r
3424 * <li> It is \u001F, UNIT SEPARATOR.
\r
3427 * This API tries to synch to the semantics of the Java API,
\r
3428 * java.lang.Character.isWhitespace(), but it may not return
\r
3429 * the exactly same results because of the Unicode version
\r
3431 * @param ch code point to determine if it is a white space
\r
3432 * @return true if the specified code point is a white space character
\r
3435 public static boolean isWhitespace(int ch)
\r
3437 // exclude no-break spaces
\r
3438 // if props == 0, it will just fall through and return false
\r
3439 return ((1 << getType(ch)) &
\r
3440 ((1 << UCharacterCategory.SPACE_SEPARATOR)
\r
3441 | (1 << UCharacterCategory.LINE_SEPARATOR)
\r
3442 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR))) != 0
\r
3443 && (ch != NO_BREAK_SPACE_) && (ch != FIGURE_SPACE_) && (ch != NARROW_NO_BREAK_SPACE_)
\r
3444 // TAB VT LF FF CR FS GS RS US NL are all control characters
\r
3445 // that are white spaces.
\r
3446 || (ch >= 0x9 && ch <= 0xd) || (ch >= 0x1c && ch <= 0x1f);
\r
3450 * Determines if the specified code point is a Unicode specified space
\r
3451 * character, i.e. if code point is in the category Zs, Zl and Zp.
\r
3452 * Up-to-date Unicode implementation of java.lang.Character.isSpaceChar().
\r
3453 * @param ch code point to determine if it is a space
\r
3454 * @return true if the specified code point is a space character
\r
3457 public static boolean isSpaceChar(int ch)
\r
3459 // if props == 0, it will just fall through and return false
\r
3460 return ((1 << getType(ch)) & ((1 << UCharacterCategory.SPACE_SEPARATOR)
\r
3461 | (1 << UCharacterCategory.LINE_SEPARATOR)
\r
3462 | (1 << UCharacterCategory.PARAGRAPH_SEPARATOR)))
\r
3467 * Determines if the specified code point is a titlecase character.
\r
3468 * UnicodeData only contains case mappings for code points where they are
\r
3469 * one-to-one mappings; it also omits information about context-sensitive
\r
3470 * case mappings.<br>
\r
3471 * For more information about Unicode case mapping please refer to the
\r
3472 * <a href=http://www.unicode.org/unicode/reports/tr21/>
\r
3473 * Technical report #21</a>.<br>
\r
3474 * Up-to-date Unicode implementation of java.lang.Character.isTitleCase().
\r
3475 * @param ch code point to determine if it is in title case
\r
3476 * @return true if the specified code point is a titlecase character
\r
3479 public static boolean isTitleCase(int ch)
\r
3481 // if props == 0, it will just fall through and return false
\r
3482 return getType(ch) == UCharacterCategory.TITLECASE_LETTER;
\r
3486 * Determines if the specified code point may be any part of a Unicode
\r
3487 * identifier other than the starting character.
\r
3488 * A code point may be part of a Unicode identifier if and only if it is
\r
3489 * one of the following:
\r
3491 * <li> Lu Uppercase letter
\r
3492 * <li> Ll Lowercase letter
\r
3493 * <li> Lt Titlecase letter
\r
3494 * <li> Lm Modifier letter
\r
3495 * <li> Lo Other letter
\r
3496 * <li> Nl Letter number
\r
3497 * <li> Pc Connecting punctuation character
\r
3498 * <li> Nd decimal number
\r
3499 * <li> Mc Spacing combining mark
\r
3500 * <li> Mn Non-spacing mark
\r
3501 * <li> Cf formatting code
\r
3503 * Up-to-date Unicode implementation of
\r
3504 * java.lang.Character.isUnicodeIdentifierPart().<br>
\r
3505 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
\r
3506 * @param ch code point to determine if is can be part of a Unicode
\r
3508 * @return true if code point is any character belonging to a Unicode
\r
3509 * identifier suffix after the first character
\r
3512 public static boolean isUnicodeIdentifierPart(int ch)
\r
3514 // if props == 0, it will just fall through and return false
\r
3516 return ((1 << getType(ch))
\r
3517 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3518 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3519 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3520 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3521 | (1 << UCharacterCategory.OTHER_LETTER)
\r
3522 | (1 << UCharacterCategory.LETTER_NUMBER)
\r
3523 | (1 << UCharacterCategory.CONNECTOR_PUNCTUATION)
\r
3524 | (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER)
\r
3525 | (1 << UCharacterCategory.COMBINING_SPACING_MARK)
\r
3526 | (1 << UCharacterCategory.NON_SPACING_MARK))) != 0
\r
3527 || isIdentifierIgnorable(ch);
\r
3531 * Determines if the specified code point is permissible as the first
\r
3532 * character in a Unicode identifier.
\r
3533 * A code point may start a Unicode identifier if it is of type either
\r
3535 * <li> Lu Uppercase letter
\r
3536 * <li> Ll Lowercase letter
\r
3537 * <li> Lt Titlecase letter
\r
3538 * <li> Lm Modifier letter
\r
3539 * <li> Lo Other letter
\r
3540 * <li> Nl Letter number
\r
3542 * Up-to-date Unicode implementation of
\r
3543 * java.lang.Character.isUnicodeIdentifierStart().<br>
\r
3544 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
\r
3545 * @param ch code point to determine if it can start a Unicode identifier
\r
3546 * @return true if code point is the first character belonging to a Unicode
\r
3550 public static boolean isUnicodeIdentifierStart(int ch)
\r
3552 /*int cat = getType(ch);*/
\r
3553 // if props == 0, it will just fall through and return false
\r
3554 return ((1 << getType(ch))
\r
3555 & ((1 << UCharacterCategory.UPPERCASE_LETTER)
\r
3556 | (1 << UCharacterCategory.LOWERCASE_LETTER)
\r
3557 | (1 << UCharacterCategory.TITLECASE_LETTER)
\r
3558 | (1 << UCharacterCategory.MODIFIER_LETTER)
\r
3559 | (1 << UCharacterCategory.OTHER_LETTER)
\r
3560 | (1 << UCharacterCategory.LETTER_NUMBER))) != 0;
\r
3564 * Determines if the specified code point should be regarded as an
\r
3565 * ignorable character in a Unicode identifier.
\r
3566 * A character is ignorable in the Unicode standard if it is of the type
\r
3567 * Cf, Formatting code.<br>
\r
3568 * Up-to-date Unicode implementation of
\r
3569 * java.lang.Character.isIdentifierIgnorable().<br>
\r
3570 * See <a href=http://www.unicode.org/unicode/reports/tr8/>UTR #8</a>.
\r
3571 * @param ch code point to be determined if it can be ignored in a Unicode
\r
3573 * @return true if the code point is ignorable
\r
3576 public static boolean isIdentifierIgnorable(int ch)
\r
3578 // see java.lang.Character.isIdentifierIgnorable() on range of
\r
3579 // ignorable characters.
\r
3581 return isISOControl(ch)
\r
3582 && !((ch >= 0x9 && ch <= 0xd)
\r
3583 || (ch >= 0x1c && ch <= 0x1f));
\r
3585 return getType(ch) == UCharacterCategory.FORMAT;
\r
3589 * Determines if the specified code point is an uppercase character.
\r
3590 * UnicodeData only contains case mappings for code point where they are
\r
3591 * one-to-one mappings; it also omits information about context-sensitive
\r
3592 * case mappings.<br>
\r
3593 * For language specific case conversion behavior, use
\r
3594 * toUpperCase(locale, str). <br>
\r
3595 * For example, the case conversion for dot-less i and dotted I in Turkish,
\r
3596 * or for final sigma in Greek.
\r
3597 * For more information about Unicode case mapping please refer to the
\r
3598 * <a href=http://www.unicode.org/unicode/reports/tr21/>
\r
3599 * Technical report #21</a>.<br>
\r
3600 * Up-to-date Unicode implementation of java.lang.Character.isUpperCase().
\r
3601 * @param ch code point to determine if it is in uppercase
\r
3602 * @return true if the code point is an uppercase character
\r
3605 public static boolean isUpperCase(int ch)
\r
3607 // if props == 0, it will just fall through and return false
\r
3608 return getType(ch) == UCharacterCategory.UPPERCASE_LETTER;
\r
3612 * The given code point is mapped to its lowercase equivalent; if the code
\r
3613 * point has no lowercase equivalent, the code point itself is returned.
\r
3614 * Up-to-date Unicode implementation of java.lang.Character.toLowerCase()
\r
3616 * <p>This function only returns the simple, single-code point case mapping.
\r
3617 * Full case mappings should be used whenever possible because they produce
\r
3618 * better results by working on whole strings.
\r
3619 * They take into account the string context and the language and can map
\r
3620 * to a result string with a different length as appropriate.
\r
3621 * Full case mappings are applied by the case mapping functions
\r
3622 * that take String parameters rather than code points (int).
\r
3623 * See also the User Guide chapter on C/POSIX migration:
\r
3624 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
3626 * @param ch code point whose lowercase equivalent is to be retrieved
\r
3627 * @return the lowercase equivalent code point
\r
3630 public static int toLowerCase(int ch) {
\r
3631 return gCsp.tolower(ch);
\r
3635 * Converts argument code point and returns a String object representing
\r
3636 * the code point's value in UTF16 format.
\r
3637 * The result is a string whose length is 1 for non-supplementary code
\r
3638 * points, 2 otherwise.<br>
\r
3639 * com.ibm.icu.text.UTF16 can be used to parse Strings generated by this
\r
3641 * Up-to-date Unicode implementation of java.lang.Character.toString()
\r
3642 * @param ch code point
\r
3643 * @return string representation of the code point, null if code point is not
\r
3644 * defined in unicode
\r
3647 public static String toString(int ch)
\r
3649 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
3653 if (ch < SUPPLEMENTARY_MIN_VALUE) {
\r
3654 return String.valueOf((char)ch);
\r
3657 StringBuffer result = new StringBuffer();
\r
3658 result.append(UTF16.getLeadSurrogate(ch));
\r
3659 result.append(UTF16.getTrailSurrogate(ch));
\r
3660 return result.toString();
\r
3664 * Converts the code point argument to titlecase.
\r
3665 * If no titlecase is available, the uppercase is returned. If no uppercase
\r
3666 * is available, the code point itself is returned.
\r
3667 * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
\r
3669 * <p>This function only returns the simple, single-code point case mapping.
\r
3670 * Full case mappings should be used whenever possible because they produce
\r
3671 * better results by working on whole strings.
\r
3672 * They take into account the string context and the language and can map
\r
3673 * to a result string with a different length as appropriate.
\r
3674 * Full case mappings are applied by the case mapping functions
\r
3675 * that take String parameters rather than code points (int).
\r
3676 * See also the User Guide chapter on C/POSIX migration:
\r
3677 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
3679 * @param ch code point whose title case is to be retrieved
\r
3680 * @return titlecase code point
\r
3683 public static int toTitleCase(int ch) {
\r
3684 return gCsp.totitle(ch);
\r
3688 * Converts the character argument to uppercase.
\r
3689 * If no uppercase is available, the character itself is returned.
\r
3690 * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
\r
3692 * <p>This function only returns the simple, single-code point case mapping.
\r
3693 * Full case mappings should be used whenever possible because they produce
\r
3694 * better results by working on whole strings.
\r
3695 * They take into account the string context and the language and can map
\r
3696 * to a result string with a different length as appropriate.
\r
3697 * Full case mappings are applied by the case mapping functions
\r
3698 * that take String parameters rather than code points (int).
\r
3699 * See also the User Guide chapter on C/POSIX migration:
\r
3700 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
3702 * @param ch code point whose uppercase is to be retrieved
\r
3703 * @return uppercase code point
\r
3706 public static int toUpperCase(int ch) {
\r
3707 return gCsp.toupper(ch);
\r
3710 // extra methods not in java.lang.Character --------------------------
\r
3713 * Determines if the code point is a supplementary character.
\r
3714 * A code point is a supplementary character if and only if it is greater
\r
3715 * than or equal to <a href=#SUPPLEMENTARY_MIN_VALUE>SUPPLEMENTARY_MIN_VALUE</a>
\r
3716 * @param ch code point to be determined if it is in the supplementary
\r
3718 * @return true if code point is a supplementary character
\r
3721 public static boolean isSupplementary(int ch)
\r
3723 return ch >= UCharacter.SUPPLEMENTARY_MIN_VALUE &&
\r
3724 ch <= UCharacter.MAX_VALUE;
\r
3728 * Determines if the code point is in the BMP plane.
\r
3729 * @param ch code point to be determined if it is not a supplementary
\r
3731 * @return true if code point is not a supplementary character
\r
3734 public static boolean isBMP(int ch)
\r
3736 return (ch >= 0 && ch <= LAST_CHAR_MASK_);
\r
3740 * Determines whether the specified code point is a printable character
\r
3741 * according to the Unicode standard.
\r
3742 * @param ch code point to be determined if it is printable
\r
3743 * @return true if the code point is a printable character
\r
3746 public static boolean isPrintable(int ch)
\r
3748 int cat = getType(ch);
\r
3749 // if props == 0, it will just fall through and return false
\r
3750 return (cat != UCharacterCategory.UNASSIGNED &&
\r
3751 cat != UCharacterCategory.CONTROL &&
\r
3752 cat != UCharacterCategory.FORMAT &&
\r
3753 cat != UCharacterCategory.PRIVATE_USE &&
\r
3754 cat != UCharacterCategory.SURROGATE &&
\r
3755 cat != UCharacterCategory.GENERAL_OTHER_TYPES);
\r
3759 * Determines whether the specified code point is of base form.
\r
3760 * A code point of base form does not graphically combine with preceding
\r
3761 * characters, and is neither a control nor a format character.
\r
3762 * @param ch code point to be determined if it is of base form
\r
3763 * @return true if the code point is of base form
\r
3766 public static boolean isBaseForm(int ch)
\r
3768 int cat = getType(ch);
\r
3769 // if props == 0, it will just fall through and return false
\r
3770 return cat == UCharacterCategory.DECIMAL_DIGIT_NUMBER ||
\r
3771 cat == UCharacterCategory.OTHER_NUMBER ||
\r
3772 cat == UCharacterCategory.LETTER_NUMBER ||
\r
3773 cat == UCharacterCategory.UPPERCASE_LETTER ||
\r
3774 cat == UCharacterCategory.LOWERCASE_LETTER ||
\r
3775 cat == UCharacterCategory.TITLECASE_LETTER ||
\r
3776 cat == UCharacterCategory.MODIFIER_LETTER ||
\r
3777 cat == UCharacterCategory.OTHER_LETTER ||
\r
3778 cat == UCharacterCategory.NON_SPACING_MARK ||
\r
3779 cat == UCharacterCategory.ENCLOSING_MARK ||
\r
3780 cat == UCharacterCategory.COMBINING_SPACING_MARK;
\r
3784 * Returns the Bidirection property of a code point.
\r
3785 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
\r
3787 * Result returned belongs to the interface
\r
3788 * <a href=UCharacterDirection.html>UCharacterDirection</a>
\r
3789 * @param ch the code point to be determined its direction
\r
3790 * @return direction constant from UCharacterDirection.
\r
3793 public static int getDirection(int ch)
\r
3795 return gBdp.getClass(ch);
\r
3799 * Determines whether the code point has the "mirrored" property.
\r
3800 * This property is set for characters that are commonly used in
\r
3801 * Right-To-Left contexts and need to be displayed with a "mirrored"
\r
3803 * @param ch code point whose mirror is to be determined
\r
3804 * @return true if the code point has the "mirrored" property
\r
3807 public static boolean isMirrored(int ch)
\r
3809 return gBdp.isMirrored(ch);
\r
3813 * Maps the specified code point to a "mirror-image" code point.
\r
3814 * For code points with the "mirrored" property, implementations sometimes
\r
3815 * need a "poor man's" mapping to another code point such that the default
\r
3816 * glyph may serve as the mirror-image of the default glyph of the
\r
3817 * specified code point.<br>
\r
3818 * This is useful for text conversion to and from codepages with visual
\r
3819 * order, and for displays without glyph selection capabilities.
\r
3820 * @param ch code point whose mirror is to be retrieved
\r
3821 * @return another code point that may serve as a mirror-image substitute,
\r
3822 * or ch itself if there is no such mapping or ch does not have the
\r
3823 * "mirrored" property
\r
3826 public static int getMirror(int ch)
\r
3828 return gBdp.getMirror(ch);
\r
3832 * Gets the combining class of the argument codepoint
\r
3833 * @param ch code point whose combining is to be retrieved
\r
3834 * @return the combining class of the codepoint
\r
3837 public static int getCombiningClass(int ch)
\r
3839 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
3840 throw new IllegalArgumentException("Codepoint out of bounds");
\r
3842 return NormalizerImpl.getCombiningClass(ch);
\r
3846 * A code point is illegal if and only if
\r
3848 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
\r
3849 * <li> A surrogate value, 0xD800 to 0xDFFF
\r
3850 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
\r
3852 * Note: legal does not mean that it is assigned in this version of Unicode.
\r
3853 * @param ch code point to determine if it is a legal code point by itself
\r
3854 * @return true if and only if legal.
\r
3857 public static boolean isLegal(int ch)
\r
3859 if (ch < MIN_VALUE) {
\r
3862 if (ch < UTF16.SURROGATE_MIN_VALUE) {
\r
3865 if (ch <= UTF16.SURROGATE_MAX_VALUE) {
\r
3868 if (UCharacterUtility.isNonCharacter(ch)) {
\r
3871 return (ch <= MAX_VALUE);
\r
3875 * A string is legal iff all its code points are legal.
\r
3876 * A code point is illegal if and only if
\r
3878 * <li> Out of bounds, less than 0 or greater than UCharacter.MAX_VALUE
\r
3879 * <li> A surrogate value, 0xD800 to 0xDFFF
\r
3880 * <li> Not-a-character, having the form 0x xxFFFF or 0x xxFFFE
\r
3882 * Note: legal does not mean that it is assigned in this version of Unicode.
\r
3883 * @param str containing code points to examine
\r
3884 * @return true if and only if legal.
\r
3887 public static boolean isLegal(String str)
\r
3889 int size = str.length();
\r
3891 for (int i = 0; i < size; i ++)
\r
3893 codepoint = UTF16.charAt(str, i);
\r
3894 if (!isLegal(codepoint)) {
\r
3897 if (isSupplementary(codepoint)) {
\r
3905 * Gets the version of Unicode data used.
\r
3906 * @return the unicode version number used
\r
3909 public static VersionInfo getUnicodeVersion()
\r
3911 return PROPERTY_.m_unicodeVersion_;
\r
3915 * Retrieve the most current Unicode name of the argument code point, or
\r
3916 * null if the character is unassigned or outside the range
\r
3917 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
\r
3919 * Note calling any methods related to code point names, e.g. get*Name*()
\r
3920 * incurs a one-time initialisation cost to construct the name tables.
\r
3921 * @param ch the code point for which to get the name
\r
3922 * @return most current Unicode name
\r
3925 public static String getName(int ch)
\r
3928 throw new MissingResourceException("Could not load unames.icu","","");
\r
3930 return NAME_.getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
3934 * Gets the names for each of the characters in a string
\r
3935 * @param s string to format
\r
3936 * @param separator string to go between names
\r
3937 * @return string of names
\r
3940 public static String getName(String s, String separator) {
\r
3941 if (s.length() == 1) { // handle common case
\r
3942 return getName(s.charAt(0));
\r
3945 StringBuffer sb = new StringBuffer();
\r
3946 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
\r
3947 cp = UTF16.charAt(s,i);
\r
3948 if (i != 0) sb.append(separator);
\r
3949 sb.append(UCharacter.getName(cp));
\r
3951 return sb.toString();
\r
3955 * Retrieve the earlier version 1.0 Unicode name of the argument code
\r
3956 * point, or null if the character is unassigned or outside the range
\r
3957 * UCharacter.MIN_VALUE and UCharacter.MAX_VALUE or does not have a name.
\r
3959 * Note calling any methods related to code point names, e.g. get*Name*()
\r
3960 * incurs a one-time initialisation cost to construct the name tables.
\r
3961 * @param ch the code point for which to get the name
\r
3962 * @return version 1.0 Unicode name
\r
3965 public static String getName1_0(int ch)
\r
3968 throw new MissingResourceException("Could not load unames.icu","","");
\r
3970 return NAME_.getName(ch,
\r
3971 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
3975 * <p>Retrieves a name for a valid codepoint. Unlike getName(int) and
\r
3976 * getName1_0(int), this method will return a name even for codepoints that
\r
3977 * are not assigned a name in UnicodeData.txt.
\r
3979 * The names are returned in the following order.
\r
3981 * <li> Most current Unicode name if there is any
\r
3982 * <li> Unicode 1.0 name if there is any
\r
3983 * <li> Extended name in the form of
\r
3984 * "&lt;codepoint_type-codepoint_hex_digits&gt;". E.g. &lt;noncharacter-fffe&gt;
\r
3986 * Note calling any methods related to code point names, e.g. get*Name*()
\r
3987 * incurs a one-time initialisation cost to construct the name tables.
\r
3988 * @param ch the code point for which to get the name
\r
3989 * @return a name for the argument codepoint
\r
3992 public static String getExtendedName(int ch)
\r
3995 throw new MissingResourceException("Could not load unames.icu","","");
\r
3997 return NAME_.getName(ch, UCharacterNameChoice.EXTENDED_CHAR_NAME);
\r
4001 * Get the ISO 10646 comment for a character.
\r
4002 * The ISO 10646 comment is an informative field in the Unicode Character
\r
4003 * Database (UnicodeData.txt field 11) and is from the ISO 10646 names list.
\r
4004 * @param ch The code point for which to get the ISO comment.
\r
4005 * It must be <code>0&lt;=ch&lt;=0x10ffff</code>.
\r
4006 * @return The ISO comment, or null if there is no comment for this
\r
4010 public static String getISOComment(int ch)
\r
4012 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE) {
\r
4016 throw new MissingResourceException("Could not load unames.icu","","");
\r
4018 String result = NAME_.getGroupName(ch,
\r
4019 UCharacterNameChoice.ISO_COMMENT_);
\r
4024 * <p>Find a Unicode code point by its most current Unicode name and
\r
4025 * return its code point value. All Unicode names are in uppercase.</p>
\r
4026 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4027 * incurs a one-time initialisation cost to construct the name tables.
\r
4028 * @param name most current Unicode character name whose code point is to
\r
4030 * @return code point or -1 if name is not found
\r
4033 public static int getCharFromName(String name)
\r
4036 throw new MissingResourceException("Could not load unames.icu","","");
\r
4038 return NAME_.getCharFromName(
\r
4039 UCharacterNameChoice.UNICODE_CHAR_NAME, name);
\r
4043 * <p>Find a Unicode character by its version 1.0 Unicode name and return
\r
4044 * its code point value. All Unicode names are in uppercase.</p>
\r
4045 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4046 * incurs a one-time initialisation cost to construct the name tables.
\r
4047 * @param name Unicode 1.0 code point name whose code point is to
\r
4049 * @return code point or -1 if name is not found
\r
4052 public static int getCharFromName1_0(String name)
\r
4055 throw new MissingResourceException("Could not load unames.icu","","");
\r
4057 return NAME_.getCharFromName(
\r
4058 UCharacterNameChoice.UNICODE_10_CHAR_NAME, name);
\r
4062 * <p>Find a Unicode character by any of its names and return its code
\r
4063 * point value. All Unicode names are in uppercase.
\r
4064 * Extended names are all lowercase except for numbers and are contained
\r
4065 * within angle brackets.</p>
\r
4066 * The names are searched in the following order
\r
4068 * <li> Most current Unicode name if there is any
\r
4069 * <li> Unicode 1.0 name if there is any
\r
4070 * <li> Extended name in the form of
\r
4071 * "&lt;codepoint_type-codepoint_hex_digits&gt;". E.g. &lt;noncharacter-FFFE&gt;
\r
4073 * Note calling any methods related to code point names, e.g. get*Name*()
\r
4074 * incurs a one-time initialisation cost to construct the name tables.
\r
4075 * @param name codepoint name
\r
4076 * @return code point associated with the name or -1 if the name is not
\r
4080 public static int getCharFromExtendedName(String name)
\r
4083 throw new MissingResourceException("Could not load unames.icu","","");
\r
4085 return NAME_.getCharFromName(
\r
4086 UCharacterNameChoice.EXTENDED_CHAR_NAME, name);
\r
4090 * Return the Unicode name for a given property, as given in the
\r
4091 * Unicode database file PropertyAliases.txt. Most properties
\r
4092 * have more than one name. The nameChoice determines which one
\r
4095 * In addition, this function maps the property
\r
4096 * UProperty.GENERAL_CATEGORY_MASK to the synthetic names "gcm" /
\r
4097 * "General_Category_Mask". These names are not in
\r
4098 * PropertyAliases.txt.
\r
4100 * @param property UProperty selector.
\r
4102 * @param nameChoice UProperty.NameChoice selector for which name
\r
4103 * to get. All properties have a long name. Most have a short
\r
4104 * name, but some do not. Unicode allows for additional names; if
\r
4105 * present these will be returned by UProperty.NameChoice.LONG + i,
\r
4106 * where i=1, 2,...
\r
4108 * @return a name, or null if Unicode explicitly defines no name
\r
4109 * ("n/a") for a given property/nameChoice. If a given nameChoice
\r
4110 * throws an exception, then all larger values of nameChoice will
\r
4111 * throw an exception. If null is returned for a given
\r
4112 * nameChoice, then other nameChoice values may return non-null
\r
4115 * @exception IllegalArgumentException thrown if property or
\r
4116 * nameChoice are invalid.
\r
4119 * @see UProperty.NameChoice
\r
4122 public static String getPropertyName(int property,
\r
4124 return PNAMES_.getPropertyName(property, nameChoice);
\r
4128 * Return the UProperty selector for a given property name, as
\r
4129 * specified in the Unicode database file PropertyAliases.txt.
\r
4130 * Short, long, and any other variants are recognized.
\r
4132 * In addition, this function maps the synthetic names "gcm" /
\r
4133 * "General_Category_Mask" to the property
\r
4134 * UProperty.GENERAL_CATEGORY_MASK. These names are not in
\r
4135 * PropertyAliases.txt.
\r
4137 * @param propertyAlias the property name to be matched. The name
\r
4138 * is compared using "loose matching" as described in
\r
4139 * PropertyAliases.txt.
\r
4141 * @return a UProperty enum.
\r
4143 * @exception IllegalArgumentException thrown if propertyAlias
\r
4144 * is not recognized.
\r
4149 public static int getPropertyEnum(String propertyAlias) {
\r
4150 return PNAMES_.getPropertyEnum(propertyAlias);
\r
4154 * Return the Unicode name for a given property value, as given in
\r
4155 * the Unicode database file PropertyValueAliases.txt. Most
\r
4156 * values have more than one name. The nameChoice determines
\r
4157 * which one is returned.
\r
4159 * Note: Some of the names in PropertyValueAliases.txt can only be
\r
4160 * retrieved using UProperty.GENERAL_CATEGORY_MASK, not
\r
4161 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
\r
4162 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
\r
4163 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
\r
4165 * @param property UProperty selector constant.
\r
4166 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
\r
4167 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
\r
4168 * UProperty.MASK_START <= property < UProperty.MASK_LIMIT.
\r
4169 * If out of range, null is returned.
\r
4171 * @param value selector for a value for the given property. In
\r
4172 * general, valid values range from 0 up to some maximum. There
\r
4173 * are a few exceptions: (1.) UProperty.BLOCK values begin at the
\r
4174 * non-zero value BASIC_LATIN.getID(). (2.)
\r
4175 * UProperty.CANONICAL_COMBINING_CLASS values are not contiguous
\r
4176 * and range from 0..240. (3.) UProperty.GENERAL_CATEGORY_MASK values
\r
4177 * are mask values produced by left-shifting 1 by
\r
4178 * UCharacter.getType(). This allows grouped categories such as
\r
4179 * [:L:] to be represented. Mask values are non-contiguous.
\r
4181 * @param nameChoice UProperty.NameChoice selector for which name
\r
4182 * to get. All values have a long name. Most have a short name,
\r
4183 * but some do not. Unicode allows for additional names; if
\r
4184 * present these will be returned by UProperty.NameChoice.LONG + i,
\r
4185 * where i=1, 2,...
\r
4187 * @return a name, or null if Unicode explicitly defines no name
\r
4188 * ("n/a") for a given property/value/nameChoice. If a given
\r
4189 * nameChoice throws an exception, then all larger values of
\r
4190 * nameChoice will throw an exception. If null is returned for a
\r
4191 * given nameChoice, then other nameChoice values may return
\r
4192 * non-null results.
\r
4194 * @exception IllegalArgumentException thrown if property, value,
\r
4195 * or nameChoice are invalid.
\r
4198 * @see UProperty.NameChoice
\r
4201 public static String getPropertyValueName(int property,
\r
4205 if ((property == UProperty.CANONICAL_COMBINING_CLASS
\r
4206 || property == UProperty.LEAD_CANONICAL_COMBINING_CLASS
\r
4207 || property == UProperty.TRAIL_CANONICAL_COMBINING_CLASS)
\r
4208 && value >= UCharacter.getIntPropertyMinValue(
\r
4209 UProperty.CANONICAL_COMBINING_CLASS)
\r
4210 && value <= UCharacter.getIntPropertyMaxValue(
\r
4211 UProperty.CANONICAL_COMBINING_CLASS)
\r
4212 && nameChoice >= 0 && nameChoice < UProperty.NameChoice.COUNT) {
\r
4213 // this is hard coded for the valid cc
\r
4214 // because PropertyValueAliases.txt does not contain all of them
\r
4216 return PNAMES_.getPropertyValueName(property, value,
\r
4219 catch (IllegalArgumentException e) {
\r
4223 return PNAMES_.getPropertyValueName(property, value, nameChoice);
\r
4227 * Return the property value integer for a given value name, as
\r
4228 * specified in the Unicode database file PropertyValueAliases.txt.
\r
4229 * Short, long, and any other variants are recognized.
\r
4231 * Note: Some of the names in PropertyValueAliases.txt will only be
\r
4232 * recognized with UProperty.GENERAL_CATEGORY_MASK, not
\r
4233 * UProperty.GENERAL_CATEGORY. These include: "C" / "Other", "L" /
\r
4234 * "Letter", "LC" / "Cased_Letter", "M" / "Mark", "N" / "Number", "P"
\r
4235 * / "Punctuation", "S" / "Symbol", and "Z" / "Separator".
\r
4237 * @param property UProperty selector constant.
\r
4238 * UProperty.INT_START <= property < UProperty.INT_LIMIT or
\r
4239 * UProperty.BINARY_START <= property < UProperty.BINARY_LIMIT or
\r
4240 * UProperty.MASK_START <= property < UProperty.MASK_LIMIT.
\r
4241 * Only these properties can be enumerated.
\r
4243 * @param valueAlias the value name to be matched. The name is
\r
4244 * compared using "loose matching" as described in
\r
4245 * PropertyValueAliases.txt.
\r
4247 * @return a value integer. Note: UProperty.GENERAL_CATEGORY_MASK
\r
4248 * values are mask values produced by left-shifting 1 by
\r
4249 * UCharacter.getType(). This allows grouped categories such as
\r
4250 * [:L:] to be represented.
\r
4253 * @throws IllegalArgumentException if property is not a valid UProperty
\r
4257 public static int getPropertyValueEnum(int property, String valueAlias) {
\r
4258 return PNAMES_.getPropertyValueEnum(property, valueAlias);
\r
4262 * Returns a code point corresponding to the two UTF16 characters.
\r
4263 * @param lead the lead char
\r
4264 * @param trail the trail char
\r
4265 * @return code point if surrogate characters are valid.
\r
4266 * @exception IllegalArgumentException thrown when argument characters do
\r
4267 * not form a valid codepoint
\r
4270 public static int getCodePoint(char lead, char trail)
\r
4272 if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
\r
4273 return UCharacterProperty.getRawSupplementary(lead, trail);
\r
4275 throw new IllegalArgumentException("Illegal surrogate characters");
\r
4279 * Returns the code point corresponding to the UTF16 character.
\r
4280 * @param char16 the UTF16 character
\r
4281 * @return code point if argument is a valid character.
\r
4282 * @exception IllegalArgumentException thrown when char16 is not a valid
\r
4286 public static int getCodePoint(char char16)
\r
4288 if (UCharacter.isLegal(char16)) {
\r
4291 throw new IllegalArgumentException("Illegal codepoint");
\r
4295 * Implementation of UCaseProps.ContextIterator, iterates over a String.
\r
4296 * See ustrcase.c/utf16_caseContextIterator().
\r
4298 private static class StringContextIterator implements UCaseProps.ContextIterator {
\r
4301 * @param s String to iterate over.
\r
4303 StringContextIterator(String s) {
\r
4306 cpStart=cpLimit=index=0;
\r
4311 * Set the iteration limit for nextCaseMapCP() to an index within the string.
\r
4312 * If the limit parameter is negative or past the string, then the
\r
4313 * string length is restored as the iteration limit.
\r
4315 * This limit does not affect the next() function which always
\r
4316 * iterates to the very end of the string.
\r
4318 * @param lim The iteration limit.
\r
4320 public void setLimit(int lim) {
\r
4321 if(0<=lim && lim<=s.length()) {
\r
4329 * Move to the iteration limit without fetching code points up to there.
\r
4331 public void moveToLimit() {
\r
4332 cpStart=cpLimit=limit;
\r
4336 * Iterate forward through the string to fetch the next code point
\r
4337 * to be case-mapped, and set the context indexes for it.
\r
4338 * Performance optimization, to save on function calls and redundant
\r
4339 * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
\r
4341 * When the iteration limit is reached (and -1 is returned),
\r
4342 * getCPStart() will be at the iteration limit.
\r
4344 * Iteration with next() does not affect the position for nextCaseMapCP().
\r
4346 * @return The next code point to be case-mapped, or <0 when the iteration is done.
\r
4348 public int nextCaseMapCP() {
\r
4350 if(cpLimit<limit) {
\r
4351 int c=s.charAt(cpLimit++);
\r
4352 if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
\r
4354 if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
\r
4355 UTF16.TRAIL_SURROGATE_MIN_VALUE<=(c2=s.charAt(cpLimit)) && c2<=UTF16.TRAIL_SURROGATE_MAX_VALUE
\r
4357 // supplementary code point
\r
4359 c=UCharacterProperty.getRawSupplementary((char)c, c2);
\r
4360 // else unpaired surrogate code point
\r
4362 // else BMP code point
\r
4371 * Get the start of the code point that was last returned
\r
4372 * by nextCaseMapCP().
\r
4374 public int getCPStart() {
\r
4379 * Get the limit of the code point that was last returned
\r
4380 * by nextCaseMapCP().
\r
4382 public int getCPLimit() {
\r
4386 // implement UCaseProps.ContextIterator
\r
4387 public void reset(int direction) {
\r
4389 /* reset for forward iteration */
\r
4392 } else if(direction<0) {
\r
4393 /* reset for backward iteration */
\r
4397 // not a valid direction
\r
4403 public int next() {
\r
4406 if(dir>0 && index<s.length()) {
\r
4407 c=UTF16.charAt(s, index);
\r
4408 index+=UTF16.getCharCount(c);
\r
4410 } else if(dir<0 && index>0) {
\r
4411 c=UTF16.charAt(s, index-1);
\r
4412 index-=UTF16.getCharCount(c);
\r
4419 protected String s;
\r
4420 protected int index, limit, cpStart, cpLimit;
\r
4421 protected int dir; // 0=initial state >0=forward <0=backward
\r
4425 * Gets uppercase version of the argument string.
\r
4426 * Casing is dependent on the default locale and context-sensitive.
\r
4427 * @param str source string to be performed on
\r
4428 * @return uppercase version of the argument string
\r
4431 public static String toUpperCase(String str)
\r
4433 return toUpperCase(ULocale.getDefault(), str);
\r
4437 * Gets lowercase version of the argument string.
\r
4438 * Casing is dependent on the default locale and context-sensitive
\r
4439 * @param str source string to be performed on
\r
4440 * @return lowercase version of the argument string
\r
4443 public static String toLowerCase(String str)
\r
4445 return toLowerCase(ULocale.getDefault(), str);
\r
4449 * <p>Gets the titlecase version of the argument string.</p>
\r
4450 * <p>Position for titlecasing is determined by the argument break
\r
4451 * iterator, hence the user can customize his break iterator for
\r
4452 * a specialized titlecasing. In this case only the forward iteration
\r
4453 * needs to be implemented.
\r
4454 * If the break iterator passed in is null, the default Unicode algorithm
\r
4455 * will be used to determine the titlecase positions.
\r
4457 * <p>Only positions returned by the break iterator will be title cased,
\r
4458 * characters in between the positions will all be in lower case.</p>
\r
4459 * <p>Casing is dependent on the default locale and context-sensitive</p>
\r
4460 * @param str source string to be performed on
\r
4461 * @param breakiter break iterator to determine the positions in which
\r
4462 * the character should be title cased.
\r
4463 * @return titlecase version of the argument string
\r
4466 public static String toTitleCase(String str, BreakIterator breakiter)
\r
4468 return toTitleCase(ULocale.getDefault(), str, breakiter);
\r
4472 * Gets uppercase version of the argument string.
\r
4473 * Casing is dependent on the argument locale and context-sensitive.
\r
4474 * @param locale the locale in which the string is to be converted
\r
4475 * @param str source string to be performed on
\r
4476 * @return uppercase version of the argument string
\r
4479 public static String toUpperCase(Locale locale, String str)
\r
4481 return toUpperCase(ULocale.forLocale(locale), str);
\r
4485 * Gets uppercase version of the argument string.
\r
4486 * Casing is dependent on the argument locale and context-sensitive.
\r
4487 * @param locale the locale in which the string is to be converted
\r
4488 * @param str source string to be performed on
\r
4489 * @return uppercase version of the argument string
\r
4492 public static String toUpperCase(ULocale locale, String str) {
\r
4493 StringContextIterator iter = new StringContextIterator(str);
\r
4494 StringBuffer result = new StringBuffer(str.length());
\r
4495 int[] locCache = new int[1];
\r
4498 if (locale == null) {
\r
4499 locale = ULocale.getDefault();
\r
4503 while((c=iter.nextCaseMapCP())>=0) {
\r
4504 c=gCsp.toFullUpper(c, iter, result, locale, locCache);
\r
4506 /* decode the result */
\r
4508 /* (not) original code point */
\r
4510 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
4511 /* mapping already appended to result */
\r
4513 /* } else { append single-code point mapping */
\r
4516 result.append((char)c);
\r
4518 UTF16.append(result, c);
\r
4521 return result.toString();
\r
4525 * Gets lowercase version of the argument string.
\r
4526 * Casing is dependent on the argument locale and context-sensitive
\r
4527 * @param locale the locale in which the string is to be converted
\r
4528 * @param str source string to be performed on
\r
4529 * @return lowercase version of the argument string
\r
4532 public static String toLowerCase(Locale locale, String str)
\r
4534 return toLowerCase(ULocale.forLocale(locale), str);
\r
4538 * Gets lowercase version of the argument string.
\r
4539 * Casing is dependent on the argument locale and context-sensitive
\r
4540 * @param locale the locale in which the string is to be converted
\r
4541 * @param str source string to be performed on
\r
4542 * @return lowercase version of the argument string
\r
4545 public static String toLowerCase(ULocale locale, String str) {
\r
4546 StringContextIterator iter = new StringContextIterator(str);
\r
4547 StringBuffer result = new StringBuffer(str.length());
\r
4548 int[] locCache = new int[1];
\r
4551 if (locale == null) {
\r
4552 locale = ULocale.getDefault();
\r
4556 while((c=iter.nextCaseMapCP())>=0) {
\r
4557 c=gCsp.toFullLower(c, iter, result, locale, locCache);
\r
4559 /* decode the result */
\r
4561 /* (not) original code point */
\r
4563 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
4564 /* mapping already appended to result */
\r
4566 /* } else { append single-code point mapping */
\r
4569 result.append((char)c);
\r
4571 UTF16.append(result, c);
\r
4574 return result.toString();
\r
4578 * <p>Gets the titlecase version of the argument string.</p>
\r
4579 * <p>Position for titlecasing is determined by the argument break
\r
4580 * iterator, hence the user can customize his break iterator for
\r
4581 * a specialized titlecasing. In this case only the forward iteration
\r
4582 * needs to be implemented.
\r
4583 * If the break iterator passed in is null, the default Unicode algorithm
\r
4584 * will be used to determine the titlecase positions.
\r
4586 * <p>Only positions returned by the break iterator will be title cased,
\r
4587 * characters in between the positions will all be in lower case.</p>
\r
4588 * <p>Casing is dependent on the argument locale and context-sensitive</p>
\r
4589 * @param locale the locale in which the string is to be converted
\r
4590 * @param str source string to be performed on
\r
4591 * @param breakiter break iterator to determine the positions in which
\r
4592 * the character should be title cased.
\r
4593 * @return titlecase version of the argument string
\r
4596 public static String toTitleCase(Locale locale, String str,
\r
4597 BreakIterator breakiter)
\r
4599 return toTitleCase(ULocale.forLocale(locale), str, breakiter);
\r
4603 * <p>Gets the titlecase version of the argument string.</p>
\r
4604 * <p>Position for titlecasing is determined by the argument break
\r
4605 * iterator, hence the user can customize his break iterator for
\r
4606 * a specialized titlecasing. In this case only the forward iteration
\r
4607 * needs to be implemented.
\r
4608 * If the break iterator passed in is null, the default Unicode algorithm
\r
4609 * will be used to determine the titlecase positions.
\r
4611 * <p>Only positions returned by the break iterator will be title cased,
\r
4612 * characters in between the positions will all be in lower case.</p>
\r
4613 * <p>Casing is dependent on the argument locale and context-sensitive</p>
\r
4614 * @param locale the locale in which the string is to be converted
\r
4615 * @param str source string to be performed on
\r
4616 * @param titleIter break iterator to determine the positions in which
\r
4617 * the character should be title cased.
\r
4618 * @return titlecase version of the argument string
\r
4621 public static String toTitleCase(ULocale locale, String str,
\r
4622 BreakIterator titleIter) {
\r
4623 return toTitleCase(locale, str, titleIter, 0);
\r
4627 * <p>Gets the titlecase version of the argument string.</p>
\r
4628 * <p>Position for titlecasing is determined by the argument break
\r
4629 * iterator, hence the user can customize his break iterator for
\r
4630 * a specialized titlecasing. In this case only the forward iteration
\r
4631 * needs to be implemented.
\r
4632 * If the break iterator passed in is null, the default Unicode algorithm
\r
4633 * will be used to determine the titlecase positions.
\r
4635 * <p>Only positions returned by the break iterator will be title cased,
\r
4636 * characters in between the positions will all be in lower case.</p>
\r
4637 * <p>Casing is dependent on the argument locale and context-sensitive</p>
\r
4638 * @param locale the locale in which the string is to be converted
\r
4639 * @param str source string to be performed on
\r
4640 * @param titleIter break iterator to determine the positions in which
\r
4641 * the character should be title cased.
\r
4642 * @param options bit set to modify the titlecasing operation
\r
4643 * @return titlecase version of the argument string
\r
4645 * @see #TITLECASE_NO_LOWERCASE
\r
4646 * @see #TITLECASE_NO_BREAK_ADJUSTMENT
\r
4648 public static String toTitleCase(ULocale locale, String str,
\r
4649 BreakIterator titleIter,
\r
4651 StringContextIterator iter = new StringContextIterator(str);
\r
4652 StringBuffer result = new StringBuffer(str.length());
\r
4653 int[] locCache = new int[1];
\r
4654 int c, nc, srcLength = str.length();
\r
4656 if (locale == null) {
\r
4657 locale = ULocale.getDefault();
\r
4661 if(titleIter == null) {
\r
4662 titleIter = BreakIterator.getWordInstance(locale);
\r
4664 titleIter.setText(str);
\r
4666 int prev, titleStart, index;
\r
4667 boolean isFirstIndex;
\r
4668 boolean isDutch = locale.getLanguage().equals("nl");
\r
4669 boolean FirstIJ = true;
\r
4671 /* set up local variables */
\r
4673 isFirstIndex=true;
\r
4675 /* titlecasing loop */
\r
4676 while(prev<srcLength) {
\r
4677 /* find next index where to titlecase */
\r
4678 if(isFirstIndex) {
\r
4679 isFirstIndex=false;
\r
4680 index=titleIter.first();
\r
4682 index=titleIter.next();
\r
4684 if(index==BreakIterator.DONE || index>srcLength) {
\r
4689 * Unicode 4 & 5 section 3.13 Default Case Operations:
\r
4691 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
\r
4692 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
\r
4693 * cased character F. If F exists, map F to default_title(F); then map each
\r
4694 * subsequent character C to default_lower(C).
\r
4696 * In this implementation, segment [prev..index[ into 3 parts:
\r
4697 * a) uncased characters (copy as-is) [prev..titleStart[
\r
4698 * b) first case letter (titlecase) [titleStart..titleLimit[
\r
4699 * c) subsequent characters (lowercase) [titleLimit..index[
\r
4702 /* find and copy uncased characters [prev..titleStart[ */
\r
4703 iter.setLimit(index);
\r
4704 c=iter.nextCaseMapCP();
\r
4705 if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCaseProps.NONE==gCsp.getType(c)) {
\r
4706 while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
\r
4707 titleStart=iter.getCPStart();
\r
4708 if(prev<titleStart) {
\r
4709 // TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
\r
4710 result.append(str.substring(prev, titleStart));
\r
4716 if(titleStart<index) {
\r
4718 /* titlecase c which is from titleStart */
\r
4719 c=gCsp.toFullTitle(c, iter, result, locale, locCache);
\r
4721 /* decode the result and lowercase up to index */
\r
4724 /* (not) original code point */
\r
4727 result.append((char)c);
\r
4729 UTF16.append(result, c);
\r
4731 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
4732 /* mapping already appended to result */
\r
4734 /* append single-code point mapping */
\r
4736 result.append((char)c);
\r
4738 UTF16.append(result, c);
\r
4742 if((options&TITLECASE_NO_LOWERCASE)!=0) {
\r
4743 /* Optionally just copy the rest of the word unchanged. */
\r
4745 int titleLimit=iter.getCPLimit();
\r
4746 if(titleLimit<index) {
\r
4747 // TODO: With Java 5, this would want to be result.append(str, titleLimit, index);
\r
4748 String appendStr = str.substring(titleLimit,index);
\r
4749 /* Special Case - Dutch IJ Titlecasing */
\r
4750 if ( isDutch && c == 0x0049 && appendStr.startsWith("j")) {
\r
4751 appendStr = "J" + appendStr.substring(1);
\r
4753 result.append(appendStr);
\r
4754 iter.moveToLimit();
\r
4757 } else if((nc=iter.nextCaseMapCP())>=0) {
\r
4758 if ( isDutch && ( nc == 0x004A || nc == 0x006A ) && ( c == 0x0049 ) && ( FirstIJ == true )) {
\r
4759 c = 0x004A; /* J */
\r
4762 /* Normal operation: Lowercase the rest of the word. */
\r
4763 c=gCsp.toFullLower(nc, iter, result, locale, locCache);
\r
4774 return result.toString();
\r
4778 * The given character is mapped to its case folding equivalent according
\r
4779 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
\r
4780 * folding equivalent, the character itself is returned.
\r
4782 * <p>This function only returns the simple, single-code point case mapping.
\r
4783 * Full case mappings should be used whenever possible because they produce
\r
4784 * better results by working on whole strings.
\r
4785 * They can map to a result string with a different length as appropriate.
\r
4786 * Full case mappings are applied by the case mapping functions
\r
4787 * that take String parameters rather than code points (int).
\r
4788 * See also the User Guide chapter on C/POSIX migration:
\r
4789 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
4791 * @param ch the character to be converted
\r
4792 * @param defaultmapping Indicates if all mappings defined in
\r
4793 * CaseFolding.txt are to be used, otherwise the
\r
4794 * mappings for dotted I and dotless i marked with
\r
4795 * 'I' in CaseFolding.txt will be skipped.
\r
4796 * @return the case folding equivalent of the character, if
\r
4797 * any; otherwise the character itself.
\r
4798 * @see #foldCase(String, boolean)
\r
4801 public static int foldCase(int ch, boolean defaultmapping) {
\r
4802 return foldCase(ch, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
4806 * The given string is mapped to its case folding equivalent according to
\r
4807 * UnicodeData.txt and CaseFolding.txt; if any character has no case
\r
4808 * folding equivalent, the character itself is returned.
\r
4809 * "Full", multiple-code point case folding mappings are returned here.
\r
4810 * For "simple" single-code point mappings use the API
\r
4811 * foldCase(int ch, boolean defaultmapping).
\r
4812 * @param str the String to be converted
\r
4813 * @param defaultmapping Indicates if all mappings defined in
\r
4814 * CaseFolding.txt is to be used, otherwise the
\r
4815 * mappings for dotted I and dotless i marked with
\r
4816 * 'I' in CaseFolding.txt will be skipped.
\r
4817 * @return the case folding equivalent of the character, if
\r
4818 * any; otherwise the character itself.
\r
4819 * @see #foldCase(int, boolean)
\r
4822 public static String foldCase(String str, boolean defaultmapping) {
\r
4823 return foldCase(str, defaultmapping ? FOLD_CASE_DEFAULT : FOLD_CASE_EXCLUDE_SPECIAL_I);
\r
/**
 * Option value for case folding: use the default mappings defined in
 * CaseFolding.txt. Accepted by the foldCase overloads that take an
 * {@code int options} argument.
 */
public static final int FOLD_CASE_DEFAULT = 0x0000;

/**
 * Option value for case folding: exclude the mappings for dotted I
 * and dotless i that are marked with 'I' in CaseFolding.txt. Accepted by
 * the foldCase overloads that take an {@code int options} argument.
 */
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0x0001;
\r
4839 * The given character is mapped to its case folding equivalent according
\r
4840 * to UnicodeData.txt and CaseFolding.txt; if the character has no case
\r
4841 * folding equivalent, the character itself is returned.
\r
4843 * <p>This function only returns the simple, single-code point case mapping.
\r
4844 * Full case mappings should be used whenever possible because they produce
\r
4845 * better results by working on whole strings.
\r
4846 * They can map to a result string with a different length as appropriate.
\r
4847 * Full case mappings are applied by the case mapping functions
\r
4848 * that take String parameters rather than code points (int).
\r
4849 * See also the User Guide chapter on C/POSIX migration:
\r
4850 * http://www.icu-project.org/userguide/posix.html#case_mappings
\r
4852 * @param ch the character to be converted
\r
4853 * @param options A bit set for special processing. Currently the recognised options are
\r
4854 * FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
\r
4855 * @return the case folding equivalent of the character, if
\r
4856 * any; otherwise the character itself.
\r
4857 * @see #foldCase(String, boolean)
\r
4860 public static int foldCase(int ch, int options) {
\r
4861 return gCsp.fold(ch, options);
\r
4865 * The given string is mapped to its case folding equivalent according to
\r
4866 * UnicodeData.txt and CaseFolding.txt; if any character has no case
\r
4867 * folding equivalent, the character itself is returned.
\r
4868 * "Full", multiple-code point case folding mappings are returned here.
\r
4869 * For "simple" single-code point mappings use the API
\r
4870 * foldCase(int ch, boolean defaultmapping).
\r
4871 * @param str the String to be converted
\r
4872 * @param options A bit set for special processing. Currently the recognised options are
\r
4873 * FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
\r
4874 * @return the case folding equivalent of the character, if
\r
4875 * any; otherwise the character itself.
\r
4876 * @see #foldCase(int, boolean)
\r
4879 public static final String foldCase(String str, int options) {
\r
4880 StringBuffer result = new StringBuffer(str.length());
\r
4883 length = str.length();
\r
4884 for(i=0; i<length;) {
\r
4885 c=UTF16.charAt(str, i);
\r
4886 i+=UTF16.getCharCount(c);
\r
4887 c=gCsp.toFullFolding(c, result, options);
\r
4889 /* decode the result */
\r
4891 /* (not) original code point */
\r
4893 } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
\r
4894 /* mapping already appended to result */
\r
4896 /* } else { append single-code point mapping */
\r
4899 result.append((char)c);
\r
4901 UTF16.append(result, c);
\r
4904 return result.toString();
\r
4908 * Return numeric value of Han code points.
\r
4909 * <br> This returns the value of Han 'numeric' code points,
\r
4910 * including those for zero, ten, hundred, thousand, ten thousand,
\r
4911 * and hundred million.
\r
4912 * This includes both the standard and 'checkwriting'
\r
4913 * characters, the 'big circle' zero character, and the standard
\r
4915 * @param ch code point to query
\r
4916 * @return value if it is a Han 'numeric character,' otherwise return -1.
\r
4919 public static int getHanNumericValue(int ch)
\r
4921 // TODO: Are these all covered by Unicode numeric value data?
\r
4924 case IDEOGRAPHIC_NUMBER_ZERO_ :
\r
4925 case CJK_IDEOGRAPH_COMPLEX_ZERO_ :
\r
4926 return 0; // Han Zero
\r
4927 case CJK_IDEOGRAPH_FIRST_ :
\r
4928 case CJK_IDEOGRAPH_COMPLEX_ONE_ :
\r
4929 return 1; // Han One
\r
4930 case CJK_IDEOGRAPH_SECOND_ :
\r
4931 case CJK_IDEOGRAPH_COMPLEX_TWO_ :
\r
4932 return 2; // Han Two
\r
4933 case CJK_IDEOGRAPH_THIRD_ :
\r
4934 case CJK_IDEOGRAPH_COMPLEX_THREE_ :
\r
4935 return 3; // Han Three
\r
4936 case CJK_IDEOGRAPH_FOURTH_ :
\r
4937 case CJK_IDEOGRAPH_COMPLEX_FOUR_ :
\r
4938 return 4; // Han Four
\r
4939 case CJK_IDEOGRAPH_FIFTH_ :
\r
4940 case CJK_IDEOGRAPH_COMPLEX_FIVE_ :
\r
4941 return 5; // Han Five
\r
4942 case CJK_IDEOGRAPH_SIXTH_ :
\r
4943 case CJK_IDEOGRAPH_COMPLEX_SIX_ :
\r
4944 return 6; // Han Six
\r
4945 case CJK_IDEOGRAPH_SEVENTH_ :
\r
4946 case CJK_IDEOGRAPH_COMPLEX_SEVEN_ :
\r
4947 return 7; // Han Seven
\r
4948 case CJK_IDEOGRAPH_EIGHTH_ :
\r
4949 case CJK_IDEOGRAPH_COMPLEX_EIGHT_ :
\r
4950 return 8; // Han Eight
\r
4951 case CJK_IDEOGRAPH_NINETH_ :
\r
4952 case CJK_IDEOGRAPH_COMPLEX_NINE_ :
\r
4953 return 9; // Han Nine
\r
4954 case CJK_IDEOGRAPH_TEN_ :
\r
4955 case CJK_IDEOGRAPH_COMPLEX_TEN_ :
\r
4957 case CJK_IDEOGRAPH_HUNDRED_ :
\r
4958 case CJK_IDEOGRAPH_COMPLEX_HUNDRED_ :
\r
4960 case CJK_IDEOGRAPH_THOUSAND_ :
\r
4961 case CJK_IDEOGRAPH_COMPLEX_THOUSAND_ :
\r
4963 case CJK_IDEOGRAPH_TEN_THOUSAND_ :
\r
4965 case CJK_IDEOGRAPH_HUNDRED_MILLION_ :
\r
4968 return -1; // no value
\r
4972 * <p>Gets an iterator for character types, iterating over codepoints.</p>
\r
4973 * Example of use:<br>
\r
4975 * RangeValueIterator iterator = UCharacter.getTypeIterator();
\r
4976 * RangeValueIterator.Element element = new RangeValueIterator.Element();
\r
4977 * while (iterator.next(element)) {
\r
4978 * System.out.println("Codepoint \\u" +
\r
4979 * Integer.toHexString(element.start) +
\r
4980 * " to codepoint \\u" +
\r
4981 * Integer.toHexString(element.limit - 1) +
\r
4982 * " has the character type " +
\r
4986 * @return an iterator
\r
4989 public static RangeValueIterator getTypeIterator()
\r
4991 return new UCharacterTypeIterator(PROPERTY_);
\r
4995 * <p>Gets an iterator for character names, iterating over codepoints.</p>
\r
4996 * <p>This API only gets the iterator for the modern, most up-to-date
\r
4997 * Unicode names. For older 1.0 Unicode names use get1_0NameIterator() or
\r
4998 * for extended names use getExtendedNameIterator().</p>
\r
4999 * Example of use:<br>
\r
5001 * ValueIterator iterator = UCharacter.getNameIterator();
\r
5002 * ValueIterator.Element element = new ValueIterator.Element();
\r
5003 * while (iterator.next(element)) {
\r
5004 * System.out.println("Codepoint \\u" +
\r
5005 * Integer.toHexString(element.codepoint) +
\r
5006 * " has the name " + (String)element.value);
\r
5009 * <p>The maximal range which the name iterator iterates is from
\r
5010 * UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
\r
5011 * @return an iterator
\r
5014 public static ValueIterator getNameIterator()
\r
5017 throw new RuntimeException("Could not load unames.icu");
\r
5019 return new UCharacterNameIterator(NAME_,
\r
5020 UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
5024 * <p>Gets an iterator for character names, iterating over codepoints.</p>
\r
5025 * <p>This API only gets the iterator for the older 1.0 Unicode names.
\r
5026 * For modern, most up-to-date Unicode names use getNameIterator() or
\r
5027 * for extended names use getExtendedNameIterator().</p>
\r
5028 * Example of use:<br>
\r
5030 * ValueIterator iterator = UCharacter.get1_0NameIterator();
\r
5031 * ValueIterator.Element element = new ValueIterator.Element();
\r
5032 * while (iterator.next(element)) {
\r
5033 * System.out.println("Codepoint \\u" +
\r
5034 * Integer.toHexString(element.codepoint) +
\r
5035 * " has the name " + (String)element.value);
\r
5038 * <p>The maximal range which the name iterator iterates is from
\r
5039 * @return an iterator
\r
5042 public static ValueIterator getName1_0Iterator()
\r
5045 throw new RuntimeException("Could not load unames.icu");
\r
5047 return new UCharacterNameIterator(NAME_,
\r
5048 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
5052 * <p>Gets an iterator for character names, iterating over codepoints.</p>
\r
5053 * <p>This API only gets the iterator for the extended names.
\r
5054 * For modern, most up-to-date Unicode names use getNameIterator() or
\r
5055 * for older 1.0 Unicode names use get1_0NameIterator().</p>
\r
5056 * Example of use:<br>
\r
5058 * ValueIterator iterator = UCharacter.getExtendedNameIterator();
\r
5059 * ValueIterator.Element element = new ValueIterator.Element();
\r
5060 * while (iterator.next(element)) {
\r
5061 * System.out.println("Codepoint \\u" +
\r
5062 * Integer.toHexString(element.codepoint) +
\r
5063 * " has the name " + (String)element.value);
\r
5066 * <p>The maximal range which the name iterator iterates is from
\r
5067 * @return an iterator
\r
5070 public static ValueIterator getExtendedNameIterator()
\r
5073 throw new MissingResourceException("Could not load unames.icu","","");
\r
5075 return new UCharacterNameIterator(NAME_,
\r
5076 UCharacterNameChoice.EXTENDED_CHAR_NAME);
\r
5080 * <p>Get the "age" of the code point.</p>
\r
5081 * <p>The "age" is the Unicode version when the code point was first
\r
5082 * designated (as a non-character or for Private Use) or assigned a
\r
5084 * <p>This can be useful to avoid emitting code points to receiving
\r
5085 * processes that do not accept newer characters.</p>
\r
5086 * <p>The data is from the UCD file DerivedAge.txt.</p>
\r
5087 * @param ch The code point.
\r
5088 * @return the Unicode version number
\r
5091 public static VersionInfo getAge(int ch)
\r
5093 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
5094 throw new IllegalArgumentException("Codepoint out of bounds");
\r
5096 return PROPERTY_.getAge(ch);
\r
5100 * <p>Check a binary Unicode property for a code point.</p>
\r
5101 * <p>Unicode, especially in version 3.2, defines many more properties
\r
5102 * than the original set in UnicodeData.txt.</p>
\r
5103 * <p>This API is intended to reflect Unicode properties as defined in
\r
5104 * the Unicode Character Database (UCD) and Unicode Technical Reports
\r
5106 * <p>For details about the properties see
\r
5107 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
\r
5108 * <p>For names of Unicode properties see the UCD file
\r
5109 * PropertyAliases.txt.</p>
\r
5110 * <p>This API does not check the validity of the codepoint.</p>
\r
5111 * <p>Important: If ICU is built with UCD files from Unicode versions
\r
5112 * below 3.2, then properties marked with "new" are not or
\r
5113 * not fully available.</p>
\r
5114 * @param ch code point to test.
\r
5115 * @param property selector constant from com.ibm.icu.lang.UProperty,
\r
5116 * identifies which binary property to check.
\r
5117 * @return true or false according to the binary Unicode property value
\r
5118 * for ch. Also false if property is out of bounds or if the
\r
5119 * Unicode version does not have data for the property at all, or
\r
5120 * not for this code point.
\r
5121 * @see com.ibm.icu.lang.UProperty
\r
5124 public static boolean hasBinaryProperty(int ch, int property)
\r
5126 if (ch < MIN_VALUE || ch > MAX_VALUE) {
\r
5127 throw new IllegalArgumentException("Codepoint out of bounds");
\r
5129 return PROPERTY_.hasBinaryProperty(ch, property);
\r
5133 * <p>Check if a code point has the Alphabetic Unicode property.</p>
\r
5134 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC).</p>
\r
5135 * <p>Different from UCharacter.isLetter(ch)!</p>
\r
5137 * @param ch codepoint to be tested
\r
5139 public static boolean isUAlphabetic(int ch)
\r
5141 return hasBinaryProperty(ch, UProperty.ALPHABETIC);
\r
5145 * <p>Check if a code point has the Lowercase Unicode property.</p>
\r
5146 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE).</p>
\r
5147 * <p>This is different from UCharacter.isLowerCase(ch)!</p>
\r
5148 * @param ch codepoint to be tested
\r
5151 public static boolean isULowercase(int ch)
\r
5153 return hasBinaryProperty(ch, UProperty.LOWERCASE);
\r
5157 * <p>Check if a code point has the Uppercase Unicode property.</p>
\r
5158 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE).</p>
\r
5159 * <p>This is different from UCharacter.isUpperCase(ch)!</p>
\r
5160 * @param ch codepoint to be tested
\r
5163 public static boolean isUUppercase(int ch)
\r
5165 return hasBinaryProperty(ch, UProperty.UPPERCASE);
\r
5169 * <p>Check if a code point has the White_Space Unicode property.</p>
\r
5170 * <p>Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE).</p>
\r
5171 * <p>This is different from both UCharacter.isSpace(ch) and
\r
5172 * UCharacter.isWhitespace(ch)!</p>
\r
5173 * @param ch codepoint to be tested
\r
5176 public static boolean isUWhiteSpace(int ch)
\r
5178 return hasBinaryProperty(ch, UProperty.WHITE_SPACE);
\r
/**
 * <p>Gets the property value for an Unicode property type of a code point.
 * Also returns binary and mask property values.</p>
 * <p>Unicode, especially in version 3.2, defines many more properties than
 * the original set in UnicodeData.txt.</p>
 * <p>The properties APIs are intended to reflect Unicode properties as
 * defined in the Unicode Character Database (UCD) and Unicode Technical
 * Reports (UTR). For details about the properties see
 * http://www.unicode.org/.</p>
 * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
 * </p>
 * Sample usage:
 * <pre>
 * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
 * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
 * boolean b = (ideo == 1) ? true : false;
 * </pre>
 * @param ch code point to test.
 * @param type UProperty selector constant, identifies which binary
 *        property to check. Must be
 *        UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
 *        UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
 *        UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
 * @return numeric value that is directly the property value or,
 *         for enumerated properties, corresponds to the numeric value of
 *         the enumerated constant of the respective property value
 *         enumeration type (cast to enum type if necessary).
 *         Returns 0 or 1 (for false / true) for binary Unicode properties.
 *         Returns a bit-mask for mask properties.
 *         Returns 0 if 'type' is out of bounds or if the Unicode version
 *         does not have data for the property at all, or not for this code
 *         point.
 * @see #hasBinaryProperty
 * @see #getIntPropertyMinValue
 * @see #getIntPropertyMaxValue
 * @see #getUnicodeVersion
 */
public static int getIntPropertyValue(int ch, int type)
{
    // The selector ranges are tested in ascending order, so each branch
    // only needs to check its own upper limit.
    if (type < UProperty.BINARY_START) {
        return 0; // undefined
    }
    else if (type < UProperty.BINARY_LIMIT) {
        // Binary properties are reported as 1 (true) or 0 (false).
        return hasBinaryProperty(ch, type) ? 1 : 0;
    }
    else if (type < UProperty.INT_START) {
        return 0; // undefined
    }
    else if (type < UProperty.INT_LIMIT) {
        switch (type) {
        case UProperty.BIDI_CLASS:
            return getDirection(ch);
        case UProperty.BLOCK:
            return UnicodeBlock.idOf(ch);
        case UProperty.CANONICAL_COMBINING_CLASS:
            return getCombiningClass(ch);
        case UProperty.DECOMPOSITION_TYPE:
            return PROPERTY_.getAdditional(ch, 2)
                   & DECOMPOSITION_TYPE_MASK_;
        case UProperty.EAST_ASIAN_WIDTH:
            return (PROPERTY_.getAdditional(ch, 0)
                    & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
        case UProperty.GENERAL_CATEGORY:
            return getType(ch);
        case UProperty.JOINING_GROUP:
            return gBdp.getJoiningGroup(ch);
        case UProperty.JOINING_TYPE:
            return gBdp.getJoiningType(ch);
        case UProperty.LINE_BREAK:
            return (int)(PROPERTY_.getAdditional(ch, LB_VWORD)& LB_MASK)>>LB_SHIFT;
        case UProperty.NUMERIC_TYPE:
            // 'type' is reused here as scratch storage for the result.
            type=getNumericType(PROPERTY_.getProperty(ch));
            if(type>NumericType.NUMERIC) {
                /* keep internal variants of NumericType.NUMERIC from becoming visible */
                type=NumericType.NUMERIC;
            }
            return type;
        case UProperty.SCRIPT:
            return UScript.getScript(ch);
        case UProperty.HANGUL_SYLLABLE_TYPE:
            /* purely algorithmic; hardcode known characters, check for assigned new ones */
            if(ch<NormalizerImpl.JAMO_L_BASE) {
                /* NA */
            } else if(ch<=0x11ff) {
                /* Jamo range */
                if(ch<=0x115f) {
                    /* Jamo L range, HANGUL CHOSEONG ... */
                    if(ch==0x115f || ch<=0x1159 || getType(ch)==UCharacterCategory.OTHER_LETTER) {
                        return HangulSyllableType.LEADING_JAMO;
                    }
                } else if(ch<=0x11a7) {
                    /* Jamo V range, HANGUL JUNGSEONG ... */
                    if(ch<=0x11a2 || getType(ch)==UCharacterCategory.OTHER_LETTER) {
                        return HangulSyllableType.VOWEL_JAMO;
                    }
                } else {
                    /* Jamo T range */
                    if(ch<=0x11f9 || getType(ch)==UCharacterCategory.OTHER_LETTER) {
                        return HangulSyllableType.TRAILING_JAMO;
                    }
                }
            // From here on 'ch' holds the offset from HANGUL_BASE, not the
            // code point; every remaining path of this case returns.
            } else if((ch-=NormalizerImpl.HANGUL_BASE)<0) {
                /* NA */
            } else if(ch<NormalizerImpl.HANGUL_COUNT) {
                /* Hangul syllable */
                // An offset that is a multiple of JAMO_T_COUNT is reported
                // as LV, any other offset as LVT.
                return ch%NormalizerImpl.JAMO_T_COUNT==0 ? HangulSyllableType.LV_SYLLABLE : HangulSyllableType.LVT_SYLLABLE;
            }
            return 0; /* NA */

        case UProperty.NFD_QUICK_CHECK:
        case UProperty.NFKD_QUICK_CHECK:
        case UProperty.NFC_QUICK_CHECK:
        case UProperty.NFKC_QUICK_CHECK:
            // The selector's distance from NFD_QUICK_CHECK, plus 2, is the
            // mode argument handed to quickCheck.
            return NormalizerImpl.quickCheck(ch, (type-UProperty.NFD_QUICK_CHECK)+2); // 2=UNORM_NFD
        case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
            // Bits above the low byte of the FCD16 value.
            return NormalizerImpl.getFCD16(ch)>>8;
        case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
            // Low byte of the FCD16 value.
            return NormalizerImpl.getFCD16(ch)&0xff;
        case UProperty.GRAPHEME_CLUSTER_BREAK:
            return (int)(PROPERTY_.getAdditional(ch, 2)& GCB_MASK)>>GCB_SHIFT;
        case UProperty.SENTENCE_BREAK:
            return (int)(PROPERTY_.getAdditional(ch, 2)& SB_MASK)>>SB_SHIFT;
        case UProperty.WORD_BREAK:
            return (int)(PROPERTY_.getAdditional(ch, 2)& WB_MASK)>>WB_SHIFT;
        default:

            return 0; /* undefined */
        }
    } else if (type == UProperty.GENERAL_CATEGORY_MASK) {
        // The only selector at or above INT_LIMIT that is handled here.
        return UCharacterProperty.getMask(getType(ch));
    }
    return 0; // undefined
}
\r
5319 * Returns a string version of the property value.
\r
5320 * @param propertyEnum
\r
5321 * @param codepoint
\r
5322 * @param nameChoice
\r
5323 * @return value as string
\r
5325 * @deprecated This API is ICU internal only.
\r
5327 public static String getStringPropertyValue(int propertyEnum, int codepoint, int nameChoice) {
\r
5328 // TODO some of these are less efficient, since a string is forced!
\r
5329 if ((propertyEnum >= UProperty.BINARY_START && propertyEnum < UProperty.BINARY_LIMIT) ||
\r
5330 (propertyEnum >= UProperty.INT_START && propertyEnum < UProperty.INT_LIMIT)) {
\r
5331 return getPropertyValueName(propertyEnum, getIntPropertyValue(codepoint, propertyEnum), nameChoice);
\r
5333 if (propertyEnum == UProperty.NUMERIC_VALUE) {
\r
5334 return String.valueOf(getUnicodeNumericValue(codepoint));
\r
5336 // otherwise must be string property
\r
5337 switch (propertyEnum) {
\r
5338 case UProperty.AGE: return getAge(codepoint).toString();
\r
5339 case UProperty.ISO_COMMENT: return getISOComment(codepoint);
\r
5340 case UProperty.BIDI_MIRRORING_GLYPH: return UTF16.valueOf(getMirror(codepoint));
\r
5341 case UProperty.CASE_FOLDING: return foldCase(UTF16.valueOf(codepoint), true);
\r
5342 case UProperty.LOWERCASE_MAPPING: return toLowerCase(UTF16.valueOf(codepoint));
\r
5343 case UProperty.NAME: return getName(codepoint);
\r
5344 case UProperty.SIMPLE_CASE_FOLDING: return UTF16.valueOf(foldCase(codepoint,true));
\r
5345 case UProperty.SIMPLE_LOWERCASE_MAPPING: return UTF16.valueOf(toLowerCase(codepoint));
\r
5346 case UProperty.SIMPLE_TITLECASE_MAPPING: return UTF16.valueOf(toTitleCase(codepoint));
\r
5347 case UProperty.SIMPLE_UPPERCASE_MAPPING: return UTF16.valueOf(toUpperCase(codepoint));
\r
5348 case UProperty.TITLECASE_MAPPING: return toTitleCase(UTF16.valueOf(codepoint),null);
\r
5349 case UProperty.UNICODE_1_NAME: return getName1_0(codepoint);
\r
5350 case UProperty.UPPERCASE_MAPPING: return toUpperCase(UTF16.valueOf(codepoint));
\r
5352 throw new IllegalArgumentException("Illegal Property Enum");
\r
5356 * Get the minimum value for an integer/binary Unicode property type.
\r
5357 * Can be used together with UCharacter.getIntPropertyMaxValue(int)
\r
5358 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
\r
5359 * @param type UProperty selector constant, identifies which binary
\r
5360 * property to check. Must be
\r
5361 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
\r
5362 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
\r
5363 * @return Minimum value returned by UCharacter.getIntPropertyValue(int)
\r
5364 * for a Unicode property. 0 if the property
\r
5365 * selector 'type' is out of range.
\r
5367 * @see #hasBinaryProperty
\r
5368 * @see #getUnicodeVersion
\r
5369 * @see #getIntPropertyMaxValue
\r
5370 * @see #getIntPropertyValue
\r
5373 public static int getIntPropertyMinValue(int type)
\r
5376 return 0; // undefined; and: all other properties have a minimum value
\r
5382 * Get the maximum value for an integer/binary Unicode property.
\r
5383 * Can be used together with UCharacter.getIntPropertyMinValue(int)
\r
5384 * to allocate arrays of com.ibm.icu.text.UnicodeSet or similar.
\r
5385 * Examples for min/max values (for Unicode 3.2):
\r
5387 * <li> UProperty.BIDI_CLASS: 0/18 (UCharacterDirection.LEFT_TO_RIGHT/UCharacterDirection.BOUNDARY_NEUTRAL)
\r
5388 * <li> UProperty.SCRIPT: 0/45 (UScript.COMMON/UScript.TAGBANWA)
\r
5389 * <li> UProperty.IDEOGRAPHIC: 0/1 (false/true)
\r
5391 * For undefined UProperty constant values, min/max values will be 0/-1.
\r
5392 * @param type UProperty selector constant, identifies which binary
\r
5393 * property to check. Must be
\r
5394 * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
\r
5395 * UProperty.INT_START <= type < UProperty.INT_LIMIT.
\r
5396 * @return Maximum value returned by u_getIntPropertyValue for a Unicode
\r
5397 * property. <= 0 if the property selector 'type' is out of range.
\r
5399 * @see #hasBinaryProperty
\r
5400 * @see #getUnicodeVersion
\r
5401 * @see #getIntPropertyMaxValue
\r
5402 * @see #getIntPropertyValue
\r
5405 public static int getIntPropertyMaxValue(int type)
\r
5407 if (type < UProperty.BINARY_START) {
\r
5408 return -1; // undefined
\r
5410 else if (type < UProperty.BINARY_LIMIT) {
\r
5411 return 1; // maximum TRUE for all binary properties
\r
5413 else if (type < UProperty.INT_START) {
\r
5414 return -1; // undefined
\r
5416 else if (type < UProperty.INT_LIMIT) {
\r
5418 case UProperty.BIDI_CLASS:
\r
5419 case UProperty.JOINING_GROUP:
\r
5420 case UProperty.JOINING_TYPE:
\r
5421 return gBdp.getMaxValue(type);
\r
5422 case UProperty.BLOCK:
\r
5423 return (PROPERTY_.getMaxValues(0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
\r
5424 case UProperty.CANONICAL_COMBINING_CLASS:
\r
5425 case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
\r
5426 case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
\r
5427 return 0xff; // TODO do we need to be more precise,
\r
5428 // getting the actual maximum?
\r
5429 case UProperty.DECOMPOSITION_TYPE:
\r
5430 return PROPERTY_.getMaxValues(2) & DECOMPOSITION_TYPE_MASK_;
\r
5431 case UProperty.EAST_ASIAN_WIDTH:
\r
5432 return (PROPERTY_.getMaxValues(0) & EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
\r
5433 case UProperty.GENERAL_CATEGORY:
\r
5434 return UCharacterCategory.CHAR_CATEGORY_COUNT - 1;
\r
5435 case UProperty.LINE_BREAK:
\r
5436 return (PROPERTY_.getMaxValues(LB_VWORD) & LB_MASK) >> LB_SHIFT;
\r
5437 case UProperty.NUMERIC_TYPE:
\r
5438 return NumericType.COUNT - 1;
\r
5439 case UProperty.SCRIPT:
\r
5440 return PROPERTY_.getMaxValues(0) & SCRIPT_MASK_;
\r
5441 case UProperty.HANGUL_SYLLABLE_TYPE:
\r
5442 return HangulSyllableType.COUNT-1;
\r
5443 case UProperty.NFD_QUICK_CHECK:
\r
5444 case UProperty.NFKD_QUICK_CHECK:
\r
5445 return 1; // YES -- these are never "maybe", only "no" or "yes"
\r
5446 case UProperty.NFC_QUICK_CHECK:
\r
5447 case UProperty.NFKC_QUICK_CHECK:
\r
5448 return 2; // MAYBE
\r
5449 case UProperty.GRAPHEME_CLUSTER_BREAK:
\r
5450 return (PROPERTY_.getMaxValues(2) & GCB_MASK) >> GCB_SHIFT;
\r
5451 case UProperty.SENTENCE_BREAK:
\r
5452 return (PROPERTY_.getMaxValues(2) & SB_MASK) >> SB_SHIFT;
\r
5453 case UProperty.WORD_BREAK:
\r
5454 return (PROPERTY_.getMaxValues(2) & WB_MASK) >> WB_SHIFT;
\r
5456 return -1; // undefined
\r
5460 return -1; // undefined
\r
5464 * Provide the java.lang.Character forDigit API, for convenience.
\r
5467 public static char forDigit(int digit, int radix) {
\r
5468 return java.lang.Character.forDigit(digit, radix);
\r
5471 // JDK 1.5 API coverage
\r
/**
 * Cover the JDK 1.5 API, for convenience.
 * Lower bound used by {@link #isHighSurrogate(char)}.
 * @see UTF16#LEAD_SURROGATE_MIN_VALUE
 */
public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Upper bound used by {@link #isHighSurrogate(char)}.
 * @see UTF16#LEAD_SURROGATE_MAX_VALUE
 */
public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Lower bound used by {@link #isLowSurrogate(char)}.
 * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
 */
public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Upper bound used by {@link #isLowSurrogate(char)}.
 * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
 */
public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * @see UTF16#SURROGATE_MIN_VALUE
 */
public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * @see UTF16#SURROGATE_MAX_VALUE
 */
public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * @see UTF16#SUPPLEMENTARY_MIN_VALUE
 */
public static final int  MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * Upper bound used by {@link #isValidCodePoint(int)}.
 * @see UTF16#CODEPOINT_MAX_VALUE
 */
public static final int  MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;

/**
 * Cover the JDK 1.5 API, for convenience.
 * @see UTF16#CODEPOINT_MIN_VALUE
 */
public static final int  MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
\r
5537 * Cover the JDK 1.5 API, for convenience.
\r
5538 * @param cp the code point to check
\r
5539 * @return true if cp is a valid code point
\r
5542 public static final boolean isValidCodePoint(int cp) {
\r
5543 return cp >= 0 && cp <= MAX_CODE_POINT;
\r
5547 * Cover the JDK 1.5 API, for convenience.
\r
5548 * @param cp the code point to check
\r
5549 * @return true if cp is a supplementary code point
\r
5552 public static final boolean isSupplementaryCodePoint(int cp) {
\r
5553 return cp >= UTF16.SUPPLEMENTARY_MIN_VALUE
\r
5554 && cp <= UTF16.CODEPOINT_MAX_VALUE;
\r
5558 * Cover the JDK 1.5 API, for convenience.
\r
5559 * @param ch the char to check
\r
5560 * @return true if ch is a high (lead) surrogate
\r
5563 public static boolean isHighSurrogate(char ch) {
\r
5564 return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE;
\r
5568 * Cover the JDK 1.5 API, for convenience.
\r
5569 * @param ch the char to check
\r
5570 * @return true if ch is a low (trail) surrogate
\r
5573 public static boolean isLowSurrogate(char ch) {
\r
5574 return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE;
\r
5578 * Cover the JDK 1.5 API, for convenience. Return true if the chars
\r
5579 * form a valid surrogate pair.
\r
5580 * @param high the high (lead) char
\r
5581 * @param low the low (trail) char
\r
5582 * @return true if high, low form a surrogate pair
\r
5585 public static final boolean isSurrogatePair(char high, char low) {
\r
5586 return isHighSurrogate(high) && isLowSurrogate(low);
\r
5590 * Cover the JDK 1.5 API, for convenience. Return the number of chars needed
\r
5591 * to represent the code point. This does not check the
\r
5592 * code point for validity.
\r
5593 * @param cp the code point to check
\r
5594 * @return the number of chars needed to represent the code point
\r
5595 * @see UTF16#getCharCount
\r
5598 public static int charCount(int cp) {
\r
5599 return UTF16.getCharCount(cp);
\r
5603 * Cover the JDK 1.5 API, for convenience. Return the code point represented by
\r
5604 * the characters. This does not check the surrogate pair for validity.
\r
5605 * @param high the high (lead) surrogate
\r
5606 * @param low the low (trail) surrogate
\r
5607 * @return the code point formed by the surrogate pair
\r
5610 public static final int toCodePoint(char high, char low) {
\r
5611 return UCharacterProperty.getRawSupplementary(high, low);
\r
5615 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
\r
5616 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5617 * API. This examines only the characters at index and index+1.
\r
5618 * @param seq the characters to check
\r
5619 * @param index the index of the first or only char forming the code point
\r
5620 * @return the code point at the index
\r
5623 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
5624 //## public static final int codePointAt(String seq, int index) {
\r
5625 //## char c1 = seq.charAt(index++);
\r
5626 //## if (isHighSurrogate(c1)) {
\r
5627 //## if (index < seq.length()) {
\r
5628 //## char c2 = seq.charAt(index);
\r
5629 //## if (isLowSurrogate(c2)) {
\r
5630 //## return toCodePoint(c1, c2);
\r
5636 //## public static final int codePointAt(StringBuffer seq, int index) {
\r
5637 //## return codePointAt(seq.toString(), index);
\r
5640 //#if defined(ECLIPSE_FRAGMENT)
\r
5641 //## public static final int codePointAt(String seq, int index) {
\r
5642 //## return codePointAt((CharSequence)seq, index);
\r
5644 //## public static final int codePointAt(StringBuffer seq, int index) {
\r
5645 //## return codePointAt((CharSequence)seq, index);
\r
5648 public static final int codePointAt(CharSequence seq, int index) {
\r
5649 char c1 = seq.charAt(index++);
\r
5650 if (isHighSurrogate(c1)) {
\r
5651 if (index < seq.length()) {
\r
5652 char c2 = seq.charAt(index);
\r
5653 if (isLowSurrogate(c2)) {
\r
5654 return toCodePoint(c1, c2);
\r
5663 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
\r
5664 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5665 * API. This examines only the characters at index and index+1.
\r
5666 * @param text the characters to check
\r
5667 * @param index the index of the first or only char forming the code point
\r
5668 * @return the code point at the index
\r
5671 public static final int codePointAt(char[] text, int index) {
\r
5672 char c1 = text[index++];
\r
5673 if (isHighSurrogate(c1)) {
\r
5674 if (index < text.length) {
\r
5675 char c2 = text[index];
\r
5676 if (isLowSurrogate(c2)) {
\r
5677 return toCodePoint(c1, c2);
\r
5685 * Cover the JDK 1.5 API, for convenience. Return the code point at index.
\r
5686 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5687 * API. This examines only the characters at index and index+1.
\r
5688 * @param text the characters to check
\r
5689 * @param index the index of the first or only char forming the code point
\r
5690 * @param limit the limit of the valid text
\r
5691 * @return the code point at the index
\r
5694 public static final int codePointAt(char[] text, int index, int limit) {
\r
5695 if (index >= limit || limit > text.length) {
\r
5696 throw new IndexOutOfBoundsException();
\r
5698 char c1 = text[index++];
\r
5699 if (isHighSurrogate(c1)) {
\r
5700 if (index < limit) {
\r
5701 char c2 = text[index];
\r
5702 if (isLowSurrogate(c2)) {
\r
5703 return toCodePoint(c1, c2);
\r
5711 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
\r
5712 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5713 * API. This examines only the characters at index-1 and index-2.
\r
5714 * @param seq the characters to check
\r
5715 * @param index the index after the last or only char forming the code point
\r
5716 * @return the code point before the index
\r
5719 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
5720 //## public static final int codePointBefore(String seq, int index) {
\r
5721 //## char c2 = seq.charAt(--index);
\r
5722 //## if (isLowSurrogate(c2)) {
\r
5723 //## if (index > 0) {
\r
5724 //## char c1 = seq.charAt(--index);
\r
5725 //## if (isHighSurrogate(c1)) {
\r
5726 //## return toCodePoint(c1, c2);
\r
5732 //## public static final int codePointBefore(StringBuffer seq, int index) {
\r
5733 //## return codePointBefore(seq.toString(), index);
\r
5736 //#if defined(ECLIPSE_FRAGMENT)
\r
5737 //## public static final int codePointBefore(String seq, int index) {
\r
5738 //## return codePointBefore((CharSequence)seq, index);
\r
5740 //## public static final int codePointBefore(StringBuffer seq, int index) {
\r
5741 //## return codePointBefore((CharSequence)seq, index);
\r
5744 public static final int codePointBefore(CharSequence seq, int index) {
\r
5745 char c2 = seq.charAt(--index);
\r
5746 if (isLowSurrogate(c2)) {
\r
5748 char c1 = seq.charAt(--index);
\r
5749 if (isHighSurrogate(c1)) {
\r
5750 return toCodePoint(c1, c2);
\r
5759 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
\r
5760 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5761 * API. This examines only the characters at index-1 and index-2.
\r
5762 * @param text the characters to check
\r
5763 * @param index the index after the last or only char forming the code point
\r
5764 * @return the code point before the index
\r
5767 public static final int codePointBefore(char[] text, int index) {
\r
5768 char c2 = text[--index];
\r
5769 if (isLowSurrogate(c2)) {
\r
5771 char c1 = text[--index];
\r
5772 if (isHighSurrogate(c1)) {
\r
5773 return toCodePoint(c1, c2);
\r
5781 * Cover the JDK 1.5 API, for convenience. Return the code point before index.
\r
5782 * <br/><b>Note</b>: the semantics of this API is different from the related UTF16
\r
5783 * API. This examines only the characters at index-1 and index-2.
\r
5784 * @param text the characters to check
\r
5785 * @param index the index after the last or only char forming the code point
\r
5786 * @param limit the start of the valid text
\r
5787 * @return the code point before the index
\r
5790 public static final int codePointBefore(char[] text, int index, int limit) {
\r
5791 if (index <= limit || limit < 0) {
\r
5792 throw new IndexOutOfBoundsException();
\r
5794 char c2 = text[--index];
\r
5795 if (isLowSurrogate(c2)) {
\r
5796 if (index > limit) {
\r
5797 char c1 = text[--index];
\r
5798 if (isHighSurrogate(c1)) {
\r
5799 return toCodePoint(c1, c2);
\r
5807 * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
\r
5808 * code point into the destination at the given index.
\r
5809 * @param cp the code point to convert
\r
5810 * @param dst the destination array into which to put the char(s) representing the code point
\r
5811 * @param dstIndex the index at which to put the first (or only) char
\r
5812 * @return the count of the number of chars written (1 or 2)
\r
5813 * @throws IllegalArgumentException if cp is not a valid code point
\r
5816 public static final int toChars(int cp, char[] dst, int dstIndex) {
\r
5818 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
\r
5819 dst[dstIndex] = (char)cp;
\r
5822 if (cp <= MAX_CODE_POINT) {
\r
5823 dst[dstIndex] = UTF16.getLeadSurrogate(cp);
\r
5824 dst[dstIndex+1] = UTF16.getTrailSurrogate(cp);
\r
5828 throw new IllegalArgumentException();
\r
5832 * Cover the JDK 1.5 API, for convenience. Returns a char array
\r
5833 * representing the code point.
\r
5834 * @param cp the code point to convert
\r
5835 * @return an array containing the char(s) representing the code point
\r
5836 * @throws IllegalArgumentException if cp is not a valid code point
\r
5839 public static final char[] toChars(int cp) {
\r
5841 if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
\r
5842 return new char[] { (char)cp };
\r
5844 if (cp <= MAX_CODE_POINT) {
\r
5845 return new char[] {
\r
5846 UTF16.getLeadSurrogate(cp),
\r
5847 UTF16.getTrailSurrogate(cp)
\r
5851 throw new IllegalArgumentException();
\r
5855 * Cover the JDK API, for convenience. Return a byte representing the directionality of
\r
5857 * <br/><b>Note</b>: Unlike the JDK, this returns DIRECTIONALITY_LEFT_TO_RIGHT for undefined or
\r
5858 * out-of-bounds characters. <br/><b>Note</b>: The return value must be
\r
5859 * tested using the constants defined in {@link UCharacterEnums.ECharacterDirection}
\r
5860 * since the values are different from the ones defined by <code>java.lang.Character</code>.
\r
5861 * @param cp the code point to check
\r
5862 * @return the directionality of the code point
\r
5863 * @see #getDirection
\r
5866 public static byte getDirectionality(int cp)
\r
5868 return (byte)getDirection(cp);
\r
5872 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
\r
5873 * @param text the characters to check
\r
5874 * @param start the start of the range
\r
5875 * @param limit the limit of the range
\r
5876 * @return the number of code points in the range
\r
5879 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
5880 //## public static int codePointCount(String text, int start, int limit) {
\r
5881 //## if (start < 0 || limit < start || limit > text.length()) {
\r
5882 //## throw new IndexOutOfBoundsException("start (" + start +
\r
5883 //## ") or limit (" + limit +
\r
5884 //## ") invalid or out of range 0, " + text.length());
\r
5887 //## int len = limit - start;
\r
5888 //## while (limit > start) {
\r
5889 //## char ch = text.charAt(--limit);
\r
5890 //## while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
\r
5891 //## ch = text.charAt(--limit);
\r
5892 //## if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
\r
5900 //## public static int codePointCount(StringBuffer text, int start, int limit) {
\r
5901 //## return codePointCount(text.toString(), start, limit);
\r
5904 //#if defined(ECLIPSE_FRAGMENT)
\r
5905 //## public static int codePointCount(String text, int start, int limit) {
\r
5906 //## return codePointCount((CharSequence)text, start, limit);
\r
5908 //## public static int codePointCount(StringBuffer text, int start, int limit) {
\r
5909 //## return codePointCount((CharSequence)text, start, limit);
\r
5912 public static int codePointCount(CharSequence text, int start, int limit) {
\r
5913 if (start < 0 || limit < start || limit > text.length()) {
\r
5914 throw new IndexOutOfBoundsException("start (" + start +
\r
5915 ") or limit (" + limit +
\r
5916 ") invalid or out of range 0, " + text.length());
\r
5919 int len = limit - start;
\r
5920 while (limit > start) {
\r
5921 char ch = text.charAt(--limit);
\r
5922 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
\r
5923 ch = text.charAt(--limit);
\r
5924 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
\r
5935 * Cover the JDK API, for convenience. Count the number of code points in the range of text.
\r
5936 * @param text the characters to check
\r
5937 * @param start the start of the range
\r
5938 * @param limit the limit of the range
\r
5939 * @return the number of code points in the range
\r
5942 public static int codePointCount(char[] text, int start, int limit) {
\r
5943 if (start < 0 || limit < start || limit > text.length) {
\r
5944 throw new IndexOutOfBoundsException("start (" + start +
\r
5945 ") or limit (" + limit +
\r
5946 ") invalid or out of range 0, " + text.length);
\r
5949 int len = limit - start;
\r
5950 while (limit > start) {
\r
5951 char ch = text[--limit];
\r
5952 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
\r
5953 ch = text[--limit];
\r
5954 if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
\r
5964 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
\r
5965 * @param text the characters to check
\r
5966 * @param index the index to adjust
\r
5967 * @param codePointOffset the number of code points by which to offset the index
\r
5968 * @return the adjusted index
\r
5971 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
5972 //## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
\r
5973 //## if (index < 0 || index > text.length()) {
\r
5974 //## throw new IndexOutOfBoundsException("index ( " + index +
\r
5975 //## ") out of range 0, " + text.length());
\r
5978 //## if (codePointOffset < 0) {
\r
5979 //## while (++codePointOffset <= 0) {
\r
5980 //## char ch = text.charAt(--index);
\r
5981 //## while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
\r
5982 //## ch = text.charAt(--index);
\r
5983 //## if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
\r
5984 //## if (++codePointOffset > 0) {
\r
5985 //## return index+1;
\r
5991 //## int limit = text.length();
\r
5992 //## while (--codePointOffset >= 0) {
\r
5993 //## char ch = text.charAt(index++);
\r
5994 //## while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
\r
5995 //## ch = text.charAt(index++);
\r
5996 //## if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
\r
5997 //## if (--codePointOffset < 0) {
\r
5998 //## return index-1;
\r
6005 //## return index;
\r
6007 //## public static int offsetByCodePoints(StringBuffer text, int index, int codePointOffset) {
\r
6008 //## return offsetByCodePoints(text.toString(), index, codePointOffset);
\r
6011 //#if defined(ECLIPSE_FRAGMENT)
\r
6012 //## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
\r
6013 //## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
\r
6015 //## public static int offsetByCodePoints(StringBuffer text, int index, int codePointOffset) {
\r
6016 //## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
\r
6019 public static int offsetByCodePoints(CharSequence text, int index, int codePointOffset) {
\r
6020 if (index < 0 || index > text.length()) {
\r
6021 throw new IndexOutOfBoundsException("index ( " + index +
\r
6022 ") out of range 0, " + text.length());
\r
6025 if (codePointOffset < 0) {
\r
6026 while (++codePointOffset <= 0) {
\r
6027 char ch = text.charAt(--index);
\r
6028 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
\r
6029 ch = text.charAt(--index);
\r
6030 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
\r
6031 if (++codePointOffset > 0) {
\r
6038 int limit = text.length();
\r
6039 while (--codePointOffset >= 0) {
\r
6040 char ch = text.charAt(index++);
\r
6041 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
\r
6042 ch = text.charAt(index++);
\r
6043 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
\r
6044 if (--codePointOffset < 0) {
\r
6057 * Cover the JDK API, for convenience. Adjust the char index by a code point offset.
\r
6058 * @param text the characters to check
\r
6059 * @param start the start of the range to check
\r
6060 * @param count the length of the range to check
\r
6061 * @param index the index to adjust
\r
6062 * @param codePointOffset the number of code points by which to offset the index
\r
6063 * @return the adjusted index
\r
6066 public static int offsetByCodePoints(char[] text, int start, int count, int index, int codePointOffset) {
\r
6067 int limit = start + count;
\r
6068 if (start < 0 || limit < start || limit > text.length || index < start || index > limit) {
\r
6069 throw new IndexOutOfBoundsException("index ( " + index +
\r
6070 ") out of range " + start +
\r
6072 " in array 0, " + text.length);
\r
6075 if (codePointOffset < 0) {
\r
6076 while (++codePointOffset <= 0) {
\r
6077 char ch = text[--index];
\r
6078 if (index < start) {
\r
6079 throw new IndexOutOfBoundsException("index ( " + index +
\r
6080 ") < start (" + start +
\r
6083 while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > start) {
\r
6084 ch = text[--index];
\r
6085 if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
\r
6086 if (++codePointOffset > 0) {
\r
6093 while (--codePointOffset >= 0) {
\r
6094 char ch = text[index++];
\r
6095 if (index > limit) {
\r
6096 throw new IndexOutOfBoundsException("index ( " + index +
\r
6097 ") > limit (" + limit +
\r
6100 while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
\r
6101 ch = text[index++];
\r
6102 if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
\r
6103 if (--codePointOffset < 0) {
\r
6114 // protected data members --------------------------------------------
\r
6117 * Database storing the sets of character name
\r
6119 static UCharacterName NAME_ = null;
\r
6122 * Singleton object encapsulating the imported pnames.icu property aliases
\r
6124 static UPropertyAliases PNAMES_ = null;
\r
6126 // block to initialise name database and unicode 1.0 data
\r
6129 PNAMES_ = new UPropertyAliases();
\r
6130 NAME_ = UCharacterName.getInstance();
\r
6131 } catch (IOException e) {
\r
6132 // e.printStackTrace();
\r
6133 throw new MissingResourceException(e.getMessage(),"","");
\r
6134 //throw new RuntimeException(e.getMessage());
\r
6135 // DO NOT throw an exception
\r
6136 // we might be building ICU modularly without names.icu and
\r
6141 // private variables -------------------------------------------------
\r
6144 * Database storing the sets of character property
\r
6146 private static final UCharacterProperty PROPERTY_;
\r
6148 * For optimization
\r
6150 private static final char[] PROPERTY_TRIE_INDEX_;
\r
6151 private static final char[] PROPERTY_TRIE_DATA_;
\r
6152 private static final int PROPERTY_INITIAL_VALUE_;
\r
6154 private static final UCaseProps gCsp;
\r
6155 private static final UBiDiProps gBdp;
\r
6157 // block to initialise character property database
\r
6162 PROPERTY_ = UCharacterProperty.getInstance();
\r
6163 PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
\r
6164 PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
\r
6165 PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
\r
6167 catch (Exception e)
\r
6169 throw new MissingResourceException(e.getMessage(),"","");
\r
6173 * In ICU4J 3.2, most Unicode properties were loaded from uprops.icu.
\r
6174 * ICU4J 3.4 adds ucase.icu for case mapping properties and
\r
6175 * ubidi.icu for bidi/shaping properties and
\r
6176 * removes case/bidi/shaping properties from uprops.icu.
\r
6178 * Loading of uprops.icu was always done during class loading of UCharacter.class.
\r
6179 * In order to maintain performance for all such properties,
\r
6180 * ucase.icu and ubidi.icu are also loaded during class loading of UCharacter.class.
\r
6181 * It will not fail if they are missing.
\r
6182 * These data items are loaded early to avoid having to synchronize access to them,
\r
6183 * for thread safety and performance.
\r
6185 * We try to load these data items at most once.
\r
6186 * If it works, we use the resulting singleton object.
\r
6187 * If it fails, then we get a dummy object, which always works unless
\r
6188 * we are seriously out of memory.
\r
6189 * After UCharacter.class loading, we have a never-changing pointer to either the
\r
6190 * real singleton or the dummy.
\r
6192 * This method is used in Unicode properties APIs that
\r
6193 * do not have a service object and also do not have an error code parameter.
\r
6194 * Other API implementations get the singleton themselves
\r
6195 * (synchronized), store it in the service object, and report errors.
\r
6199 csp=UCaseProps.getSingleton();
\r
6200 } catch(IOException e) {
\r
6201 csp=UCaseProps.getDummy();
\r
6207 bdp=UBiDiProps.getSingleton();
\r
6208 } catch(IOException e) {
\r
6209 bdp=UBiDiProps.getDummy();
\r
6215 * To get the last character out from a data type
\r
6217 private static final int LAST_CHAR_MASK_ = 0xFFFF;
\r
6220 // * To get the last byte out from a data type
\r
6222 // private static final int LAST_BYTE_MASK_ = 0xFF;
\r
6225 // * Shift 16 bits
\r
6227 // private static final int SHIFT_16_ = 16;
\r
6230 // * Shift 24 bits
\r
6232 // private static final int SHIFT_24_ = 24;
\r
6235 // * Decimal radix
\r
6237 // private static final int DECIMAL_RADIX_ = 10;
\r
6240 * No break space code point
\r
6242 private static final int NO_BREAK_SPACE_ = 0xA0;
\r
6245 * Figure space code point
\r
6247 private static final int FIGURE_SPACE_ = 0x2007;
\r
6250 * Narrow no break space code point
\r
6252 private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;
\r
6255 * Ideographic number zero code point
\r
6257 private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
\r
6260 * CJK Ideograph, First code point
\r
6262 private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
\r
6265 * CJK Ideograph, Second code point
\r
6267 private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
\r
6270 * CJK Ideograph, Third code point
\r
6272 private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
\r
6275 * CJK Ideograph, Fourth code point
\r
6277 private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56d8;
\r
6280 * CJK Ideograph, Fifth code point
\r
6282 private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
\r
6285 * CJK Ideograph, Sixth code point
\r
6287 private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
\r
6290 * CJK Ideograph, Seventh code point
\r
6292 private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
\r
6295 * CJK Ideograph, Eighth code point
\r
6297 private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
\r
6300 * CJK Ideograph, Ninth code point
\r
6302 private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
\r
6305 * Application Program command code point
\r
6307 private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;
\r
6310 * Unit separator code point
\r
6312 private static final int UNIT_SEPARATOR_ = 0x001F;
\r
6315 * Delete code point
\r
6317 private static final int DELETE_ = 0x007F;
\r
6319 * ISO control character first range upper limit 0x0 - 0x1F
\r
6321 //private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F;
\r
6323 * Shift to get numeric type
\r
6325 private static final int NUMERIC_TYPE_SHIFT_ = 5;
\r
6327 * Mask to get numeric type
\r
6329 private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
\r
6331 /* encoding of fractional and large numbers */
\r
6332 //private static final int MAX_SMALL_NUMBER=0xff;
\r
6334 private static final int FRACTION_NUM_SHIFT=3; /* numerator: bits 7..3 */
\r
6335 private static final int FRACTION_DEN_MASK=7; /* denominator: bits 2..0 */
\r
6337 //private static final int FRACTION_MAX_NUM=31;
\r
6338 private static final int FRACTION_DEN_OFFSET=2; /* denominator values are 2..9 */
\r
6340 //private static final int FRACTION_MIN_DEN=FRACTION_DEN_OFFSET;
\r
6341 //private static final int FRACTION_MAX_DEN=FRACTION_MIN_DEN+FRACTION_DEN_MASK;
\r
6343 private static final int LARGE_MANT_SHIFT=4; /* mantissa: bits 7..4 */
\r
6344 private static final int LARGE_EXP_MASK=0xf; /* exponent: bits 3..0 */
\r
6345 private static final int LARGE_EXP_OFFSET=2; /* regular exponents 2..17 */
\r
6346 private static final int LARGE_EXP_OFFSET_EXTRA=18; /* extra large exponents 18..33 */
\r
6348 //private static final int LARGE_MIN_EXP=LARGE_EXP_OFFSET;
\r
6349 //private static final int LARGE_MAX_EXP=LARGE_MIN_EXP+LARGE_EXP_MASK;
\r
6350 //private static final int LARGE_MAX_EXP_EXTRA=LARGE_EXP_OFFSET_EXTRA+LARGE_EXP_MASK;
\r
6353 * Han digit characters
\r
6355 private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
\r
6356 private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
\r
6357 private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
\r
6358 private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
\r
6359 private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
\r
6360 private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
\r
6361 private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
\r
6362 private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
\r
6363 private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
\r
6364 private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
\r
6365 private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
\r
6366 private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
\r
6367 private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
\r
6368 private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
\r
6369 private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
\r
6370 private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
\r
6371 private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x824c;
\r
6372 private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
\r
6375 // * Zero Width Non Joiner.
\r
6376 // * Equivalent to icu4c ZWNJ.
\r
6378 // private static final int ZERO_WIDTH_NON_JOINER_ = 0x200c;
\r
6380 // * Zero Width Joiner
\r
6381 // * Equivalent to icu4c ZWJ.
\r
6383 // private static final int ZERO_WIDTH_JOINER_ = 0x200d;
\r
6386 * Properties in vector word 2
\r
6389 * 25..20 Line Break
\r
6390 * 19..15 Sentence Break
\r
6391 * 14..10 Word Break
\r
6392 * 9.. 5 Grapheme Cluster Break
\r
6393 * 4.. 0 Decomposition Type
\r
6395 private static final int LB_MASK = 0x03f00000;
\r
6396 private static final int LB_SHIFT = 20;
\r
6397 private static final int LB_VWORD = 2;
\r
6399 private static final int SB_MASK = 0x000f8000;
\r
6400 private static final int SB_SHIFT = 15;
\r
6402 private static final int WB_MASK = 0x00007c00;
\r
6403 private static final int WB_SHIFT = 10;
\r
6405 private static final int GCB_MASK = 0x000003e0;
\r
6406 private static final int GCB_SHIFT = 5;
\r
6409 * Integer properties mask for decomposition type.
\r
6410 * Equivalent to icu4c UPROPS_DT_MASK.
\r
6412 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
\r
6415 * Properties in vector word 0
\r
6417 * 31..24 DerivedAge version major/minor one nibble each
\r
6419 * 19..17 East Asian Width
\r
6420 * 16.. 8 UBlockCode
\r
6421 * 7.. 0 UScriptCode
\r
6425 * Integer properties mask and shift values for East Asian cell width.
\r
6426 * Equivalent to icu4c UPROPS_EA_MASK
\r
6428 private static final int EAST_ASIAN_MASK_ = 0x000e0000;
\r
6430 * Integer properties mask and shift values for East Asian cell width.
\r
6431 * Equivalent to icu4c UPROPS_EA_SHIFT
\r
6433 private static final int EAST_ASIAN_SHIFT_ = 17;
\r
6435 * Integer properties mask and shift values for blocks.
\r
6436 * Equivalent to icu4c UPROPS_BLOCK_MASK
\r
6438 private static final int BLOCK_MASK_ = 0x0001ff00;
\r
6440 * Integer properties mask and shift values for blocks.
\r
6441 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
\r
6443 private static final int BLOCK_SHIFT_ = 8;
\r
6445 * Integer properties mask and shift values for scripts.
\r
6446 * Equivalent to icu4c UPROPS_SCRIPT_MASK
\r
6448 private static final int SCRIPT_MASK_ = 0x000000ff;
\r
6450 // private constructor -----------------------------------------------
\r
6453 * Private constructor to prevent instantiation
\r
6455 private UCharacter()
\r
6459 // private methods ---------------------------------------------------
\r
6462 * Getting the digit values of characters like 'A' - 'Z', normal,
\r
6463 * half-width and full-width. This method assumes that the other digit
\r
6464 * characters are checked by the calling method.
\r
6465 * @param ch character to test
\r
6466 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
\r
6467 * its corresponding digit will be returned.
\r
6469 private static int getEuropeanDigit(int ch) {
\r
6470 if ((ch > 0x7a && ch < 0xff21)
\r
6471 || ch < 0x41 || (ch > 0x5a && ch < 0x61)
\r
6472 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
\r
6476 // ch >= 0x41 or ch < 0x61
\r
6477 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
\r
6480 if (ch <= 0xff3a) {
\r
6481 return ch + 10 - 0xff21;
\r
6483 // ch >= 0xff41 && ch <= 0xff5a
\r
6484 return ch + 10 - 0xff41;
\r
6488 * Gets the numeric type of the property argument
\r
6489 * @param props 32 bit property
\r
6490 * @return the numeric type
\r
6492 private static int getNumericType(int props)
\r
6494 return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
\r
6498 * Gets the property value at the index.
\r
6499 * This is optimized.
\r
6500 * Note this is a little different from CharTrie: the index m_trieData_
\r
6501 * is never negative.
\r
6502 * This is a duplicate of UCharacterProperty.getProperty. For optimization
\r
6503 * purposes, this method calls the trie data directly instead of through
\r
6504 * UCharacterProperty.getProperty.
\r
6505 * @param ch code point whose property value is to be retrieved
\r
6506 * @return property value of code point
\r
6509 private static final int getProperty(int ch)
\r
6511 if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
\r
6512 || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
\r
6513 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
\r
6514 // BMP codepoint 0000..D7FF or DC00..FFFF
\r
6515 try { // using try for ch < 0 is faster than using an if statement
\r
6516 return PROPERTY_TRIE_DATA_[
\r
6517 (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
\r
6519 } catch (ArrayIndexOutOfBoundsException e) {
\r
6520 return PROPERTY_INITIAL_VALUE_;
\r
6523 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
6524 // lead surrogate D800..DBFF
\r
6525 return PROPERTY_TRIE_DATA_[
\r
6526 (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
\r
6529 // for optimization
\r
6530 if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
\r
6531 // supplementary code point 10000..10FFFF
\r
6532 // look at the construction of supplementary characters
\r
6533 // trail forms the ends of it.
\r
6534 return PROPERTY_.m_trie_.getSurrogateValue(
\r
6535 UTF16.getLeadSurrogate(ch),
\r
6536 (char)(ch & 0x3ff));
\r
6538 // return m_dataOffset_ if there is an error, in this case we return
\r
6539 // the default value: m_initialValue_
\r
6540 // we cannot assume that m_initialValue_ is at offset 0
\r
6541 // this is for optimization.
\r
6542 return PROPERTY_INITIAL_VALUE_;
\r