//##header J2SE15 /** ******************************************************************************* * Copyright (C) 1996-2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.lang; import java.io.IOException; import java.lang.ref.SoftReference; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.MissingResourceException; import com.ibm.icu.impl.UBiDiProps; import com.ibm.icu.impl.UCaseProps; import com.ibm.icu.impl.NormalizerImpl; import com.ibm.icu.impl.UCharacterUtility; import com.ibm.icu.impl.UCharacterName; import com.ibm.icu.impl.UCharacterNameChoice; import com.ibm.icu.impl.UPropertyAliases; import com.ibm.icu.lang.UCharacterEnums.*; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.util.RangeValueIterator; import com.ibm.icu.util.ULocale; import com.ibm.icu.util.ValueIterator; import com.ibm.icu.util.VersionInfo; /** *
* The UCharacter class provides extensions to the * * java.lang.Character class. These extensions provide support for * more Unicode properties and together with the UTF16 * class, provide support for supplementary characters (those with code * points above U+FFFF). * Each ICU release supports the latest version of Unicode available at that time. *
** Code points are represented in these API using ints. While it would be * more convenient in Java to have a separate primitive datatype for them, * ints suffice in the meantime. *
*
* To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.
* E.g. In Windows
* set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar
.
* Otherwise, another method would be to copy the files uprops.dat and
* unames.icu from the icu4j source subdirectory
* $ICU4J_SRC/src/com.ibm.icu.impl.data to your class directory
* $ICU4J_CLASS/com.ibm.icu.impl.data.
*
* Aside from the additions for UTF-16 support, and the updated Unicode * properties, the main differences between UCharacter and Character are: *
* Further detail differences can be determined from the program * * com.ibm.icu.dev.test.lang.UCharacterCompare *
** In addition to Java compatibility functions, which calculate derived properties, * this API provides low-level access to the Unicode Character Database. *
** Unicode assigns each code point (not just assigned character) values for * many properties. * Most of them are simple boolean flags, or constants from a small enumerated list. * For some properties, values are strings or other relatively more complex types. *
** For more information see * "About the Unicode Character Database" (http://www.unicode.org/ucd/) * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html). *
** There are also functions that provide easy migration from C/POSIX functions * like isblank(). Their use is generally discouraged because the C/POSIX * standards do not define their semantics beyond the ASCII range, which means * that different implementations exhibit very different behavior. * Instead, Unicode properties should be used directly. *
** There are also only a few, broad C/POSIX character classes, and they tend * to be used for conflicting purposes. For example, the "isalpha()" class * is sometimes used to determine word boundaries, while a more sophisticated * approach would at least distinguish initial letters from continuation * characters (the latter including combining marks). * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) * Another example: There is no "istitle()" class for titlecase characters. *
** ICU 3.4 and later provides API access for all twelve C/POSIX character classes. * ICU implements them according to the Standard Recommendations in * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). *
*
* API access for C/POSIX character classes is as follows:
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
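* - punct: ((1&lt;&lt;getType(c)) &amp; ((1&lt;&lt;DASH_PUNCTUATION)|(1&lt;&lt;START_PUNCTUATION)|(1&lt;&lt;END_PUNCTUATION)|(1&lt;&lt;CONNECTOR_PUNCTUATION)|(1&lt;&lt;OTHER_PUNCTUATION)|(1&lt;&lt;INITIAL_PUNCTUATION)|(1&lt;&lt;FINAL_PUNCTUATION)))!=0
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
* - cntrl: getType(c)==CONTROL
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
*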
* The C/POSIX character classes are also available in UnicodeSet patterns,
* using patterns like [:graph:] or \p{graph}.
*
* Note: There are several ICU (and Java) whitespace functions.
* Comparison:
* - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - isSpaceChar: just Z (including no-break spaces)
*
* This class is not subclassable
* Get the numeric value for a Unicode code point as defined in the
* Unicode Character Database. A "double" return type is necessary because some numeric values are
* fractions, negative, or too large for int. For characters without any numeric values in the Unicode Character
* Database, this function will return NO_NUMERIC_VALUE. API Change: In release 2.2 and prior, this API has a
* return type int and returns -1 when the argument ch does not have a
* corresponding numeric value. This has been changed to synch with ICU4C
*
* Note: Unlike JDK 1.5, this only matches
* against the official UCD name and the Java block name
* (ignoring case).
* @param blockName the name of the block to match
* @return the UnicodeBlock with that name
* @throws IllegalArgumentException if the blockName could not be matched
* @stable ICU 3.0
*/
public static final UnicodeBlock forName(String blockName) {
    // The name -> block table is built lazily and held only through a
    // SoftReference, so it may be reclaimed and is then rebuilt on demand.
    // Raw collection types are kept on purpose: this file is run through a
    // preprocessor (see the //##header line) before compilation.
    Map m = null;
    // Read the static field once so the null check and the get() see the
    // same reference.
    SoftReference ref = mref;
    if (ref != null) {
        m = (Map)ref.get();
    }
    if (m == null) {
        m = new HashMap(BLOCKS_.length);
        for (int i = 0; i < BLOCKS_.length; ++i) {
            UnicodeBlock b = BLOCKS_[i];
            // Keys are the long property value names, normalized by
            // trimBlockName so lookups ignore case, ' ', '_' and '-'.
            String name = trimBlockName(getPropertyValueName(UProperty.BLOCK, b.getID(), UProperty.NameChoice.LONG));
            m.put(name, b);
        }
        mref = new SoftReference(m);
    }
    UnicodeBlock b = (UnicodeBlock)m.get(trimBlockName(blockName));
    if (b == null) {
        // Report the offending name; the exception type is unchanged.
        throw new IllegalArgumentException("Not a valid Unicode block name: " + blockName);
    }
    return b;
}
// Soft cache for the lookup table built by forName(String); null until first use.
private static SoftReference mref;
/**
 * Normalizes a block name for lookup: uppercases it and removes every
 * space, underscore and hyphen.
 * The uppercasing uses a fixed locale (English) rather than the default
 * locale, so the result does not depend on the user's settings; with a
 * Turkish default locale, for example, "i" would otherwise become a
 * dotted capital I and names would fail to match.
 * @param name block name to normalize, must not be null
 * @return the normalized name
 */
private static String trimBlockName(String name) {
    String upper = name.toUpperCase(Locale.ENGLISH);
    StringBuffer result = new StringBuffer(upper.length());
    for (int i = 0; i < upper.length(); i++) {
        char c = upper.charAt(i);
        if (c != ' ' && c != '_' && c != '-') {
            result.append(c);
        }
    }
    return result.toString();
}
/**
 * Gets the integer identifier that was assigned to this Unicode block
 * when it was constructed.
 * @return integer type ID of this Unicode block
 * @stable ICU 2.4
 */
public int getID()
{
    return this.m_id_;
}
// private data members ---------------------------------------------
/**
 * Array of UnicodeBlocks, for easy access in getInstance(int).
 * forName(String) walks this array to build its name lookup table, and the
 * static initializer that follows checks that its length equals COUNT.
 * NOTE(review): the entries presumably must stay in block-ID order so that
 * an ID can be used as an index - confirm against getInstance(int).
 */
private final static UnicodeBlock BLOCKS_[] = {
NO_BLOCK, BASIC_LATIN,
LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A,
LATIN_EXTENDED_B, IPA_EXTENSIONS,
SPACING_MODIFIER_LETTERS, COMBINING_DIACRITICAL_MARKS,
GREEK, CYRILLIC,
ARMENIAN, HEBREW,
ARABIC, SYRIAC,
THAANA, DEVANAGARI,
BENGALI, GURMUKHI,
GUJARATI, ORIYA,
TAMIL, TELUGU,
KANNADA, MALAYALAM,
SINHALA, THAI,
LAO, TIBETAN,
MYANMAR, GEORGIAN,
HANGUL_JAMO, ETHIOPIC,
CHEROKEE, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
OGHAM, RUNIC,
KHMER, MONGOLIAN,
LATIN_EXTENDED_ADDITIONAL, GREEK_EXTENDED,
GENERAL_PUNCTUATION, SUPERSCRIPTS_AND_SUBSCRIPTS,
CURRENCY_SYMBOLS, COMBINING_MARKS_FOR_SYMBOLS,
LETTERLIKE_SYMBOLS, NUMBER_FORMS,
ARROWS, MATHEMATICAL_OPERATORS,
MISCELLANEOUS_TECHNICAL, CONTROL_PICTURES,
OPTICAL_CHARACTER_RECOGNITION, ENCLOSED_ALPHANUMERICS,
BOX_DRAWING, BLOCK_ELEMENTS,
GEOMETRIC_SHAPES, MISCELLANEOUS_SYMBOLS,
DINGBATS, BRAILLE_PATTERNS,
CJK_RADICALS_SUPPLEMENT, KANGXI_RADICALS,
IDEOGRAPHIC_DESCRIPTION_CHARACTERS, CJK_SYMBOLS_AND_PUNCTUATION,
HIRAGANA, KATAKANA,
BOPOMOFO, HANGUL_COMPATIBILITY_JAMO,
KANBUN, BOPOMOFO_EXTENDED,
ENCLOSED_CJK_LETTERS_AND_MONTHS, CJK_COMPATIBILITY,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, CJK_UNIFIED_IDEOGRAPHS,
YI_SYLLABLES, YI_RADICALS,
HANGUL_SYLLABLES, HIGH_SURROGATES,
HIGH_PRIVATE_USE_SURROGATES, LOW_SURROGATES,
PRIVATE_USE_AREA, CJK_COMPATIBILITY_IDEOGRAPHS,
ALPHABETIC_PRESENTATION_FORMS, ARABIC_PRESENTATION_FORMS_A,
COMBINING_HALF_MARKS, CJK_COMPATIBILITY_FORMS,
SMALL_FORM_VARIANTS, ARABIC_PRESENTATION_FORMS_B,
SPECIALS, HALFWIDTH_AND_FULLWIDTH_FORMS,
OLD_ITALIC, GOTHIC,
DESERET, BYZANTINE_MUSICAL_SYMBOLS,
MUSICAL_SYMBOLS, MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
TAGS, CYRILLIC_SUPPLEMENT,
TAGALOG, HANUNOO,
BUHID, TAGBANWA,
MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A, SUPPLEMENTAL_ARROWS_A,
SUPPLEMENTAL_ARROWS_B, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
KATAKANA_PHONETIC_EXTENSIONS,
VARIATION_SELECTORS, SUPPLEMENTARY_PRIVATE_USE_AREA_A,
SUPPLEMENTARY_PRIVATE_USE_AREA_B,
LIMBU, TAI_LE, KHMER_SYMBOLS, PHONETIC_EXTENSIONS,
MISCELLANEOUS_SYMBOLS_AND_ARROWS, YIJING_HEXAGRAM_SYMBOLS,
LINEAR_B_SYLLABARY, LINEAR_B_IDEOGRAMS, AEGEAN_NUMBERS,
UGARITIC, SHAVIAN, OSMANYA, CYPRIOT_SYLLABARY,
TAI_XUAN_JING_SYMBOLS, VARIATION_SELECTORS_SUPPLEMENT,
/* New blocks in Unicode 4.1 */
ANCIENT_GREEK_MUSICAL_NOTATION,
ANCIENT_GREEK_NUMBERS,
ARABIC_SUPPLEMENT,
BUGINESE,
CJK_STROKES,
COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
COPTIC,
ETHIOPIC_EXTENDED,
ETHIOPIC_SUPPLEMENT,
GEORGIAN_SUPPLEMENT,
GLAGOLITIC,
KHAROSHTHI,
MODIFIER_TONE_LETTERS,
NEW_TAI_LUE,
OLD_PERSIAN,
PHONETIC_EXTENSIONS_SUPPLEMENT,
SUPPLEMENTAL_PUNCTUATION,
SYLOTI_NAGRI,
TIFINAGH,
VERTICAL_FORMS,
/* New blocks in Unicode 5.0 */
NKO,
BALINESE,
LATIN_EXTENDED_C,
LATIN_EXTENDED_D,
PHAGS_PA,
PHOENICIAN,
CUNEIFORM,
CUNEIFORM_NUMBERS_AND_PUNCTUATION,
COUNTING_ROD_NUMERALS,
/* New blocks in Unicode 5.1 (this comment previously said "5.8", which is not a Unicode version) */
SUNDANESE,
LEPCHA,
OL_CHIKI,
CYRILLIC_EXTENDED_A,
VAI,
CYRILLIC_EXTENDED_B,
SAURASHTRA,
KAYAH_LI,
REJANG,
CHAM,
ANCIENT_SYMBOLS,
PHAISTOS_DISC,
LYCIAN,
CARIAN,
LYDIAN,
MAHJONG_TILES,
DOMINO_TILES,
};
static {
if (COUNT!=BLOCKS_.length) {
throw new java.lang.IllegalStateException("UnicodeBlock fields are inconsistent!");
}
}
/**
 * Integer identifier of this UnicodeBlock; set by the constructor and
 * returned by getID().
 */
private int m_id_;
// private constructor ----------------------------------------------
/**
 * Creates a UnicodeBlock, passing the name to the superclass constructor
 * and recording the identifier.
 * @param name name of this UnicodeBlock
 * @param id unique id of this UnicodeBlock
 * @exception NullPointerException if name is null
 */
private UnicodeBlock(String name, int id)
{
    super(name);
    this.m_id_ = id;
}
}
/**
 * East Asian Width constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#EAST_ASIAN_WIDTH
 * @see UCharacter#getIntPropertyValue
 * @stable ICU 2.4
 */
public static interface EastAsianWidth
{
    /** @stable ICU 2.4 */
    int NEUTRAL = 0;
    /** @stable ICU 2.4 */
    int AMBIGUOUS = 1;
    /** @stable ICU 2.4 */
    int HALFWIDTH = 2;
    /** @stable ICU 2.4 */
    int FULLWIDTH = 3;
    /** @stable ICU 2.4 */
    int NARROW = 4;
    /** @stable ICU 2.4 */
    int WIDE = 5;
    /** @stable ICU 2.4 */
    int COUNT = 6;
}
/**
 * Decomposition Type constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#DECOMPOSITION_TYPE
 * @stable ICU 2.4
 */
public static interface DecompositionType
{
    /** @stable ICU 2.4 */
    int NONE = 0;
    /** @stable ICU 2.4 */
    int CANONICAL = 1;
    /** @stable ICU 2.4 */
    int COMPAT = 2;
    /** @stable ICU 2.4 */
    int CIRCLE = 3;
    /** @stable ICU 2.4 */
    int FINAL = 4;
    /** @stable ICU 2.4 */
    int FONT = 5;
    /** @stable ICU 2.4 */
    int FRACTION = 6;
    /** @stable ICU 2.4 */
    int INITIAL = 7;
    /** @stable ICU 2.4 */
    int ISOLATED = 8;
    /** @stable ICU 2.4 */
    int MEDIAL = 9;
    /** @stable ICU 2.4 */
    int NARROW = 10;
    /** @stable ICU 2.4 */
    int NOBREAK = 11;
    /** @stable ICU 2.4 */
    int SMALL = 12;
    /** @stable ICU 2.4 */
    int SQUARE = 13;
    /** @stable ICU 2.4 */
    int SUB = 14;
    /** @stable ICU 2.4 */
    int SUPER = 15;
    /** @stable ICU 2.4 */
    int VERTICAL = 16;
    /** @stable ICU 2.4 */
    int WIDE = 17;
    /** @stable ICU 2.4 */
    int COUNT = 18;
}
/**
 * Joining Type constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#JOINING_TYPE
 * @stable ICU 2.4
 */
public static interface JoiningType
{
    /** @stable ICU 2.4 */
    int NON_JOINING = 0;
    /** @stable ICU 2.4 */
    int JOIN_CAUSING = 1;
    /** @stable ICU 2.4 */
    int DUAL_JOINING = 2;
    /** @stable ICU 2.4 */
    int LEFT_JOINING = 3;
    /** @stable ICU 2.4 */
    int RIGHT_JOINING = 4;
    /** @stable ICU 2.4 */
    int TRANSPARENT = 5;
    /** @stable ICU 2.4 */
    int COUNT = 6;
}
/**
 * Joining Group constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#JOINING_GROUP
 * @stable ICU 2.4
 */
public static interface JoiningGroup
{
    /** @stable ICU 2.4 */
    int NO_JOINING_GROUP = 0;
    /** @stable ICU 2.4 */
    int AIN = 1;
    /** @stable ICU 2.4 */
    int ALAPH = 2;
    /** @stable ICU 2.4 */
    int ALEF = 3;
    /** @stable ICU 2.4 */
    int BEH = 4;
    /** @stable ICU 2.4 */
    int BETH = 5;
    /** @stable ICU 2.4 */
    int DAL = 6;
    /** @stable ICU 2.4 */
    int DALATH_RISH = 7;
    /** @stable ICU 2.4 */
    int E = 8;
    /** @stable ICU 2.4 */
    int FEH = 9;
    /** @stable ICU 2.4 */
    int FINAL_SEMKATH = 10;
    /** @stable ICU 2.4 */
    int GAF = 11;
    /** @stable ICU 2.4 */
    int GAMAL = 12;
    /** @stable ICU 2.4 */
    int HAH = 13;
    /** @stable ICU 2.4 */
    int HAMZA_ON_HEH_GOAL = 14;
    /** @stable ICU 2.4 */
    int HE = 15;
    /** @stable ICU 2.4 */
    int HEH = 16;
    /** @stable ICU 2.4 */
    int HEH_GOAL = 17;
    /** @stable ICU 2.4 */
    int HETH = 18;
    /** @stable ICU 2.4 */
    int KAF = 19;
    /** @stable ICU 2.4 */
    int KAPH = 20;
    /** @stable ICU 2.4 */
    int KNOTTED_HEH = 21;
    /** @stable ICU 2.4 */
    int LAM = 22;
    /** @stable ICU 2.4 */
    int LAMADH = 23;
    /** @stable ICU 2.4 */
    int MEEM = 24;
    /** @stable ICU 2.4 */
    int MIM = 25;
    /** @stable ICU 2.4 */
    int NOON = 26;
    /** @stable ICU 2.4 */
    int NUN = 27;
    /** @stable ICU 2.4 */
    int PE = 28;
    /** @stable ICU 2.4 */
    int QAF = 29;
    /** @stable ICU 2.4 */
    int QAPH = 30;
    /** @stable ICU 2.4 */
    int REH = 31;
    /** @stable ICU 2.4 */
    int REVERSED_PE = 32;
    /** @stable ICU 2.4 */
    int SAD = 33;
    /** @stable ICU 2.4 */
    int SADHE = 34;
    /** @stable ICU 2.4 */
    int SEEN = 35;
    /** @stable ICU 2.4 */
    int SEMKATH = 36;
    /** @stable ICU 2.4 */
    int SHIN = 37;
    /** @stable ICU 2.4 */
    int SWASH_KAF = 38;
    /** @stable ICU 2.4 */
    int SYRIAC_WAW = 39;
    /** @stable ICU 2.4 */
    int TAH = 40;
    /** @stable ICU 2.4 */
    int TAW = 41;
    /** @stable ICU 2.4 */
    int TEH_MARBUTA = 42;
    /** @stable ICU 2.4 */
    int TETH = 43;
    /** @stable ICU 2.4 */
    int WAW = 44;
    /** @stable ICU 2.4 */
    int YEH = 45;
    /** @stable ICU 2.4 */
    int YEH_BARREE = 46;
    /** @stable ICU 2.4 */
    int YEH_WITH_TAIL = 47;
    /** @stable ICU 2.4 */
    int YUDH = 48;
    /** @stable ICU 2.4 */
    int YUDH_HE = 49;
    /** @stable ICU 2.4 */
    int ZAIN = 50;
    /** @stable ICU 2.6 */
    int FE = 51;
    /** @stable ICU 2.6 */
    int KHAPH = 52;
    /** @stable ICU 2.6 */
    int ZHAIN = 53;
    /** @stable ICU 4.0 */
    int BURUSHASKI_YEH_BARREE = 54;
    /** @stable ICU 4.0 */
    int COUNT = 55;
}
/**
 * Grapheme Cluster Break constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#GRAPHEME_CLUSTER_BREAK
 * @stable ICU 3.4
 */
public static interface GraphemeClusterBreak {
    /** @stable ICU 3.4 */
    int OTHER = 0;
    /** @stable ICU 3.4 */
    int CONTROL = 1;
    /** @stable ICU 3.4 */
    int CR = 2;
    /** @stable ICU 3.4 */
    int EXTEND = 3;
    /** @stable ICU 3.4 */
    int L = 4;
    /** @stable ICU 3.4 */
    int LF = 5;
    /** @stable ICU 3.4 */
    int LV = 6;
    /** @stable ICU 3.4 */
    int LVT = 7;
    /** @stable ICU 3.4 */
    int T = 8;
    /** @stable ICU 3.4 */
    int V = 9;
    /** @stable ICU 4.0 */
    int SPACING_MARK = 10;
    /** @stable ICU 4.0 */
    int PREPEND = 11;
    /** @stable ICU 3.4 */
    int COUNT = 12;
}
/**
 * Word Break constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#WORD_BREAK
 * @stable ICU 3.4
 */
public static interface WordBreak {
    /** @stable ICU 3.8 */
    int OTHER = 0;
    /** @stable ICU 3.8 */
    int ALETTER = 1;
    /** @stable ICU 3.8 */
    int FORMAT = 2;
    /** @stable ICU 3.8 */
    int KATAKANA = 3;
    /** @stable ICU 3.8 */
    int MIDLETTER = 4;
    /** @stable ICU 3.8 */
    int MIDNUM = 5;
    /** @stable ICU 3.8 */
    int NUMERIC = 6;
    /** @stable ICU 3.8 */
    int EXTENDNUMLET = 7;
    /** @stable ICU 4.0 */
    int CR = 8;
    /** @stable ICU 4.0 */
    int EXTEND = 9;
    /** @stable ICU 4.0 */
    int LF = 10;
    /** @stable ICU 4.0 */
    int MIDNUMLET = 11;
    /** @stable ICU 4.0 */
    int NEWLINE = 12;
    /** @stable ICU 4.0 */
    int COUNT = 13;
}
/**
 * Sentence Break constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#SENTENCE_BREAK
 * @stable ICU 3.4
 */
public static interface SentenceBreak {
    /** @stable ICU 3.8 */
    int OTHER = 0;
    /** @stable ICU 3.8 */
    int ATERM = 1;
    /** @stable ICU 3.8 */
    int CLOSE = 2;
    /** @stable ICU 3.8 */
    int FORMAT = 3;
    /** @stable ICU 3.8 */
    int LOWER = 4;
    /** @stable ICU 3.8 */
    int NUMERIC = 5;
    /** @stable ICU 3.8 */
    int OLETTER = 6;
    /** @stable ICU 3.8 */
    int SEP = 7;
    /** @stable ICU 3.8 */
    int SP = 8;
    /** @stable ICU 3.8 */
    int STERM = 9;
    /** @stable ICU 3.8 */
    int UPPER = 10;
    /** @stable ICU 4.0 */
    int CR = 11;
    /** @stable ICU 4.0 */
    int EXTEND = 12;
    /** @stable ICU 4.0 */
    int LF = 13;
    /** @stable ICU 4.0 */
    int SCONTINUE = 14;
    /** @stable ICU 4.0 */
    int COUNT = 15;
}
/**
 * Line Break constants.
 * The values run from 0 upward; INSEPERABLE and INSEPARABLE deliberately
 * share the value 15, and COUNT (36) is one more than the largest value.
 * (Interface fields are implicitly public, static and final.)
 * @see UProperty#LINE_BREAK
 * @stable ICU 2.4
 */
public static interface LineBreak
{
    /** @stable ICU 2.4 */
    int UNKNOWN = 0;
    /** @stable ICU 2.4 */
    int AMBIGUOUS = 1;
    /** @stable ICU 2.4 */
    int ALPHABETIC = 2;
    /** @stable ICU 2.4 */
    int BREAK_BOTH = 3;
    /** @stable ICU 2.4 */
    int BREAK_AFTER = 4;
    /** @stable ICU 2.4 */
    int BREAK_BEFORE = 5;
    /** @stable ICU 2.4 */
    int MANDATORY_BREAK = 6;
    /** @stable ICU 2.4 */
    int CONTINGENT_BREAK = 7;
    /** @stable ICU 2.4 */
    int CLOSE_PUNCTUATION = 8;
    /** @stable ICU 2.4 */
    int COMBINING_MARK = 9;
    /** @stable ICU 2.4 */
    int CARRIAGE_RETURN = 10;
    /** @stable ICU 2.4 */
    int EXCLAMATION = 11;
    /** @stable ICU 2.4 */
    int GLUE = 12;
    /** @stable ICU 2.4 */
    int HYPHEN = 13;
    /** @stable ICU 2.4 */
    int IDEOGRAPHIC = 14;
    /**
     * Original (misspelled) name; same value as {@link #INSEPARABLE}.
     * @see #INSEPARABLE
     * @stable ICU 2.4
     */
    int INSEPERABLE = 15;
    /**
     * Renamed from the misspelled "inseperable" in Unicode 4.0.1.
     * @stable ICU 3.0
     */
    int INSEPARABLE = 15;
    /** @stable ICU 2.4 */
    int INFIX_NUMERIC = 16;
    /** @stable ICU 2.4 */
    int LINE_FEED = 17;
    /** @stable ICU 2.4 */
    int NONSTARTER = 18;
    /** @stable ICU 2.4 */
    int NUMERIC = 19;
    /** @stable ICU 2.4 */
    int OPEN_PUNCTUATION = 20;
    /** @stable ICU 2.4 */
    int POSTFIX_NUMERIC = 21;
    /** @stable ICU 2.4 */
    int PREFIX_NUMERIC = 22;
    /** @stable ICU 2.4 */
    int QUOTATION = 23;
    /** @stable ICU 2.4 */
    int COMPLEX_CONTEXT = 24;
    /** @stable ICU 2.4 */
    int SURROGATE = 25;
    /** @stable ICU 2.4 */
    int SPACE = 26;
    /** @stable ICU 2.4 */
    int BREAK_SYMBOLS = 27;
    /** @stable ICU 2.4 */
    int ZWSPACE = 28;

    /* from here on: new in Unicode 4/ICU 2.6 */
    /** @stable ICU 2.6 */
    int NEXT_LINE = 29; /*[NL]*/
    /** @stable ICU 2.6 */
    int WORD_JOINER = 30; /*[WJ]*/

    /* from here on: new in Unicode 4.1/ICU 3.4 */
    /** @stable ICU 3.4 */
    int H2 = 31;
    /** @stable ICU 3.4 */
    int H3 = 32;
    /** @stable ICU 3.4 */
    int JL = 33;
    /** @stable ICU 3.4 */
    int JT = 34;
    /** @stable ICU 3.4 */
    int JV = 35;
    /** @stable ICU 2.4 */
    int COUNT = 36;
}
/**
 * Numeric Type constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 * @see UProperty#NUMERIC_TYPE
 * @stable ICU 2.4
 */
public static interface NumericType
{
    /** @stable ICU 2.4 */
    int NONE = 0;
    /** @stable ICU 2.4 */
    int DECIMAL = 1;
    /** @stable ICU 2.4 */
    int DIGIT = 2;
    /** @stable ICU 2.4 */
    int NUMERIC = 3;
    /** @stable ICU 2.4 */
    int COUNT = 4;
}
/**
 * Hangul Syllable Type constants.
 * The values are consecutive integers starting at 0; COUNT is the number
 * of values. (Interface fields are implicitly public, static and final.)
 *
 * @see UProperty#HANGUL_SYLLABLE_TYPE
 * @stable ICU 2.6
 */
public static interface HangulSyllableType
{
    /** @stable ICU 2.6 */
    int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
    /** @stable ICU 2.6 */
    int LEADING_JAMO = 1; /*[L]*/
    /** @stable ICU 2.6 */
    int VOWEL_JAMO = 2; /*[V]*/
    /** @stable ICU 2.6 */
    int TRAILING_JAMO = 3; /*[T]*/
    /** @stable ICU 2.6 */
    int LV_SYLLABLE = 4; /*[LV]*/
    /** @stable ICU 2.6 */
    int LVT_SYLLABLE = 5; /*[LVT]*/
    /** @stable ICU 2.6 */
    int COUNT = 6;
}
// public data members -----------------------------------------------
/**
 * The lowest Unicode code point value.
 * Defined as UTF16.CODEPOINT_MIN_VALUE.
 * @stable ICU 2.1
 */
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
/**
 * The highest Unicode code point value (scalar value) according to the
 * Unicode Standard.
 * This is a 21-bit value (21 bits, rounded up).
 * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
 * @stable ICU 2.1
 */
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
/**
 * The minimum value for Supplementary code points.
 * Defined as UTF16.SUPPLEMENTARY_MIN_VALUE.
 * @stable ICU 2.1
 */
public static final int SUPPLEMENTARY_MIN_VALUE =
UTF16.SUPPLEMENTARY_MIN_VALUE;
/**
 * Unicode value used when translating into Unicode encoding form and there
 * is no existing character (U+FFFD).
 * @stable ICU 2.1
 */
public static final int REPLACEMENT_CHAR = '\uFFFD';
/**
 * Special value that is returned by getUnicodeNumericValue(int) when no
 * numeric value is defined for a code point.
 * @stable ICU 2.4
 * @see #getUnicodeNumericValue
 */
public static final double NO_NUMERIC_VALUE = -123456789;
/**
 * Compatibility constant for Java Character's MIN_RADIX.
 * @stable ICU 3.4
 */
public static final int MIN_RADIX = java.lang.Character.MIN_RADIX;
/**
 * Compatibility constant for Java Character's MAX_RADIX.
 * @stable ICU 3.4
 */
public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
/**
 * Do not lowercase non-initial parts of words when titlecasing.
 * Option bit (0x100) for titlecasing APIs that take an options bit set.
 *
 * By default, titlecasing will titlecase the first cased character
 * of a word and lowercase all other characters.
 * With this option, the other characters will not be modified.
 *
 * @see #toTitleCase
 * @stable ICU 3.8
 */
public static final int TITLECASE_NO_LOWERCASE = 0x100;
/**
 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
 * titlecase exactly the characters at breaks from the iterator.
 * Option bit (0x200) for titlecasing APIs that take an options bit set.
 *
 * By default, titlecasing will take each break iterator index,
 * adjust it by looking for the next cased character, and titlecase that one.
 * Other characters are lowercased.
 *
 * This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
 *
 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
 * cased character F. If F exists, map F to default_title(F); then map each
 * subsequent character C to default_lower(C).
 *
 * @see #toTitleCase
 * @see #TITLECASE_NO_LOWERCASE
 * @stable ICU 3.8
 */
public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
// public methods ----------------------------------------------------
/**
 * Retrieves the numeric value of a decimal digit code point.
 * <br>This method observes the semantics of
 * <code>java.lang.Character.digit()</code>. Note that this
 * will return positive values for code points for which isDigit
 * returns false, just like java.lang.Character.
 * <br><em>Semantic Change:</em> In release 1.3.1 and
 * prior, this did not treat the European letters as having a
 * digit value, and also treated numeric letters and other numbers as
 * digits.
 * This has been changed to conform to the java semantics.
 * <br>A code point is a valid digit if and only if:
 * <ul>
 *   <li>the radix is between MIN_RADIX and MAX_RADIX inclusive,
 *   <li>ch is a decimal digit or one of the european letters, and
 *   <li>the value of ch is less than the specified radix.
 * </ul>
 * @param ch the code point to query
 * @param radix the radix
 * @return the numeric value represented by the code point in the
 * specified radix, or -1 if the code point is not a decimal digit,
 * if its value is too large for the radix, or if the radix itself is
 * outside the range MIN_RADIX..MAX_RADIX
 * @stable ICU 2.1
 */
public static int digit(int ch, int radix)
{
    // java.lang.Character.digit() returns -1 for an unsupported radix;
    // without this guard an oversized radix accepted every digit value and
    // a radix below MIN_RADIX could still yield 0.
    if (radix < MIN_RADIX || radix > MAX_RADIX) {
        return -1;
    }
    // when ch is out of bounds getProperty == 0
    int props = getProperty(ch);
    int value;
    if (getNumericType(props) == NumericType.DECIMAL) {
        value = UCharacterProperty.getUnsignedValue(props);
    } else {
        // not a decimal digit: fall back to the European-letter digit value
        value = getEuropeanDigit(ch);
    }
    return (0 <= value && value < radix) ? value : -1;
}
/**
 * Retrieves the numeric value of a decimal digit code point.
 * <br>This is a convenience overload of <code>digit(int, int)</code>
 * that provides a decimal radix.
 * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
 * treated numeric letters and other numbers as digits. This has
 * been changed to conform to the java semantics.
 * @param ch the code point to query
 * @return the numeric value represented by the code point,
 * or -1 if the code point is not a decimal digit or if its
 * value is too large for a decimal radix
 * @stable ICU 2.1
 */
public static int digit(int ch)
{
    final int props = getProperty(ch);
    // Only code points whose numeric type is DECIMAL carry a digit value here.
    final boolean isDecimalDigit = getNumericType(props) == NumericType.DECIMAL;
    return isDecimalDigit ? UCharacterProperty.getUnsignedValue(props) : -1;
}
/**
* Returns the numeric value of the code point as a nonnegative
* integer.
*
If the code point does not have a numeric value, then -1 is returned.
*
* If the code point has a numeric value that cannot be represented as a
* nonnegative integer (for example, a fractional value), then -2 is
* returned.
* @param ch the code point to query
* @return the numeric value of the code point, or -1 if it has no numeric
* value, or -2 if it has a numeric value that cannot be represented as a
* nonnegative integer
* @stable ICU 2.1
*/
public static int getNumericValue(int ch)
{
// slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
int props = PROPERTY_.getProperty(ch);
int numericType = getNumericType(props);
if(numericType==0) {
return getEuropeanDigit(ch);
}
if(numericType==UCharacterProperty.NT_FRACTION || numericType>=UCharacterProperty.NT_COUNT) {
return -2;
}
int numericValue = UCharacterProperty.getUnsignedValue(props);
if(numericType
* Return results are constants from the interface
* UCharacterCategory
* NOTE: the UCharacterCategory values are not compatible with
* those returned by java.lang.Character.getType. UCharacterCategory values
* match the ones used in ICU4C, while java.lang.Character type
* values, though similar, skip the value 17.
java.lang.Character.isDigit()
. It returns true for decimal
* digits only.
* This function only returns the simple, single-code point case mapping.
* Full case mappings should be used whenever possible because they produce
* better results by working on whole strings.
* They take into account the string context and the language and can map
* to a result string with a different length as appropriate.
* Full case mappings are applied by the case mapping functions
* that take String parameters rather than code points (int).
* See also the User Guide chapter on C/POSIX migration:
* http://www.icu-project.org/userguide/posix.html#case_mappings
*
* @param ch code point whose lowercase equivalent is to be retrieved
* @return the lowercase equivalent code point
* @stable ICU 2.1
*/
public static int toLowerCase(int ch) {
    // Simple single-code-point mapping, delegated to gCsp.
    final int lowered = gCsp.tolower(ch);
    return lowered;
}
/**
 * Converts the argument code point into a String holding its UTF-16
 * representation.
 * The result has length 1 for code points below SUPPLEMENTARY_MIN_VALUE
 * and length 2 (lead surrogate followed by trail surrogate) otherwise.
 * com.ibm.icu.text.UTF16 can be used to parse Strings generated by this
 * function.
 * Up-to-date Unicode implementation of java.lang.Character.toString()
 * @param ch code point
 * @return string representation of the code point, null if the code point
 * is outside the range MIN_VALUE..MAX_VALUE
 * @stable ICU 2.1
 */
public static String toString(int ch)
{
    final boolean inRange = ch >= MIN_VALUE && ch <= MAX_VALUE;
    if (!inRange) {
        return null;
    }
    if (ch >= SUPPLEMENTARY_MIN_VALUE) {
        // supplementary code point: emit the surrogate pair, lead first
        return new StringBuffer()
            .append(UTF16.getLeadSurrogate(ch))
            .append(UTF16.getTrailSurrogate(ch))
            .toString();
    }
    return String.valueOf((char)ch);
}
/**
 * Converts the code point argument to titlecase.
 * If no titlecase is available, the uppercase is returned. If no uppercase
 * is available, the code point itself is returned.
 * Up-to-date Unicode implementation of java.lang.Character.toTitleCase()
 *
 * <p>This function only returns the simple, single-code point case mapping.
 * Full case mappings should be used whenever possible because they produce
 * better results by working on whole strings.
 * They take into account the string context and the language and can map
 * to a result string with a different length as appropriate.
 * Full case mappings are applied by the case mapping functions
 * that take String parameters rather than code points (int).
 * See also the User Guide chapter on C/POSIX migration:
 * http://www.icu-project.org/userguide/posix.html#case_mappings
 *
 * @param ch code point whose title case is to be retrieved
 * @return titlecase code point
 * @stable ICU 2.1
 */
public static int toTitleCase(int ch) {
    // Simple single-code-point mapping, delegated to gCsp.
    final int titled = gCsp.totitle(ch);
    return titled;
}
/**
 * Converts the character argument to uppercase.
 * If no uppercase is available, the character itself is returned.
 * Up-to-date Unicode implementation of java.lang.Character.toUpperCase()
 *
 * <p>This function only returns the simple, single-code point case mapping.
 * Full case mappings should be used whenever possible because they produce
 * better results by working on whole strings.
 * They take into account the string context and the language and can map
 * to a result string with a different length as appropriate.
 * Full case mappings are applied by the case mapping functions
 * that take String parameters rather than code points (int).
 * See also the User Guide chapter on C/POSIX migration:
 * http://www.icu-project.org/userguide/posix.html#case_mappings
 *
 * @param ch code point whose uppercase is to be retrieved
 * @return uppercase code point
 * @stable ICU 2.1
 */
public static int toUpperCase(int ch) {
    // Simple single-code-point mapping, delegated to gCsp.
    final int uppered = gCsp.toupper(ch);
    return uppered;
}
// extra methods not in java.lang.Character --------------------------
/**
 * Tells whether a code point lies in the supplementary range.
 * The test is inclusive at both ends: the result is true exactly when
 * ch is at least UCharacter.SUPPLEMENTARY_MIN_VALUE and at most
 * UCharacter.MAX_VALUE.
 * @param ch code point to be tested
 * @return true if code point is a supplementary character
 * @stable ICU 2.1
 */
public static boolean isSupplementary(int ch)
{
    // Everything below the supplementary minimum is rejected up front.
    if (ch < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
        return false;
    }
    return ch <= UCharacter.MAX_VALUE;
}
/**
 * Tells whether a code point is in the BMP, i.e. is not a supplementary
 * character.
 * The result is true exactly when ch is non-negative and does not exceed
 * LAST_CHAR_MASK_.
 * @param ch code point to be tested
 * @return true if code point is not a supplementary character
 * @stable ICU 2.1
 */
public static boolean isBMP(int ch)
{
    // Negative values are never valid code points.
    if (ch < 0) {
        return false;
    }
    return ch <= LAST_CHAR_MASK_;
}
/**
 * Tells whether the given code point is a printable character
 * according to the Unicode standard.
 * A code point counts as printable unless its general category is one of
 * UNASSIGNED, CONTROL, FORMAT, PRIVATE_USE, SURROGATE or
 * GENERAL_OTHER_TYPES.
 * @param ch code point to be tested
 * @return true if the code point is a printable character
 * @stable ICU 2.1
 */
public static boolean isPrintable(int ch)
{
    final int category = getType(ch);
    // Collect the excluded categories first, then negate.
    final boolean nonPrintable =
        category == UCharacterCategory.UNASSIGNED
        || category == UCharacterCategory.CONTROL
        || category == UCharacterCategory.FORMAT
        || category == UCharacterCategory.PRIVATE_USE
        || category == UCharacterCategory.SURROGATE
        || category == UCharacterCategory.GENERAL_OTHER_TYPES;
    return !nonPrintable;
}
/**
 * Tells whether the given code point is of base form.
 * The decision is made purely on the general category: the result is true
 * for the number categories (DECIMAL_DIGIT_NUMBER, OTHER_NUMBER,
 * LETTER_NUMBER), the letter categories (UPPERCASE_LETTER,
 * LOWERCASE_LETTER, TITLECASE_LETTER, MODIFIER_LETTER, OTHER_LETTER) and
 * the mark categories (NON_SPACING_MARK, ENCLOSING_MARK,
 * COMBINING_SPACING_MARK); control and format characters are therefore
 * never of base form.
 * @param ch code point to be tested
 * @return true if the code point is of base form
 * @stable ICU 2.1
 */
public static boolean isBaseForm(int ch)
{
    final int category = getType(ch);

    // numbers
    if (category == UCharacterCategory.DECIMAL_DIGIT_NUMBER
            || category == UCharacterCategory.OTHER_NUMBER
            || category == UCharacterCategory.LETTER_NUMBER) {
        return true;
    }
    // letters
    if (category == UCharacterCategory.UPPERCASE_LETTER
            || category == UCharacterCategory.LOWERCASE_LETTER
            || category == UCharacterCategory.TITLECASE_LETTER
            || category == UCharacterCategory.MODIFIER_LETTER
            || category == UCharacterCategory.OTHER_LETTER) {
        return true;
    }
    // marks
    return category == UCharacterCategory.NON_SPACING_MARK
            || category == UCharacterCategory.ENCLOSING_MARK
            || category == UCharacterCategory.COMBINING_SPACING_MARK;
}
/**
 * Returns the bidirectional class of a code point.
 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
 * property.
 * The value returned is one of the constants of the interface
 * UCharacterDirection.
 * @param ch the code point whose direction is to be determined
 * @return direction constant from UCharacterDirection.
 * @stable ICU 2.1
 */
public static int getDirection(int ch)
{
    final int bidiClass = gBdp.getClass(ch);
    return bidiClass;
}
/**
 * Tells whether the code point has the "mirrored" property.
 * This property is set for characters that are commonly used in
 * Right-To-Left contexts and need to be displayed with a "mirrored"
 * glyph.
 * @param ch code point to be tested
 * @return true if the code point has the "mirrored" property
 * @stable ICU 2.1
 */
public static boolean isMirrored(int ch)
{
    final boolean mirrored = gBdp.isMirrored(ch);
    return mirrored;
}
/**
 * Maps the given code point to a "mirror-image" code point.
 * For code points with the "mirrored" property, implementations sometimes
 * need a "poor man's" mapping to another code point such that the default
 * glyph may serve as the mirror-image of the default glyph of the
 * specified code point.
 * This is useful for text conversion to and from codepages with visual
 * order, and for displays without glyph selection capabilities.
 * @param ch code point whose mirror is to be retrieved
 * @return another code point that may serve as a mirror-image substitute,
 *         or ch itself if there is no such mapping or ch does not have the
 *         "mirrored" property
 * @stable ICU 2.1
 */
public static int getMirror(int ch)
{
    final int mirrorImage = gBdp.getMirror(ch);
    return mirrorImage;
}
/**
 * Returns the combining class of the given code point.
 * @param ch code point whose combining class is to be retrieved
 * @return the combining class of the codepoint
 * @throws IllegalArgumentException if ch lies outside the range
 *         MIN_VALUE to MAX_VALUE (both inclusive)
 * @stable ICU 2.1
 */
public static int getCombiningClass(int ch)
{
    final boolean withinRange = ch >= MIN_VALUE && ch <= MAX_VALUE;
    if (!withinRange) {
        throw new IllegalArgumentException("Codepoint out of bounds");
    }
    return NormalizerImpl.getCombiningClass(ch);
}
/**
* A code point is illegal if and only if
*
Retrieves a name for a valid codepoint. Unlike, getName(int) and * getName1_0(int), this method will return a name even for codepoints that * are not assigned a name in UnicodeData.txt. *
* The names are returned in the following order. *0<=c<=0x10ffff
.
* @return The ISO comment, or null if there is no comment for this
* character.
* @stable ICU 2.4
*/
public static String getISOComment(int ch)
{
    // Code points outside MIN_VALUE..MAX_VALUE are answered with null,
    // not with an exception.
    final boolean withinRange =
        ch >= UCharacter.MIN_VALUE && ch <= UCharacter.MAX_VALUE;
    if (!withinRange) {
        return null;
    }
    // From here on the name data is required.
    if (NAME_ == null) {
        throw new MissingResourceException("Could not load unames.icu","","");
    }
    return NAME_.getGroupName(ch, UCharacterNameChoice.ISO_COMMENT_);
}
/**
 * <p>Find a Unicode code point by its most current Unicode name and
 * return its code point value. All Unicode names are in uppercase.</p>
 * Note calling any methods related to code point names, e.g. get*Name*()
 * incurs a one-time initialisation cost to construct the name tables.
 * @param name most current Unicode character name whose code point is to
 *        be returned
 * @return code point or -1 if name is not found
 * @throws MissingResourceException if the name data (NAME_) is not available
 * @stable ICU 2.1
 */
public static int getCharFromName(String name) {
    if(NAME_==null){
        throw new MissingResourceException("Could not load unames.icu","","");
    }
    return NAME_.getCharFromName(
                 UCharacterNameChoice.UNICODE_CHAR_NAME, name);
}
/**
 * <p>Find a Unicode character by its version 1.0 Unicode name and return
 * its code point value. All Unicode names are in uppercase.</p>
 * Note calling any methods related to code point names, e.g. get*Name*()
 * incurs a one-time initialisation cost to construct the name tables.
 * @param name Unicode 1.0 code point name whose code point is to be
 *        returned
 * @return code point or -1 if name is not found
 * @throws MissingResourceException if the name data (NAME_) is not available
 * @stable ICU 2.1
 */
public static int getCharFromName1_0(String name) {
    if(NAME_==null){
        throw new MissingResourceException("Could not load unames.icu","","");
    }
    return NAME_.getCharFromName(
                 UCharacterNameChoice.UNICODE_10_CHAR_NAME, name);
}
/*
 * NOTE(review): the text below looks like the leftover description of a
 * name-lookup method whose list of search steps and whose declaration are
 * not present at this point in the file -- confirm against the upstream
 * source before relying on it.
 *
 * Find a Unicode character by either its name and return its code
 * point value. All Unicode names are in uppercase.
 * Extended names are all lowercase except for numbers and are contained
 * within angle brackets.
 * The names are searched in the following order
 */
/**
 * <p>Gets the titlecase version of the argument string.</p>
 * <p>Position for titlecasing is determined by the argument break
 * iterator, hence the user can customize his break iterator for
 * a specialized titlecasing. In this case only the forward iteration
 * needs to be implemented.
 * If the break iterator passed in is null, the default Unicode algorithm
 * will be used to determine the titlecase positions.
 * </p>
 * <p>Only positions returned by the break iterator will be title cased,
 * character in between the positions will all be in lower case.</p>
 * <p>Casing is dependent on the default locale and context-sensitive</p>
 * @param str source string to be performed on
 * @param breakiter break iterator to determine the positions in which
 *        the character should be title cased.
 * @return titlecase version of the argument string
 * @stable ICU 2.6
 */
public static String toTitleCase(String str, BreakIterator breakiter) {
    // delegates to the ULocale overload with the default ULocale
    return toTitleCase(ULocale.getDefault(), str, breakiter);
}
/**
 * Gets uppercase version of the argument string.
 * Casing is dependent on the argument locale and context-sensitive.
 * @param locale which string is to be converted in
 * @param str source string to be performed on
 * @return uppercase version of the argument string
 * @stable ICU 2.1
 */
public static String toUpperCase(Locale locale, String str) {
    // converts the java.util.Locale and delegates to the ULocale overload
    return toUpperCase(ULocale.forLocale(locale), str);
}
/**
 * Gets uppercase version of the argument string.
 * Casing is dependent on the argument locale and context-sensitive.
 * @param locale which string is to be converted in; when null,
 *        ULocale.getDefault() is used instead
 * @param str source string to be performed on
 * @return uppercase version of the argument string
 * @stable ICU 3.2
 */
public static String toUpperCase(ULocale locale, String str) {
    StringContextIterator iter = new StringContextIterator(str);
    StringBuffer result = new StringBuffer(str.length());
    int[] locCache = new int[1];
    int c;

    if (locale == null) {
        locale = ULocale.getDefault();
    }
    locCache[0]=0;

    // nextCaseMapCP() yields a negative value once the input is exhausted
    while((c=iter.nextCaseMapCP())>=0) {
        c=gCsp.toFullUpper(c, iter, result, locale, locCache);

        /* decode the result */
        if(c<0) {
            /* (not) original code point */
            c=~c;
        } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
            /* mapping already appended to result */
            continue;
        /* } else { append single-code point mapping */
        }
        // BMP code points fit in one char; anything larger goes through UTF16.append
        if(c<=0xffff) {
            result.append((char)c);
        } else {
            UTF16.append(result, c);
        }
    }
    return result.toString();
}
/**
 * Gets lowercase version of the argument string.
 * Casing is dependent on the argument locale and context-sensitive
 * @param locale which string is to be converted in
 * @param str source string to be performed on
 * @return lowercase version of the argument string
 * @stable ICU 2.1
 */
public static String toLowerCase(Locale locale, String str) {
    // converts the java.util.Locale and delegates to the ULocale overload
    return toLowerCase(ULocale.forLocale(locale), str);
}
/**
 * Gets lowercase version of the argument string.
 * Casing is dependent on the argument locale and context-sensitive
 * @param locale which string is to be converted in; when null,
 *        ULocale.getDefault() is used instead
 * @param str source string to be performed on
 * @return lowercase version of the argument string
 * @stable ICU 3.2
 */
public static String toLowerCase(ULocale locale, String str) {
    StringContextIterator iter = new StringContextIterator(str);
    StringBuffer result = new StringBuffer(str.length());
    int[] locCache = new int[1];
    int c;

    if (locale == null) {
        locale = ULocale.getDefault();
    }
    locCache[0]=0;

    // nextCaseMapCP() yields a negative value once the input is exhausted
    while((c=iter.nextCaseMapCP())>=0) {
        c=gCsp.toFullLower(c, iter, result, locale, locCache);

        /* decode the result */
        if(c<0) {
            /* (not) original code point */
            c=~c;
        } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
            /* mapping already appended to result */
            continue;
        /* } else { append single-code point mapping */
        }
        // BMP code points fit in one char; anything larger goes through UTF16.append
        if(c<=0xffff) {
            result.append((char)c);
        } else {
            UTF16.append(result, c);
        }
    }
    return result.toString();
}
/**
 * <p>Gets the titlecase version of the argument string.</p>
 * <p>Position for titlecasing is determined by the argument break
 * iterator, hence the user can customize his break iterator for
 * a specialized titlecasing. In this case only the forward iteration
 * needs to be implemented.
 * If the break iterator passed in is null, the default Unicode algorithm
 * will be used to determine the titlecase positions.
 * </p>
 * <p>Only positions returned by the break iterator will be title cased,
 * character in between the positions will all be in lower case.</p>
 * <p>Casing is dependent on the argument locale and context-sensitive</p>
 * @param locale which string is to be converted in
 * @param str source string to be performed on
 * @param breakiter break iterator to determine the positions in which
 *        the character should be title cased.
 * @return titlecase version of the argument string
 * @stable ICU 2.6
 */
public static String toTitleCase(Locale locale, String str,
                                 BreakIterator breakiter) {
    // converts the java.util.Locale and delegates to the ULocale overload
    return toTitleCase(ULocale.forLocale(locale), str, breakiter);
}
/**
 * <p>Gets the titlecase version of the argument string.</p>
 * <p>Position for titlecasing is determined by the argument break
 * iterator, hence the user can customize his break iterator for
 * a specialized titlecasing. In this case only the forward iteration
 * needs to be implemented.
 * If the break iterator passed in is null, the default Unicode algorithm
 * will be used to determine the titlecase positions.
 * </p>
 * <p>Only positions returned by the break iterator will be title cased,
 * character in between the positions will all be in lower case.</p>
 * <p>Casing is dependent on the argument locale and context-sensitive</p>
 * @param locale which string is to be converted in
 * @param str source string to be performed on
 * @param titleIter break iterator to determine the positions in which
 *        the character should be title cased.
 * @return titlecase version of the argument string
 * @stable ICU 3.2
 */
public static String toTitleCase(ULocale locale, String str,
                                 BreakIterator titleIter) {
    // delegates to the four-argument overload with an options value of 0
    return toTitleCase(locale, str, titleIter, 0);
}
/** *Gets the titlecase version of the argument string.
*Position for titlecasing is determined by the argument break * iterator, hence the user can customize his break iterator for * a specialized titlecasing. In this case only the forward iteration * needs to be implemented. * If the break iterator passed in is null, the default Unicode algorithm * will be used to determine the titlecase positions. *
*Only positions returned by the break iterator will be title cased, * character in between the positions will all be in lower case.
*Casing is dependent on the argument locale and context-sensitive
* @param locale which string is to be converted in * @param str source string to be performed on * @param titleIter break iterator to determine the positions in which * the character should be title cased. * @param options bit set to modify the titlecasing operation * @return lowercase version of the argument string * @stable ICU 3.8 * @see #TITLECASE_NO_LOWERCASE * @see #TITLECASE_NO_BREAK_ADJUSTMENT */ public static String toTitleCase(ULocale locale, String str, BreakIterator titleIter, int options) { StringContextIterator iter = new StringContextIterator(str); StringBuffer result = new StringBuffer(str.length()); int[] locCache = new int[1]; int c, nc, srcLength = str.length(); if (locale == null) { locale = ULocale.getDefault(); } locCache[0]=0; if(titleIter == null) { titleIter = BreakIterator.getWordInstance(locale); } titleIter.setText(str); int prev, titleStart, index; boolean isFirstIndex; boolean isDutch = locale.getLanguage().equals("nl"); boolean FirstIJ = true; /* set up local variables */ prev=0; isFirstIndex=true; /* titlecasing loop */ while(prevThis function only returns the simple, single-code point case mapping. * Full case mappings should be used whenever possible because they produce * better results by working on whole strings. * They can map to a result string with a different length as appropriate. * Full case mappings are applied by the case mapping functions * that take String parameters rather than code points (int). * See also the User Guide chapter on C/POSIX migration: * http://www.icu-project.org/userguide/posix.html#case_mappings * * @param ch the character to be converted * @param defaultmapping Indicates if all mappings defined in * CaseFolding.txt is to be used, otherwise the * mappings for dotted I and dotless i marked with * 'I' in CaseFolding.txt will be skipped. * @return the case folding equivalent of the character, if * any; otherwise the character itself. 
* @see #foldCase(String, boolean) * @stable ICU 2.1 */
public static int foldCase(int ch, boolean defaultmapping) {
    // Translate the boolean flag into the equivalent option constant.
    final int options;
    if (defaultmapping) {
        options = FOLD_CASE_DEFAULT;
    } else {
        options = FOLD_CASE_EXCLUDE_SPECIAL_I;
    }
    return foldCase(ch, options);
}
/**
 * Maps the given string to its case folding equivalent according to
 * UnicodeData.txt and CaseFolding.txt; any character without a case
 * folding equivalent is kept as it is.
 * "Full", multiple-code point case folding mappings are returned here.
 * For "simple" single-code point mappings use the API
 * foldCase(int ch, boolean defaultmapping).
 * @param str the String to be converted
 * @param defaultmapping Indicates if all mappings defined in
 *        CaseFolding.txt is to be used, otherwise the
 *        mappings for dotted I and dotless i marked with
 *        'I' in CaseFolding.txt will be skipped.
 * @return the case folding equivalent of the string, if
 *         any; otherwise the string itself.
 * @see #foldCase(int, boolean)
 * @stable ICU 2.1
 */
public static String foldCase(String str, boolean defaultmapping) {
    // Translate the boolean flag into the equivalent option constant.
    final int options;
    if (defaultmapping) {
        options = FOLD_CASE_DEFAULT;
    } else {
        options = FOLD_CASE_EXCLUDE_SPECIAL_I;
    }
    return foldCase(str, options);
}
/**
 * Option value for case folding: use default mappings defined in
 * CaseFolding.txt.
 * @stable ICU 2.6
 */
public static final int FOLD_CASE_DEFAULT = 0x0000;
/**
 * Option value for case folding: exclude the mappings for dotted I
 * and dotless i marked with 'I' in CaseFolding.txt.
 * @stable ICU 2.6
 */
public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = 0x0001;
/**
 * Maps the given character to its case folding equivalent according
 * to UnicodeData.txt and CaseFolding.txt; a character without a case
 * folding equivalent is returned unchanged.
 *
 * <p>Only the simple, single-code point case mapping is returned here.
 * Full case mappings should be preferred whenever possible because they
 * work on whole strings and therefore produce better results.
 * They can map to a result string with a different length as appropriate.
 * Full case mappings are applied by the case mapping functions
 * that take String parameters rather than code points (int).
 * See also the User Guide chapter on C/POSIX migration:
 * http://www.icu-project.org/userguide/posix.html#case_mappings
 *
 * @param ch the character to be converted
 * @param options A bit set for special processing. Currently the
 *        recognised options are FOLD_CASE_EXCLUDE_SPECIAL_I and
 *        FOLD_CASE_DEFAULT
 * @return the case folding equivalent of the character, if
 *         any; otherwise the character itself.
 * @see #foldCase(String, boolean)
 * @stable ICU 2.6
 */
public static int foldCase(int ch, int options) {
    final int folded = gCsp.fold(ch, options);
    return folded;
}
/**
* The given string is mapped to its case folding equivalent according to
* UnicodeData.txt and CaseFolding.txt; if any character has no case
* folding equivalent, the character itself is returned.
* "Full", multiple-code point case folding mappings are returned here.
* For "simple" single-code point mappings use the API
* foldCase(int ch, boolean defaultmapping).
* @param str the String to be converted
* @param options A bit set for special processing. Currently the recognised options are
* FOLD_CASE_EXCLUDE_SPECIAL_I and FOLD_CASE_DEFAULT
* @return the case folding equivalent of the character, if
* any; otherwise the character itself.
* @see #foldCase(int, boolean)
* @stable ICU 2.6
*/
public static final String foldCase(String str, int options) {
StringBuffer result = new StringBuffer(str.length());
int c, i, length;
length = str.length();
for(i=0; i Gets an iterator for character types, iterating over codepoints. Gets an iterator for character names, iterating over codepoints. This API only gets the iterator for the modern, most up-to-date
* Unicode names. For older 1.0 Unicode names use get1_0NameIterator() or
* for extended names use getExtendedNameIterator(). The maximal range which the name iterator iterates is from
* UCharacter.MIN_VALUE to UCharacter.MAX_VALUE. Gets an iterator for character names, iterating over codepoints. This API only gets the iterator for the older 1.0 Unicode names.
* For modern, most up-to-date Unicode names use getNameIterator() or
* for extended names use getExtendedNameIterator(). The maximal range which the name iterator iterates is from
* @return an iterator
* @stable ICU 2.6
*/
public static ValueIterator getName1_0Iterator()
{
    // Returns an iterator over the Unicode 1.0 character names.
    // Missing name data is reported with MissingResourceException, the same
    // exception type (and message) the other name-based methods of this
    // class use. It is a RuntimeException subclass, so callers that caught
    // RuntimeException keep working.
    if(NAME_==null){
        throw new MissingResourceException("Could not load unames.icu","","");
    }
    return new UCharacterNameIterator(NAME_,
                           UCharacterNameChoice.UNICODE_10_CHAR_NAME);
}
/**
* <p>Gets an iterator for character names, iterating over codepoints.</p>
* <p>This API only gets the iterator for the extended names.
* For modern, most up-to-date Unicode names use getNameIterator() or
* for older 1.0 Unicode names use getName1_0Iterator().</p>
* <p>The maximal range which the name iterator iterates is from
* UCharacter.MIN_VALUE to UCharacter.MAX_VALUE.</p>
* @return an iterator
* @stable ICU 2.6
*/
public static ValueIterator getExtendedNameIterator()
{
    // Happy path first: with name data present, hand out an iterator
    // over the extended character names.
    if (NAME_ != null) {
        return new UCharacterNameIterator(
                NAME_, UCharacterNameChoice.EXTENDED_CHAR_NAME);
    }
    // Without name data there is nothing to iterate over.
    throw new MissingResourceException("Could not load unames.icu","","");
}
/**
* Get the "age" of the code point. The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.
* This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters. The data is from the UCD file DerivedAge.txt. Check a binary Unicode property for a code point. Unicode, especially in version 3.2, defines many more properties
* than the original set in UnicodeData.txt. This API is intended to reflect Unicode properties as defined in
* the Unicode Character Database (UCD) and Unicode Technical Reports
* (UTR). For details about the properties see
* http://www.unicode.org/. For names of Unicode properties see the UCD file
* PropertyAliases.txt. This API does not check the validity of the codepoint. Important: If ICU is built with UCD files from Unicode versions
* below 3.2, then properties marked with "new" are not or
* not fully available. Check if a code point has the Alphabetic Unicode property. Same as UCharacter.hasBinaryProperty(ch, UProperty.ALPHABETIC). Different from UCharacter.isLetter(ch)! Check if a code point has the Lowercase Unicode property. Same as UCharacter.hasBinaryProperty(ch, UProperty.LOWERCASE). This is different from UCharacter.isLowerCase(ch)! Check if a code point has the Uppercase Unicode property. Same as UCharacter.hasBinaryProperty(ch, UProperty.UPPERCASE). This is different from UCharacter.isUpperCase(ch)! Check if a code point has the White_Space Unicode property. Same as UCharacter.hasBinaryProperty(ch, UProperty.WHITE_SPACE). This is different from both UCharacter.isSpace(ch) and
* UCharacter.isWhitespace(ch)! Gets the property value for an Unicode property type of a code point.
* Also returns binary and mask property values. Unicode, especially in version 3.2, defines many more properties than
* the original set in UnicodeData.txt. The properties APIs are intended to reflect Unicode properties as
* defined in the Unicode Character Database (UCD) and Unicode Technical
* Reports (UTR). For details about the properties see
* http://www.unicode.org/. For names of Unicode properties see the UCD file PropertyAliases.txt.
*
*
* RangeValueIterator iterator = UCharacter.getTypeIterator();
* RangeValueIterator.Element element = new RangeValueIterator.Element();
* while (iterator.next(element)) {
* System.out.println("Codepoint \\u" +
* Integer.toHexString(element.start) +
* " to codepoint \\u" +
* Integer.toHexString(element.limit - 1) +
* " has the character type " +
* element.value);
* }
*
* @return an iterator
* @stable ICU 2.6
*/
public static RangeValueIterator getTypeIterator()
{
    // A fresh iterator over the character property data is built per call.
    final RangeValueIterator typeRanges = new UCharacterTypeIterator(PROPERTY_);
    return typeRanges;
}
/**
*
*
* ValueIterator iterator = UCharacter.getNameIterator();
* ValueIterator.Element element = new ValueIterator.Element();
* while (iterator.next(element)) {
* System.out.println("Codepoint \\u" +
* Integer.toHexString(element.codepoint) +
* " has the name " + (String)element.value);
* }
*
*
*
* ValueIterator iterator = UCharacter.getName1_0Iterator();
* ValueIterator.Element element = new ValueIterator.Element();
* while (iterator.next(element)) {
* System.out.println("Codepoint \\u" +
* Integer.toHexString(element.codepoint) +
* " has the name " + (String)element.value);
* }
*
*
*
* ValueIterator iterator = UCharacter.getExtendedNameIterator();
* ValueIterator.Element element = new ValueIterator.Element();
* while (iterator.next(element)) {
* System.out.println("Codepoint \\u" +
* Integer.toHexString(element.codepoint) +
* " has the name " + (String)element.value);
* }
*
*
* Sample usage:
* int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
* int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
* boolean b = (ideo == 1) ? true : false;
*
* @param ch code point to test.
* @param type UProperty selector constant, identifies which binary
* property to check. Must be
* UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
* UProperty.INT_START <= type < UProperty.INT_LIMIT or
* UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
* @return numeric value that is directly the property value or,
* for enumerated properties, corresponds to the numeric value of
* the enumerated constant of the respective property value
* enumeration type (cast to enum type if necessary).
* Returns 0 or 1 (for false / true) for binary Unicode properties.
* Returns a bit-mask for mask properties.
* Returns 0 if 'type' is out of bounds or if the Unicode version
* does not have data for the property at all, or not for this code
* point.
* @see UProperty
* @see #hasBinaryProperty
* @see #getIntPropertyMinValue
* @see #getIntPropertyMaxValue
* @see #getUnicodeVersion
* @stable ICU 2.4
*/
public static int getIntPropertyValue(int ch, int type)
{
if (type < UProperty.BINARY_START) {
return 0; // undefined
}
else if (type < UProperty.BINARY_LIMIT) {
return hasBinaryProperty(ch, type) ? 1 : 0;
}
else if (type < UProperty.INT_START) {
return 0; // undefined
}
else if (type < UProperty.INT_LIMIT) {
//int result = 0;
switch (type) {
case UProperty.BIDI_CLASS:
return getDirection(ch);
case UProperty.BLOCK:
return UnicodeBlock.idOf(ch);
case UProperty.CANONICAL_COMBINING_CLASS:
return getCombiningClass(ch);
case UProperty.DECOMPOSITION_TYPE:
return PROPERTY_.getAdditional(ch, 2)
& DECOMPOSITION_TYPE_MASK_;
case UProperty.EAST_ASIAN_WIDTH:
return (PROPERTY_.getAdditional(ch, 0)
& EAST_ASIAN_MASK_) >> EAST_ASIAN_SHIFT_;
case UProperty.GENERAL_CATEGORY:
return getType(ch);
case UProperty.JOINING_GROUP:
return gBdp.getJoiningGroup(ch);
case UProperty.JOINING_TYPE:
return gBdp.getJoiningType(ch);
case UProperty.LINE_BREAK:
return (int)(PROPERTY_.getAdditional(ch, LB_VWORD)& LB_MASK)>>LB_SHIFT;
case UProperty.NUMERIC_TYPE:
type=getNumericType(PROPERTY_.getProperty(ch));
if(type>NumericType.NUMERIC) {
/* keep internal variants of NumericType.NUMERIC from becoming visible */
type=NumericType.NUMERIC;
}
return type;
case UProperty.SCRIPT:
return UScript.getScript(ch);
case UProperty.HANGUL_SYLLABLE_TYPE:
/* purely algorithmic; hardcode known characters, check for assigned new ones */
if(ch
*
* For undefined UProperty constant values, min/max values will be 0/-1.
* @param type UProperty selector constant, identifies which binary
* property to check. Must be
* UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
* UProperty.INT_START <= type < UProperty.INT_LIMIT.
* @return Maximum value returned by u_getIntPropertyValue for a Unicode
* property. <= 0 if the property selector 'type' is out of range.
* @see UProperty
* @see #hasBinaryProperty
* @see #getUnicodeVersion
* @see #getIntPropertyMinValue
* @see #getIntPropertyValue
* @stable ICU 2.4
*/
public static int getIntPropertyMaxValue(int type)
{
    // Selectors below the binary range are undefined.
    if (type < UProperty.BINARY_START) {
        return -1;
    }
    // Binary properties only ever yield 0 or 1.
    if (type < UProperty.BINARY_LIMIT) {
        return 1;
    }
    // Everything else outside the int-property range is undefined as well.
    if (type < UProperty.INT_START || type >= UProperty.INT_LIMIT) {
        return -1;
    }

    final int maxValue;
    switch (type) {
    case UProperty.BIDI_CLASS:
    case UProperty.JOINING_GROUP:
    case UProperty.JOINING_TYPE:
        // these maxima are supplied by the bidi properties object
        maxValue = gBdp.getMaxValue(type);
        break;
    case UProperty.BLOCK:
        maxValue = (PROPERTY_.getMaxValues(0) & BLOCK_MASK_) >> BLOCK_SHIFT_;
        break;
    case UProperty.CANONICAL_COMBINING_CLASS:
    case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
    case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
        // TODO do we need to be more precise, getting the actual maximum?
        maxValue = 0xff;
        break;
    case UProperty.DECOMPOSITION_TYPE:
        maxValue = PROPERTY_.getMaxValues(2) & DECOMPOSITION_TYPE_MASK_;
        break;
    case UProperty.EAST_ASIAN_WIDTH:
        maxValue = (PROPERTY_.getMaxValues(0) & EAST_ASIAN_MASK_)
                       >> EAST_ASIAN_SHIFT_;
        break;
    case UProperty.GENERAL_CATEGORY:
        maxValue = UCharacterCategory.CHAR_CATEGORY_COUNT - 1;
        break;
    case UProperty.LINE_BREAK:
        maxValue = (PROPERTY_.getMaxValues(LB_VWORD) & LB_MASK) >> LB_SHIFT;
        break;
    case UProperty.NUMERIC_TYPE:
        maxValue = NumericType.COUNT - 1;
        break;
    case UProperty.SCRIPT:
        maxValue = PROPERTY_.getMaxValues(0) & SCRIPT_MASK_;
        break;
    case UProperty.HANGUL_SYLLABLE_TYPE:
        maxValue = HangulSyllableType.COUNT-1;
        break;
    case UProperty.NFD_QUICK_CHECK:
    case UProperty.NFKD_QUICK_CHECK:
        // YES -- these are never "maybe", only "no" or "yes"
        maxValue = 1;
        break;
    case UProperty.NFC_QUICK_CHECK:
    case UProperty.NFKC_QUICK_CHECK:
        // MAYBE
        maxValue = 2;
        break;
    case UProperty.GRAPHEME_CLUSTER_BREAK:
        maxValue = (PROPERTY_.getMaxValues(2) & GCB_MASK) >> GCB_SHIFT;
        break;
    case UProperty.SENTENCE_BREAK:
        maxValue = (PROPERTY_.getMaxValues(2) & SB_MASK) >> SB_SHIFT;
        break;
    case UProperty.WORD_BREAK:
        maxValue = (PROPERTY_.getMaxValues(2) & WB_MASK) >> WB_SHIFT;
        break;
    default:
        // int-range selector without a known maximum: undefined
        maxValue = -1;
        break;
    }
    return maxValue;
}
/**
 * Provide the java.lang.Character forDigit API, for convenience.
 * The call is forwarded unchanged to
 * java.lang.Character.forDigit(int, int).
 * @param digit the number to convert to a character
 * @param radix the radix
 * @return whatever java.lang.Character.forDigit returns for the same
 *         arguments
 * @stable ICU 3.0
 */
public static char forDigit(int digit, int radix) {
    final char digitChar = java.lang.Character.forDigit(digit, radix);
    return digitChar;
}
// JDK 1.5 API coverage
/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest high (lead) surrogate value; alias of
 * UTF16.LEAD_SURROGATE_MIN_VALUE.
 * @see UTF16#LEAD_SURROGATE_MIN_VALUE
 * @stable ICU 3.0
 */
public static final char MIN_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MIN_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Largest high (lead) surrogate value; alias of
 * UTF16.LEAD_SURROGATE_MAX_VALUE.
 * @see UTF16#LEAD_SURROGATE_MAX_VALUE
 * @stable ICU 3.0
 */
public static final char MAX_HIGH_SURROGATE = UTF16.LEAD_SURROGATE_MAX_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest low (trail) surrogate value; alias of
 * UTF16.TRAIL_SURROGATE_MIN_VALUE.
 * @see UTF16#TRAIL_SURROGATE_MIN_VALUE
 * @stable ICU 3.0
 */
public static final char MIN_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MIN_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Largest low (trail) surrogate value; alias of
 * UTF16.TRAIL_SURROGATE_MAX_VALUE.
 * @see UTF16#TRAIL_SURROGATE_MAX_VALUE
 * @stable ICU 3.0
 */
public static final char MAX_LOW_SURROGATE = UTF16.TRAIL_SURROGATE_MAX_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest surrogate value; alias of UTF16.SURROGATE_MIN_VALUE.
 * @see UTF16#SURROGATE_MIN_VALUE
 * @stable ICU 3.0
 */
public static final char MIN_SURROGATE = UTF16.SURROGATE_MIN_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Largest surrogate value; alias of UTF16.SURROGATE_MAX_VALUE.
 * @see UTF16#SURROGATE_MAX_VALUE
 * @stable ICU 3.0
 */
public static final char MAX_SURROGATE = UTF16.SURROGATE_MAX_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest supplementary code point; alias of
 * UTF16.SUPPLEMENTARY_MIN_VALUE.
 * @see UTF16#SUPPLEMENTARY_MIN_VALUE
 * @stable ICU 3.0
 */
public static final int MIN_SUPPLEMENTARY_CODE_POINT = UTF16.SUPPLEMENTARY_MIN_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Largest code point; alias of UTF16.CODEPOINT_MAX_VALUE.
 * @see UTF16#CODEPOINT_MAX_VALUE
 * @stable ICU 3.0
 */
public static final int MAX_CODE_POINT = UTF16.CODEPOINT_MAX_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * Smallest code point; alias of UTF16.CODEPOINT_MIN_VALUE.
 * @see UTF16#CODEPOINT_MIN_VALUE
 * @stable ICU 3.0
 */
public static final int MIN_CODE_POINT = UTF16.CODEPOINT_MIN_VALUE;
/**
 * Cover the JDK 1.5 API, for convenience.
 * A value is accepted when it is non-negative and does not exceed
 * MAX_CODE_POINT.
 * @param cp the code point to check
 * @return true if cp is a valid code point
 * @stable ICU 3.0
 */
public static final boolean isValidCodePoint(int cp) {
    if (cp < 0) {
        return false;
    }
    return cp <= MAX_CODE_POINT;
}
/**
 * Cover the JDK 1.5 API, for convenience.
 * A value is accepted when it is at least UTF16.SUPPLEMENTARY_MIN_VALUE
 * and at most UTF16.CODEPOINT_MAX_VALUE.
 * @param cp the code point to check
 * @return true if cp is a supplementary code point
 * @stable ICU 3.0
 */
public static final boolean isSupplementaryCodePoint(int cp) {
    if (cp < UTF16.SUPPLEMENTARY_MIN_VALUE) {
        return false;
    }
    return cp <= UTF16.CODEPOINT_MAX_VALUE;
}
/**
 * Cover the JDK 1.5 API, for convenience.
 * A char is accepted when it lies between MIN_HIGH_SURROGATE and
 * MAX_HIGH_SURROGATE, both inclusive.
 * @param ch the char to check
 * @return true if ch is a high (lead) surrogate
 * @stable ICU 3.0
 */
public static boolean isHighSurrogate(char ch) {
    if (ch < MIN_HIGH_SURROGATE) {
        return false;
    }
    return ch <= MAX_HIGH_SURROGATE;
}
/**
 * Cover the JDK 1.5 API, for convenience.
 * A char is accepted when it lies between MIN_LOW_SURROGATE and
 * MAX_LOW_SURROGATE, both inclusive.
 * @param ch the char to check
 * @return true if ch is a low (trail) surrogate
 * @stable ICU 3.0
 */
public static boolean isLowSurrogate(char ch) {
    if (ch < MIN_LOW_SURROGATE) {
        return false;
    }
    return ch <= MAX_LOW_SURROGATE;
}
/**
 * Cover the JDK 1.5 API, for convenience. Return true if the chars
 * form a valid surrogate pair, i.e. the first is a high surrogate and the
 * second is a low surrogate.
 * @param high the high (lead) char
 * @param low the low (trail) char
 * @return true if high, low form a surrogate pair
 * @stable ICU 3.0
 */
public static final boolean isSurrogatePair(char high, char low) {
    // The trail char is only examined once the lead char has qualified.
    if (!isHighSurrogate(high)) {
        return false;
    }
    return isLowSurrogate(low);
}
/**
 * Cover for the JDK 1.5 API, provided for convenience. Returns the number of
 * chars needed to represent the code point, as computed by
 * {@link UTF16#getCharCount}. The code point is not checked for validity.
 * @param cp the code point to check
 * @return the number of chars needed to represent the code point
 * @see UTF16#getCharCount
 * @stable ICU 3.0
 */
public static int charCount(int cp) {
    final int unitCount = UTF16.getCharCount(cp);
    return unitCount;
}
/**
 * Cover for the JDK 1.5 API, provided for convenience. Combines a surrogate
 * pair into the code point it represents. The pair is not checked for
 * validity; the arguments are handed straight to
 * UCharacterProperty.getRawSupplementary.
 * @param high the high (lead) surrogate
 * @param low the low (trail) surrogate
 * @return the code point formed by the surrogate pair
 * @stable ICU 3.0
 */
public static final int toCodePoint(char high, char low) {
    final int combined = UCharacterProperty.getRawSupplementary(high, low);
    return combined;
}
/**
* Cover the JDK 1.5 API, for convenience. Return the code point at index.
* <p>Note: the semantics of this API are different from the related UTF16
* API. This examines only the characters at index and index+1.
* @param seq the characters to check
* @param index the index of the first or only char forming the code point
* @return the code point at the index
* @stable ICU 3.0
*/
//#if defined(FOUNDATION10) || defined(J2SE13)
//## public static final int codePointAt(String seq, int index) {
//## char c1 = seq.charAt(index++);
//## if (isHighSurrogate(c1)) {
//## if (index < seq.length()) {
//## char c2 = seq.charAt(index);
//## if (isLowSurrogate(c2)) {
//## return toCodePoint(c1, c2);
//## }
//## }
//## }
//## return c1;
//## }
//## public static final int codePointAt(StringBuffer seq, int index) {
//## return codePointAt(seq.toString(), index);
//## }
//#else
//#if defined(ECLIPSE_FRAGMENT)
//## public static final int codePointAt(String seq, int index) {
//## return codePointAt((CharSequence)seq, index);
//## }
//## public static final int codePointAt(StringBuffer seq, int index) {
//## return codePointAt((CharSequence)seq, index);
//## }
//#endif
public static final int codePointAt(CharSequence seq, int index) {
    // No explicit bounds check here: an invalid index surfaces from charAt.
    final char first = seq.charAt(index);
    final int next = index + 1;
    // Anything other than a lead surrogate with a following unit stands alone.
    if (!isHighSurrogate(first) || next >= seq.length()) {
        return first;
    }
    final char second = seq.charAt(next);
    // An unpaired lead surrogate is returned as-is.
    return isLowSurrogate(second) ? toCodePoint(first, second) : first;
}
//#endif
/**
 * Cover for the JDK 1.5 API, provided for convenience. Returns the code point
 * at index.
 * <p>Note: the semantics of this API are different from the related UTF16
 * API. This examines only the characters at index and index+1.
 * @param text the characters to check
 * @param index the index of the first or only char forming the code point
 * @return the code point at the index
 * @stable ICU 3.0
 */
public static final int codePointAt(char[] text, int index) {
    // No explicit bounds check here: an invalid index surfaces from the array access.
    final char first = text[index];
    final int next = index + 1;
    // Anything other than a lead surrogate with a following unit stands alone.
    if (!isHighSurrogate(first) || next >= text.length) {
        return first;
    }
    final char second = text[next];
    // An unpaired lead surrogate is returned as-is.
    return isLowSurrogate(second) ? toCodePoint(first, second) : first;
}
/**
 * Cover for the JDK 1.5 API, provided for convenience. Returns the code point
 * at index, never reading at or beyond limit.
 * <p>Note: the semantics of this API are different from the related UTF16
 * API. This examines only the characters at index and index+1.
 * @param text the characters to check
 * @param index the index of the first or only char forming the code point
 * @param limit the limit of the valid text
 * @return the code point at the index
 * @throws IndexOutOfBoundsException if index is not below limit, or limit
 *         exceeds the array length
 * @stable ICU 3.0
 */
public static final int codePointAt(char[] text, int index, int limit) {
    final boolean inRange = index < limit && limit <= text.length;
    if (!inRange) {
        throw new IndexOutOfBoundsException();
    }
    final char first = text[index];
    final int next = index + 1;
    // A lead surrogate sitting right at the limit has no partner to look at.
    if (!isHighSurrogate(first) || next >= limit) {
        return first;
    }
    final char second = text[next];
    return isLowSurrogate(second) ? toCodePoint(first, second) : first;
}
/**
* Cover the JDK 1.5 API, for convenience. Return the code point before index.
* <p>Note: the semantics of this API are different from the related UTF16
* API. This examines only the characters at index-1 and index-2.
* @param seq the characters to check
* @param index the index after the last or only char forming the code point
* @return the code point before the index
* @stable ICU 3.0
*/
//#if defined(FOUNDATION10) || defined(J2SE13)
//## public static final int codePointBefore(String seq, int index) {
//## char c2 = seq.charAt(--index);
//## if (isLowSurrogate(c2)) {
//## if (index > 0) {
//## char c1 = seq.charAt(--index);
//## if (isHighSurrogate(c1)) {
//## return toCodePoint(c1, c2);
//## }
//## }
//## }
//## return c2;
//## }
//## public static final int codePointBefore(StringBuffer seq, int index) {
//## return codePointBefore(seq.toString(), index);
//## }
//#else
//#if defined(ECLIPSE_FRAGMENT)
//## public static final int codePointBefore(String seq, int index) {
//## return codePointBefore((CharSequence)seq, index);
//## }
//## public static final int codePointBefore(StringBuffer seq, int index) {
//## return codePointBefore((CharSequence)seq, index);
//## }
//#endif
public static final int codePointBefore(CharSequence seq, int index) {
    // No explicit bounds check here: an invalid index surfaces from charAt.
    final int last = index - 1;
    final char trail = seq.charAt(last);
    // Only a trail surrogate with something in front of it can be part of a pair.
    if (!isLowSurrogate(trail) || last <= 0) {
        return trail;
    }
    final char lead = seq.charAt(last - 1);
    // An unpaired trail surrogate is returned as-is.
    return isHighSurrogate(lead) ? toCodePoint(lead, trail) : trail;
}
//#endif
/**
 * Cover for the JDK 1.5 API, provided for convenience. Returns the code point
 * before index.
 * <p>Note: the semantics of this API are different from the related UTF16
 * API. This examines only the characters at index-1 and index-2.
 * @param text the characters to check
 * @param index the index after the last or only char forming the code point
 * @return the code point before the index
 * @stable ICU 3.0
 */
public static final int codePointBefore(char[] text, int index) {
    // No explicit bounds check here: an invalid index surfaces from the array access.
    final int last = index - 1;
    final char trail = text[last];
    // Only a trail surrogate with something in front of it can be part of a pair.
    if (!isLowSurrogate(trail) || last <= 0) {
        return trail;
    }
    final char lead = text[last - 1];
    // An unpaired trail surrogate is returned as-is.
    return isHighSurrogate(lead) ? toCodePoint(lead, trail) : trail;
}
/**
 * Cover for the JDK 1.5 API, provided for convenience. Returns the code point
 * before index, never reading below limit.
 * <p>Note: the semantics of this API are different from the related UTF16
 * API. This examines only the characters at index-1 and index-2.
 * @param text the characters to check
 * @param index the index after the last or only char forming the code point
 * @param limit the start of the valid text
 * @return the code point before the index
 * @throws IndexOutOfBoundsException if index is not above limit, or limit is
 *         negative
 * @stable ICU 3.0
 */
public static final int codePointBefore(char[] text, int index, int limit) {
    final boolean inRange = index > limit && limit >= 0;
    if (!inRange) {
        throw new IndexOutOfBoundsException();
    }
    final int last = index - 1;
    final char trail = text[last];
    // A trail surrogate sitting right at the start has no partner to look at.
    if (!isLowSurrogate(trail) || last <= limit) {
        return trail;
    }
    final char lead = text[last - 1];
    return isHighSurrogate(lead) ? toCodePoint(lead, trail) : trail;
}
/**
 * Cover the JDK 1.5 API, for convenience. Writes the chars representing the
 * code point into the destination at the given index.
 * <p>The write is all-or-nothing: when a supplementary code point does not
 * fit (fewer than two slots are available at dstIndex), the destination is
 * left untouched rather than receiving a lone lead surrogate.
 * @param cp the code point to convert
 * @param dst the destination array into which to put the char(s) representing the code point
 * @param dstIndex the index at which to put the first (or only) char
 * @return the count of the number of chars written (1 or 2)
 * @throws IllegalArgumentException if cp is not a valid code point
 * @throws ArrayIndexOutOfBoundsException if dst cannot hold the char(s) at dstIndex
 * @stable ICU 3.0
 */
public static final int toChars(int cp, char[] dst, int dstIndex) {
    if (cp >= 0) {
        if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
            dst[dstIndex] = (char)cp;
            return 1;
        }
        if (cp <= MAX_CODE_POINT) {
            // Validate both slots before writing either one, so that a
            // too-small destination never ends up with half a surrogate pair.
            if (dstIndex < 0 || dstIndex >= dst.length - 1) {
                int badIndex = (dstIndex < 0 || dstIndex >= dst.length)
                        ? dstIndex : dstIndex + 1;
                throw new ArrayIndexOutOfBoundsException(badIndex);
            }
            dst[dstIndex] = UTF16.getLeadSurrogate(cp);
            dst[dstIndex + 1] = UTF16.getTrailSurrogate(cp);
            return 2;
        }
    }
    throw new IllegalArgumentException();
}
/**
 * Cover for the JDK 1.5 API, provided for convenience. Returns a freshly
 * allocated char array holding the one or two chars that represent the
 * code point.
 * @param cp the code point to convert
 * @return an array containing the char(s) representing the code point
 * @throws IllegalArgumentException if cp is not a valid code point
 * @stable ICU 3.0
 */
public static final char[] toChars(int cp) {
    if (cp < 0) {
        throw new IllegalArgumentException();
    }
    // Values below the supplementary range fit in a single char.
    if (cp < MIN_SUPPLEMENTARY_CODE_POINT) {
        final char[] single = new char[1];
        single[0] = (char)cp;
        return single;
    }
    if (cp > MAX_CODE_POINT) {
        throw new IllegalArgumentException();
    }
    final char[] pair = new char[2];
    pair[0] = UTF16.getLeadSurrogate(cp);
    pair[1] = UTF16.getTrailSurrogate(cp);
    return pair;
}
/**
 * Cover for the JDK API, provided for convenience. Returns a byte representing
 * the directionality of the character; the value is the result of
 * {@link #getDirection} narrowed to a byte.
 * <p>Note: Unlike the JDK, this returns DIRECTIONALITY_LEFT_TO_RIGHT for
 * undefined or out-of-bounds characters.
 * <p>Note: The return value must be tested using the constants defined in
 * {@link UCharacterEnums.ECharacterDirection} since the values are different
 * from the ones defined by <code>java.lang.Character</code>.
 * @param cp the code point to check
 * @return the directionality of the code point
 * @see #getDirection
 * @stable ICU 3.0
 */
public static byte getDirectionality(int cp)
{
    final int direction = getDirection(cp);
    return (byte)direction;
}
/**
* Cover the JDK API, for convenience. Count the number of code points in the range of text.
* @param text the characters to check
* @param start the start of the range
* @param limit the limit of the range
* @return the number of code points in the range
* @stable ICU 3.0
*/
//#if defined(FOUNDATION10) || defined(J2SE13)
//## public static int codePointCount(String text, int start, int limit) {
//## if (start < 0 || limit < start || limit > text.length()) {
//## throw new IndexOutOfBoundsException("start (" + start +
//## ") or limit (" + limit +
//## ") invalid or out of range 0, " + text.length());
//## }
//##
//## int len = limit - start;
//## while (limit > start) {
//## char ch = text.charAt(--limit);
//## while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && limit > start) {
//## ch = text.charAt(--limit);
//## if (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE) {
//## --len;
//## break;
//## }
//## }
//## }
//## return len;
//## }
//## public static int codePointCount(StringBuffer text, int start, int limit) {
//## return codePointCount(text.toString(), start, limit);
//## }
//#else
//#if defined(ECLIPSE_FRAGMENT)
//## public static int codePointCount(String text, int start, int limit) {
//## return codePointCount((CharSequence)text, start, limit);
//## }
//## public static int codePointCount(StringBuffer text, int start, int limit) {
//## return codePointCount((CharSequence)text, start, limit);
//## }
//#endif
public static int codePointCount(CharSequence text, int start, int limit) {
    final int textLength = text.length();
    if (start < 0 || limit < start || limit > textLength) {
        throw new IndexOutOfBoundsException("start (" + start +
                                            ") or limit (" + limit +
                                            ") invalid or out of range 0, " + textLength);
    }
    // Begin with one code point per char, then take one off for every
    // well-formed surrogate pair found while scanning backwards.
    int count = limit - start;
    for (int i = limit - 1; i > start; --i) {
        final char trail = text.charAt(i);
        if (trail >= MIN_LOW_SURROGATE && trail <= MAX_LOW_SURROGATE) {
            final char lead = text.charAt(i - 1);
            if (lead >= MIN_HIGH_SURROGATE && lead <= MAX_HIGH_SURROGATE) {
                --count;
                --i; // the lead unit is consumed together with its trail
            }
        }
    }
    return count;
}
//#endif
/**
 * Cover for the JDK API, provided for convenience. Counts the code points in
 * the given range of text. A lead surrogate immediately followed by a trail
 * surrogate counts as one code point; every other char, including an
 * unpaired surrogate, counts as one.
 * @param text the characters to check
 * @param start the start of the range
 * @param limit the limit of the range
 * @return the number of code points in the range
 * @throws IndexOutOfBoundsException if start is negative, limit is less than
 *         start, or limit exceeds the array length
 * @stable ICU 3.0
 */
public static int codePointCount(char[] text, int start, int limit) {
    final int textLength = text.length;
    if (start < 0 || limit < start || limit > textLength) {
        throw new IndexOutOfBoundsException("start (" + start +
                                            ") or limit (" + limit +
                                            ") invalid or out of range 0, " + textLength);
    }
    // Begin with one code point per char, then take one off for every
    // well-formed surrogate pair found while scanning backwards.
    int count = limit - start;
    for (int i = limit - 1; i > start; --i) {
        final char trail = text[i];
        if (trail >= MIN_LOW_SURROGATE && trail <= MAX_LOW_SURROGATE) {
            final char lead = text[i - 1];
            if (lead >= MIN_HIGH_SURROGATE && lead <= MAX_HIGH_SURROGATE) {
                --count;
                --i; // the lead unit is consumed together with its trail
            }
        }
    }
    return count;
}
/**
* Cover the JDK API, for convenience. Adjust the char index by a code point offset.
* @param text the characters to check
* @param index the index to adjust
* @param codePointOffset the number of code points by which to offset the index
* @return the adjusted index
* @stable ICU 3.0
*/
//#if defined(FOUNDATION10) || defined(J2SE13)
//## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
//## if (index < 0 || index > text.length()) {
//## throw new IndexOutOfBoundsException("index ( " + index +
//## ") out of range 0, " + text.length());
//## }
//##
//## if (codePointOffset < 0) {
//## while (++codePointOffset <= 0) {
//## char ch = text.charAt(--index);
//## while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
//## ch = text.charAt(--index);
//## if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
//## if (++codePointOffset > 0) {
//## return index+1;
//## }
//## }
//## }
//## }
//## } else {
//## int limit = text.length();
//## while (--codePointOffset >= 0) {
//## char ch = text.charAt(index++);
//## while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
//## ch = text.charAt(index++);
//## if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
//## if (--codePointOffset < 0) {
//## return index-1;
//## }
//## }
//## }
//## }
//## }
//##
//## return index;
//## }
//## public static int offsetByCodePoints(StringBuffer text, int index, int codePointOffset) {
//## return offsetByCodePoints(text.toString(), index, codePointOffset);
//## }
//#else
//#if defined(ECLIPSE_FRAGMENT)
//## public static int offsetByCodePoints(String text, int index, int codePointOffset) {
//## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
//## }
//## public static int offsetByCodePoints(StringBuffer text, int index, int codePointOffset) {
//## return offsetByCodePoints((CharSequence)text, index, codePointOffset);
//## }
//#endif
public static int offsetByCodePoints(CharSequence text, int index, int codePointOffset) {
// index may equal text.length() (a position just past the last char).
if (index < 0 || index > text.length()) {
throw new IndexOutOfBoundsException("index ( " + index +
") out of range 0, " + text.length());
}
if (codePointOffset < 0) {
// Moving backwards: each outer iteration consumes one code point.
while (++codePointOffset <= 0) {
// No explicit check for running off the front: that case is left to charAt.
char ch = text.charAt(--index);
// A trail surrogate may pair with the char before it, so look one further back.
while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > 0) {
ch = text.charAt(--index);
if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
// The previous char is not a lead surrogate, so it is a code point of
// its own and uses up one more step of the offset. If the offset is
// already exhausted, it must not be consumed: step forward over it again.
if (++codePointOffset > 0) {
return index+1;
}
}
}
}
} else {
int limit = text.length();
// Moving forwards: each outer iteration consumes one code point.
while (--codePointOffset >= 0) {
// No explicit check for running off the end: that case is left to charAt.
char ch = text.charAt(index++);
// A lead surrogate may pair with the char after it, so look one further on.
while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
ch = text.charAt(index++);
if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
// The next char is not a trail surrogate, so it is a code point of
// its own and uses up one more step of the offset. If the offset is
// already exhausted, it must not be consumed: step back over it again.
if (--codePointOffset < 0) {
return index-1;
}
}
}
}
}
return index;
}
//#endif
/**
* Cover the JDK API, for convenience. Adjust the char index by a code point offset.
* Only the subrange of text from start (inclusive) to start + count (exclusive)
* is considered when pairing surrogates and when checking bounds.
* @param text the characters to check
* @param start the start of the range to check
* @param count the length of the range to check
* @param index the index to adjust
* @param codePointOffset the number of code points by which to offset the index
* @return the adjusted index
* @throws IndexOutOfBoundsException if the range does not lie within the array,
*         if index lies outside the range, or if the offset would move the
*         index outside the range
* @stable ICU 3.0
*/
public static int offsetByCodePoints(char[] text, int start, int count, int index, int codePointOffset) {
int limit = start + count;
// index may equal limit (a position just past the last char of the range).
if (start < 0 || limit < start || limit > text.length || index < start || index > limit) {
throw new IndexOutOfBoundsException("index ( " + index +
") out of range " + start +
", " + limit +
" in array 0, " + text.length);
}
if (codePointOffset < 0) {
// Moving backwards: each outer iteration consumes one code point.
while (++codePointOffset <= 0) {
// Note that the array element is read before the range check below runs.
char ch = text[--index];
if (index < start) {
throw new IndexOutOfBoundsException("index ( " + index +
") < start (" + start +
")");
}
// A trail surrogate may pair with the char before it, so look one further back.
while (ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE && index > start) {
ch = text[--index];
if (ch < MIN_HIGH_SURROGATE || ch > MAX_HIGH_SURROGATE) {
// The previous char is not a lead surrogate, so it is a code point of
// its own and uses up one more step of the offset. If the offset is
// already exhausted, it must not be consumed: step forward over it again.
if (++codePointOffset > 0) {
return index+1;
}
}
}
}
} else {
// Moving forwards: each outer iteration consumes one code point.
while (--codePointOffset >= 0) {
// Note that the array element is read before the range check below runs.
char ch = text[index++];
if (index > limit) {
throw new IndexOutOfBoundsException("index ( " + index +
") > limit (" + limit +
")");
}
// A lead surrogate may pair with the char after it, so look one further on.
while (ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE && index < limit) {
ch = text[index++];
if (ch < MIN_LOW_SURROGATE || ch > MAX_LOW_SURROGATE) {
// The next char is not a trail surrogate, so it is a code point of
// its own and uses up one more step of the offset. If the offset is
// already exhausted, it must not be consumed: step back over it again.
if (--codePointOffset < 0) {
return index-1;
}
}
}
}
}
return index;
}
// package-private data members --------------------------------------
/**
* Database storing the sets of character names.
* Assigned by the static initializer below.
*/
static UCharacterName NAME_ = null;
/**
* Singleton object encapsulating the imported pnames.icu property aliases.
* Assigned by the static initializer below.
*/
static UPropertyAliases PNAMES_ = null;
// Block to initialise the property-alias and character-name databases.
static {
try {
PNAMES_ = new UPropertyAliases();
NAME_ = UCharacterName.getInstance();
} catch (IOException e) {
// e.printStackTrace();
// A load failure is reported as a MissingResourceException that carries
// only the IOException's message; class name and key are empty strings.
throw new MissingResourceException(e.getMessage(),"","");
//throw new RuntimeException(e.getMessage());
// NOTE(review): an older note at this spot said "do NOT throw an exception",
// because ICU might be built modularly without names.icu and pnames.icu.
// The code currently does throw, so that note does not describe the
// present behaviour -- confirm which of the two is intended.
}
}
// private variables -------------------------------------------------
/**
* Database storing the sets of character property
*/
private static final UCharacterProperty PROPERTY_;
/**
* For optimization: direct references to the trie index array, trie data
* array and trie initial value held by PROPERTY_, copied once in the static
* initializer below.
*/
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int PROPERTY_INITIAL_VALUE_;
/**
* Case mapping properties: the UCaseProps singleton, or its dummy
* replacement if loading failed (see the static initializer below).
*/
private static final UCaseProps gCsp;
/**
* Bidi/shaping properties: the UBiDiProps singleton, or its dummy
* replacement if loading failed (see the static initializer below).
*/
private static final UBiDiProps gBdp;
// block to initialise character property database
static
{
try
{
PROPERTY_ = UCharacterProperty.getInstance();
PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
}
catch (Exception e)
{
// Any failure here, checked or unchecked, is reported as a
// MissingResourceException carrying only the original message.
throw new MissingResourceException(e.getMessage(),"","");
}
/*
* In ICU4J 3.2, most Unicode properties were loaded from uprops.icu.
* ICU4J 3.4 adds ucase.icu for case mapping properties and
* ubidi.icu for bidi/shaping properties and
* removes case/bidi/shaping properties from uprops.icu.
*
* Loading of uprops.icu was always done during class loading of UCharacter.class.
* In order to maintain performance for all such properties,
* ucase.icu and ubidi.icu are also loaded during class loading of UCharacter.class.
* It will not fail if they are missing.
* These data items are loaded early to avoid having to synchronize access to them,
* for thread safety and performance.
*
* We try to load these data items at most once.
* If it works, we use the resulting singleton object.
* If it fails, then we get a dummy object, which always works unless
* we are seriously out of memory.
* After UCharacter.class loading, we have a never-changing pointer to either the
* real singleton or the dummy.
*
* This approach is used in Unicode properties APIs that
* do not have a service object and also do not have an error code parameter.
* Other API implementations get the singleton themselves
* (synchronized), store it in the service object, and report errors.
*/
// A local is used so that the final field is assigned exactly once,
// whichever of the two paths is taken.
UCaseProps csp;
try {
csp=UCaseProps.getSingleton();
} catch(IOException e) {
csp=UCaseProps.getDummy();
}
gCsp=csp;
// Same pattern for the bidi/shaping properties.
UBiDiProps bdp;
try {
bdp=UBiDiProps.getSingleton();
} catch(IOException e) {
bdp=UBiDiProps.getDummy();
}
gBdp=bdp;
}
/**
* Mask selecting the low 16 bits of a value, i.e. the last character
* out of a wider data type
*/
private static final int LAST_CHAR_MASK_ = 0xFFFF;
// /**
// * To get the last byte out from a data type
// */
// private static final int LAST_BYTE_MASK_ = 0xFF;
//
// /**
// * Shift 16 bits
// */
// private static final int SHIFT_16_ = 16;
//
// /**
// * Shift 24 bits
// */
// private static final int SHIFT_24_ = 24;
//
// /**
// * Decimal radix
// */
// private static final int DECIMAL_RADIX_ = 10;
/**
* No break space code point (U+00A0)
*/
private static final int NO_BREAK_SPACE_ = 0xA0;
/**
* Figure space code point (U+2007)
*/
private static final int FIGURE_SPACE_ = 0x2007;
/**
* Narrow no break space code point (U+202F)
*/
private static final int NARROW_NO_BREAK_SPACE_ = 0x202F;
/**
* Ideographic number zero code point (U+3007)
*/
private static final int IDEOGRAPHIC_NUMBER_ZERO_ = 0x3007;
/**
 * CJK ideograph for the number one (U+4E00)
 */
private static final int CJK_IDEOGRAPH_FIRST_ = 0x4e00;
/**
 * CJK ideograph for the number two (U+4E8C)
 */
private static final int CJK_IDEOGRAPH_SECOND_ = 0x4e8c;
/**
 * CJK ideograph for the number three (U+4E09)
 */
private static final int CJK_IDEOGRAPH_THIRD_ = 0x4e09;
/**
 * CJK ideograph for the number four (U+56DB).
 * This was previously 0x56d8, which is a different ideograph with no
 * numeric meaning.
 */
private static final int CJK_IDEOGRAPH_FOURTH_ = 0x56db;
/**
 * CJK ideograph for the number five (U+4E94)
 */
private static final int CJK_IDEOGRAPH_FIFTH_ = 0x4e94;
/**
 * CJK ideograph for the number six (U+516D)
 */
private static final int CJK_IDEOGRAPH_SIXTH_ = 0x516d;
/**
 * CJK ideograph for the number seven (U+4E03)
 */
private static final int CJK_IDEOGRAPH_SEVENTH_ = 0x4e03;
/**
 * CJK ideograph for the number eight (U+516B)
 */
private static final int CJK_IDEOGRAPH_EIGHTH_ = 0x516b;
/**
 * CJK ideograph for the number nine (U+4E5D).
 * (The constant name keeps its historical spelling.)
 */
private static final int CJK_IDEOGRAPH_NINETH_ = 0x4e5d;
/**
* Application Program command code point (U+009F)
*/
private static final int APPLICATION_PROGRAM_COMMAND_ = 0x009F;
/**
* Unit separator code point (U+001F)
*/
private static final int UNIT_SEPARATOR_ = 0x001F;
/**
* Delete code point (U+007F)
*/
private static final int DELETE_ = 0x007F;
/*
* ISO control character first range upper limit 0x0 - 0x1F
*/
//private static final int ISO_CONTROL_FIRST_RANGE_MAX_ = 0x1F;
/**
* Shift to get numeric type: the field starts at bit 5 of the property word
*/
private static final int NUMERIC_TYPE_SHIFT_ = 5;
/**
* Mask to get numeric type: a three-bit field (bits 7..5) of the property word
*/
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
/* encoding of fractional and large numbers */
//private static final int MAX_SMALL_NUMBER=0xff;
private static final int FRACTION_NUM_SHIFT=3; /* numerator: bits 7..3 */
private static final int FRACTION_DEN_MASK=7; /* denominator: bits 2..0 */
//private static final int FRACTION_MAX_NUM=31;
private static final int FRACTION_DEN_OFFSET=2; /* denominator values are 2..9 */
//private static final int FRACTION_MIN_DEN=FRACTION_DEN_OFFSET;
//private static final int FRACTION_MAX_DEN=FRACTION_MIN_DEN+FRACTION_DEN_MASK;
private static final int LARGE_MANT_SHIFT=4; /* mantissa: bits 7..4 */
private static final int LARGE_EXP_MASK=0xf; /* exponent: bits 3..0 */
private static final int LARGE_EXP_OFFSET=2; /* regular exponents 2..17 */
private static final int LARGE_EXP_OFFSET_EXTRA=18; /* extra large exponents 18..33 */
//private static final int LARGE_MIN_EXP=LARGE_EXP_OFFSET;
//private static final int LARGE_MAX_EXP=LARGE_MIN_EXP+LARGE_EXP_MASK;
//private static final int LARGE_MAX_EXP_EXTRA=LARGE_EXP_OFFSET_EXTRA+LARGE_EXP_MASK;
/**
 * Han digit characters: ordinary and "complex" (formal/financial) forms of
 * the numerals, plus the ideographs for ten, hundred, thousand, ten thousand
 * and hundred million.
 */
private static final int CJK_IDEOGRAPH_COMPLEX_ZERO_ = 0x96f6;
private static final int CJK_IDEOGRAPH_COMPLEX_ONE_ = 0x58f9;
private static final int CJK_IDEOGRAPH_COMPLEX_TWO_ = 0x8cb3;
private static final int CJK_IDEOGRAPH_COMPLEX_THREE_ = 0x53c3;
private static final int CJK_IDEOGRAPH_COMPLEX_FOUR_ = 0x8086;
private static final int CJK_IDEOGRAPH_COMPLEX_FIVE_ = 0x4f0d;
private static final int CJK_IDEOGRAPH_COMPLEX_SIX_ = 0x9678;
private static final int CJK_IDEOGRAPH_COMPLEX_SEVEN_ = 0x67d2;
private static final int CJK_IDEOGRAPH_COMPLEX_EIGHT_ = 0x634c;
private static final int CJK_IDEOGRAPH_COMPLEX_NINE_ = 0x7396;
private static final int CJK_IDEOGRAPH_TEN_ = 0x5341;
private static final int CJK_IDEOGRAPH_COMPLEX_TEN_ = 0x62fe;
private static final int CJK_IDEOGRAPH_HUNDRED_ = 0x767e;
private static final int CJK_IDEOGRAPH_COMPLEX_HUNDRED_ = 0x4f70;
private static final int CJK_IDEOGRAPH_THOUSAND_ = 0x5343;
private static final int CJK_IDEOGRAPH_COMPLEX_THOUSAND_ = 0x4edf;
// U+842C is the ideograph for ten thousand. The value here was previously
// 0x824c (two digits transposed), which is an unrelated ideograph.
private static final int CJK_IDEOGRAPH_TEN_THOUSAND_ = 0x842c;
private static final int CJK_IDEOGRAPH_HUNDRED_MILLION_ = 0x5104;
// /**
// * Zero Width Non Joiner.
// * Equivalent to icu4c ZWNJ.
// */
// private static final int ZERO_WIDTH_NON_JOINER_ = 0x200c;
// /**
// * Zero Width Joiner
// * Equivalent to icu4c ZWJ.
// */
// private static final int ZERO_WIDTH_JOINER_ = 0x200d;
/*
* Properties in vector word 2
* Bits
* 31..26 reserved
* 25..20 Line Break
* 19..15 Sentence Break
* 14..10 Word Break
* 9.. 5 Grapheme Cluster Break
* 4.. 0 Decomposition Type
*
* Each *_MASK below selects one of these fields and the matching *_SHIFT
* is the bit position of the field's lowest bit.
*/
private static final int LB_MASK = 0x03f00000;
private static final int LB_SHIFT = 20;
// Index of the vector word described above (word 2).
private static final int LB_VWORD = 2;
private static final int SB_MASK = 0x000f8000;
private static final int SB_SHIFT = 15;
private static final int WB_MASK = 0x00007c00;
private static final int WB_SHIFT = 10;
private static final int GCB_MASK = 0x000003e0;
private static final int GCB_SHIFT = 5;
/**
* Integer properties mask for decomposition type (bits 4..0 of vector word 2).
* Equivalent to icu4c UPROPS_DT_MASK.
*/
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/*
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 23..20 reserved
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 7.. 0 UScriptCode
*/
/**
* Integer properties mask for East Asian cell width (bits 19..17).
* Equivalent to icu4c UPROPS_EA_MASK
*/
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
/**
* Integer properties shift for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_SHIFT
*/
private static final int EAST_ASIAN_SHIFT_ = 17;
/**
* Integer properties mask for blocks (bits 16..8).
* Equivalent to icu4c UPROPS_BLOCK_MASK
*/
private static final int BLOCK_MASK_ = 0x0001ff00;
/**
* Integer properties shift for blocks.
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
*/
private static final int BLOCK_SHIFT_ = 8;
/**
* Integer properties mask for scripts (bits 7..0; no shift is needed).
* Equivalent to icu4c UPROPS_SHIFT_MASK
* NOTE(review): the icu4c name above is presumably meant to be the script
* mask (UPROPS_SCRIPT_MASK) -- confirm against icu4c.
*/
private static final int SCRIPT_MASK_ = 0x000000ff;
// private constructor -----------------------------------------------
///CLOVER:OFF
/**
 * Private constructor; this class exposes only static members and is not
 * meant to be instantiated.
 */
private UCharacter() {
    // intentionally empty
}
///CLOVER:ON
// private methods ---------------------------------------------------
/**
 * Returns the digit value of a Latin letter used as a digit in radixes above
 * ten: 'A' - 'Z' and 'a' - 'z', in both their normal (U+0041.., U+0061..) and
 * full-width (U+FF21.., U+FF41..) forms. 'A'/'a' map to 10 and 'Z'/'z' to 35.
 * This method assumes that the other digit characters are checked by the
 * calling method.
 * @param ch character to test
 * @return -1 if ch is not one of the letters described above, otherwise
 *         its corresponding digit will be returned.
 */
private static int getEuropeanDigit(int ch) {
    // ASCII upper case A..Z
    if (ch >= 0x41 && ch <= 0x5a) {
        return ch - 0x41 + 10;
    }
    // ASCII lower case a..z
    if (ch >= 0x61 && ch <= 0x7a) {
        return ch - 0x61 + 10;
    }
    // full-width upper case
    if (ch >= 0xff21 && ch <= 0xff3a) {
        return ch - 0xff21 + 10;
    }
    // full-width lower case
    if (ch >= 0xff41 && ch <= 0xff5a) {
        return ch - 0xff41 + 10;
    }
    return -1;
}
/**
 * Extracts the numeric type field from a property word by masking with
 * NUMERIC_TYPE_MASK_ and shifting right by NUMERIC_TYPE_SHIFT_.
 * @param props 32 bit property
 * @return the numeric type
 */
private static int getNumericType(int props)
{
    final int typeBits = props & NUMERIC_TYPE_MASK_;
    return typeBits >> NUMERIC_TYPE_SHIFT_;
}
/**
* Gets the property value at the index.
* This is optimized.
* Note this is a little different from CharTrie: the index into m_trieData_
* is never negative.
* This is a duplicate of UCharacterProperty.getProperty. For optimization
* purposes, this method reads the trie index and data arrays directly instead
* of going through UCharacterProperty.getProperty.
* Values outside the code point range (negative, or above
* UTF16.CODEPOINT_MAX_VALUE) yield the trie's initial value.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
* @stable ICU 2.6
*/
private static final int getProperty(int ch)
{
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
// BMP codepoint 0000..D7FF or DC00..FFFF
// (negative values also take this branch and are handled by the catch below)
try { // using try for ch < 0 is faster than using an if statement
// The index entry is scaled by 4 (<< 2) and the low five bits of ch
// select the entry within that block.
return PROPERTY_TRIE_DATA_[
(PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
+ (ch & 0x1f)];
} catch (ArrayIndexOutOfBoundsException e) {
return PROPERTY_INITIAL_VALUE_;
}
}
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
// lead surrogate D800..DBFF
// Looked up as a code point: the index position is offset by
// (0x2800 >> 5) relative to the plain BMP lookup above.
return PROPERTY_TRIE_DATA_[
(PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
+ (ch & 0x1f)];
}
// for optimization
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
// supplementary code point 10000..10FFFF
// look at the construction of supplementary characters
// trail forms the ends of it.
// The second argument is the low ten bits of ch, i.e. the offset part
// of the trail surrogate rather than the trail surrogate itself.
return PROPERTY_.m_trie_.getSurrogateValue(
UTF16.getLeadSurrogate(ch),
(char)(ch & 0x3ff));
}
// ch is above the code point range.
// return m_dataOffset_ if there is an error, in this case we return
// the default value: m_initialValue_
// we cannot assume that m_initialValue_ is at offset 0
// this is for optimization.
return PROPERTY_INITIAL_VALUE_;
}
}