2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
10 import java.io.BufferedInputStream;
\r
11 import java.io.InputStream;
\r
12 import java.io.IOException;
\r
13 import java.util.MissingResourceException;
\r
15 import com.ibm.icu.lang.UCharacter;
\r
16 import com.ibm.icu.lang.UCharacterCategory;
\r
17 import com.ibm.icu.lang.UProperty;
\r
18 import com.ibm.icu.text.Normalizer;
\r
19 import com.ibm.icu.text.UnicodeSet;
\r
20 import com.ibm.icu.text.UTF16;
\r
21 import com.ibm.icu.util.RangeValueIterator;
\r
22 import com.ibm.icu.util.VersionInfo;
\r
24 import com.ibm.icu.impl.NormalizerImpl;
\r
27 * <p>Internal class used for Unicode character property database.</p>
\r
28 * <p>This class stores binary data read from uprops.icu.
\r
29 * It does not have the capability to parse the data into more high-level
\r
30 * information. It only returns bytes of information when required.</p>
\r
31 * <p>Due to the form most commonly used for retrieval, array of char is used
\r
32 * to store the binary data.</p>
\r
33 * <p>UCharacterPropertyDB also contains information on accessing indexes to
\r
34 * significant points in the binary data.</p>
\r
35 * <p>Responsibility for molding the binary data into a more meaningful form lies on
\r
36 * <a href=UCharacter.html>UCharacter</a>.</p>
\r
37 * @author Syn Wee Quek
\r
38 * @since release 2.1, february 1st 2002
\r
41 public final class UCharacterProperty
\r
43 // public data members -----------------------------------------------
\r
48 public CharTrie m_trie_;
\r
51 * CharTrie index array
\r
53 public char[] m_trieIndex_;
\r
56 * CharTrie data array
\r
58 public char[] m_trieData_;
\r
61 * CharTrie initial value, returned by getProperty() when a code point has no data
\r
63 public int m_trieInitialValue_;
\r
67 public VersionInfo m_unicodeVersion_;
\r
69 * Latin capital letter i with dot above
\r
71 public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
\r
73 * Latin small letter dotless i
\r
75 public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
\r
79 public static final char LATIN_SMALL_LETTER_I_ = 0x69;
\r
81 * Character type mask
\r
83 public static final int TYPE_MASK = 0x1F;
\r
85 // uprops.h enum UPropertySource --------------------------------------- ***
\r
87 /** No source, not a supported property. */
\r
88 public static final int SRC_NONE=0;
\r
89 /** From uchar.c/uprops.icu main trie */
\r
90 public static final int SRC_CHAR=1;
\r
91 /** From uchar.c/uprops.icu properties vectors trie */
\r
92 public static final int SRC_PROPSVEC=2;
\r
93 /** Hangul_Syllable_Type, from uchar.c/uprops.icu */
\r
94 public static final int SRC_HST=3;
\r
95 /** From unames.c/unames.icu */
\r
96 public static final int SRC_NAMES=4;
\r
97 /** From unorm.cpp/unorm.icu */
\r
98 public static final int SRC_NORM=5;
\r
99 /** From ucase.c/ucase.icu */
\r
100 public static final int SRC_CASE=6;
\r
101 /** From ubidi_props.c/ubidi.icu */
\r
102 public static final int SRC_BIDI=7;
\r
103 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
\r
104 public static final int SRC_CHAR_AND_PROPSVEC=8;
\r
105 /** One more than the highest UPropertySource (SRC_) constant. */
\r
106 public static final int SRC_COUNT=9;
\r
108 // public methods ----------------------------------------------------
\r
111 * Java friends implementation
\r
113 public void setIndexData(CharTrie.FriendAgent friendagent)
\r
115 m_trieIndex_ = friendagent.getPrivateIndex();
\r
116 m_trieData_ = friendagent.getPrivateData();
\r
117 m_trieInitialValue_ = friendagent.getPrivateInitialValue();
\r
121 * Gets the property value at the index.
\r
122 * This is optimized.
\r
123 * Note this is a little different from CharTrie: the index into m_trieData_
\r
124 * is never negative.
\r
125 * @param ch code point whose property value is to be retrieved
\r
126 * @return property value of code point
\r
128 public final int getProperty(int ch)
\r
130 if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
\r
131 || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
\r
132 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
\r
133 // BMP codepoint 0000..D7FF or DC00..FFFF
\r
135 try { // using try for ch < 0 is faster than using an if statement
\r
136 return m_trieData_[
\r
137 (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
\r
138 << Trie.INDEX_STAGE_2_SHIFT_)
\r
139 + (ch & Trie.INDEX_STAGE_3_MASK_)];
\r
140 } catch (ArrayIndexOutOfBoundsException e) {
\r
141 return m_trieInitialValue_;
\r
144 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
145 // lead surrogate D800..DBFF
\r
146 return m_trieData_[
\r
147 (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
\r
148 + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
\r
149 << Trie.INDEX_STAGE_2_SHIFT_)
\r
150 + (ch & Trie.INDEX_STAGE_3_MASK_)];
\r
152 if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
\r
153 // supplementary code point 10000..10FFFF
\r
154 // look at the construction of supplementary characters
\r
155 // trail forms the ends of it.
\r
156 return m_trie_.getSurrogateValue(
\r
157 UTF16.getLeadSurrogate(ch),
\r
158 (char)(ch & Trie.SURROGATE_MASK_));
\r
160 // ch is out of bounds
\r
161 // return m_dataOffset_ if there is an error, in this case we return
\r
162 // the default value: m_initialValue_
\r
163 // we cannot assume that m_initialValue_ is at offset 0
\r
164 // this is for optimization.
\r
165 return m_trieInitialValue_;
\r
167 // this all is an inlined form of return m_trie_.getCodePointValue(ch);
\r
171 * Getting the signed numeric value of a character embedded in the property
\r
173 * @param prop the character
\r
174 * @return signed numeric value
\r
176 // public static int getSignedValue(int prop)
\r
178 // return ((short)prop >> VALUE_SHIFT_);
\r
182 * Getting the unsigned numeric value of a character embedded in the property
\r
184 * @param prop the property value of the character
\r
185 * @return unsigned numeric value
\r
187 public static int getUnsignedValue(int prop)
\r
189 return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
\r
192 /* internal numeric pseudo-types for special encodings of numeric values */
\r
193 public static final int NT_FRACTION=4; /* ==UCharacter.NumericType.COUNT, must not change unless binary format version changes */
\r
194 public static final int NT_LARGE=5;
\r
195 public static final int NT_COUNT=6;
\r
198 * Gets the unicode additional properties.
\r
199 * C version getUnicodeProperties.
\r
200 * @param codepoint codepoint whose additional properties are to be
\r
203 * @return unicode properties
\r
205 public int getAdditional(int codepoint, int column) {
\r
206 if (column == -1) {
\r
207 return getProperty(codepoint);
\r
209 if (column < 0 || column >= m_additionalColumnsCount_) {
\r
212 return m_additionalVectors_[
\r
213 m_additionalTrie_.getCodePointValue(codepoint) + column];
\r
216 static final int MY_MASK = UCharacterProperty.TYPE_MASK
\r
217 & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
\r
218 (1<<UCharacterCategory.LOWERCASE_LETTER) |
\r
219 (1<<UCharacterCategory.TITLECASE_LETTER) |
\r
220 (1<<UCharacterCategory.MODIFIER_LETTER) |
\r
221 (1<<UCharacterCategory.OTHER_LETTER));
\r
225 * <p>Get the "age" of the code point.</p>
\r
226 * <p>The "age" is the Unicode version when the code point was first
\r
227 * designated (as a non-character or for Private Use) or assigned a
\r
229 * <p>This can be useful to avoid emitting code points to receiving
\r
230 * processes that do not accept newer characters.</p>
\r
231 * <p>The data is from the UCD file DerivedAge.txt.</p>
\r
232 * <p>This API does not check the validity of the codepoint.</p>
\r
233 * @param codepoint The code point.
\r
234 * @return the Unicode version number
\r
236 public VersionInfo getAge(int codepoint)
\r
238 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
\r
239 return VersionInfo.getInstance(
\r
240 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
\r
241 version & LAST_NIBBLE_MASK_, 0, 0);
\r
244 private static final long UNSIGNED_INT_MASK = 0xffffffffL;
\r
246 private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
\r
247 private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
\r
248 private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
\r
249 private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
\r
250 private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
\r
251 private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
\r
252 /** Mask constant for multiple UCharCategory bits (Z Separators). */
\r
253 private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
\r
256 * Checks if c is in
\r
257 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
\r
258 * with space=\p{Whitespace} and Control=Cc.
\r
259 * Implements UCHAR_POSIX_GRAPH.
\r
262 private static final boolean isgraphPOSIX(int c) {
\r
263 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
\r
264 /* comparing ==0 returns FALSE for the categories mentioned */
\r
265 return (getMask(UCharacter.getType(c))&
\r
266 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
\r
270 private static final class BinaryProperties{
\r
273 public BinaryProperties(int column,long mask){
\r
274 this.column = column;
\r
278 BinaryProperties[] binProps={
\r
280 * column and mask values for binary properties from u_getUnicodeProperties().
\r
281 * Must be in order of corresponding UProperty,
\r
282 * and there must be exactly one entry per binary UProperty.
\r
284 new BinaryProperties( 1, ( 1 << ALPHABETIC_PROPERTY_) ),
\r
285 new BinaryProperties( 1, ( 1 << ASCII_HEX_DIGIT_PROPERTY_) ),
\r
286 new BinaryProperties( SRC_BIDI, 0 ), /* UCHAR_BIDI_CONTROL */
\r
287 new BinaryProperties( SRC_BIDI, 0 ), /* UCHAR_BIDI_MIRRORED */
\r
288 new BinaryProperties( 1, ( 1 << DASH_PROPERTY_) ),
\r
289 new BinaryProperties( 1, ( 1 << DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_) ),
\r
290 new BinaryProperties( 1, ( 1 << DEPRECATED_PROPERTY_) ),
\r
291 new BinaryProperties( 1, ( 1 << DIACRITIC_PROPERTY_) ),
\r
292 new BinaryProperties( 1, ( 1 << EXTENDER_PROPERTY_) ),
\r
293 new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_FULL_COMPOSITION_EXCLUSION */
\r
294 new BinaryProperties( 1, ( 1 << GRAPHEME_BASE_PROPERTY_) ),
\r
295 new BinaryProperties( 1, ( 1 << GRAPHEME_EXTEND_PROPERTY_) ),
\r
296 new BinaryProperties( 1, ( 1 << GRAPHEME_LINK_PROPERTY_) ),
\r
297 new BinaryProperties( 1, ( 1 << HEX_DIGIT_PROPERTY_) ),
\r
298 new BinaryProperties( 1, ( 1 << HYPHEN_PROPERTY_) ),
\r
299 new BinaryProperties( 1, ( 1 << ID_CONTINUE_PROPERTY_) ),
\r
300 new BinaryProperties( 1, ( 1 << ID_START_PROPERTY_) ),
\r
301 new BinaryProperties( 1, ( 1 << IDEOGRAPHIC_PROPERTY_) ),
\r
302 new BinaryProperties( 1, ( 1 << IDS_BINARY_OPERATOR_PROPERTY_) ),
\r
303 new BinaryProperties( 1, ( 1 << IDS_TRINARY_OPERATOR_PROPERTY_) ),
\r
304 new BinaryProperties( SRC_BIDI, 0 ), /* UCHAR_JOIN_CONTROL */
\r
305 new BinaryProperties( 1, ( 1 << LOGICAL_ORDER_EXCEPTION_PROPERTY_) ),
\r
306 new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_LOWERCASE */
\r
307 new BinaryProperties( 1, ( 1 << MATH_PROPERTY_) ),
\r
308 new BinaryProperties( 1, ( 1 << NONCHARACTER_CODE_POINT_PROPERTY_) ),
\r
309 new BinaryProperties( 1, ( 1 << QUOTATION_MARK_PROPERTY_) ),
\r
310 new BinaryProperties( 1, ( 1 << RADICAL_PROPERTY_) ),
\r
311 new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_SOFT_DOTTED */
\r
312 new BinaryProperties( 1, ( 1 << TERMINAL_PUNCTUATION_PROPERTY_) ),
\r
313 new BinaryProperties( 1, ( 1 << UNIFIED_IDEOGRAPH_PROPERTY_) ),
\r
314 new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_UPPERCASE */
\r
315 new BinaryProperties( 1, ( 1 << WHITE_SPACE_PROPERTY_) ),
\r
316 new BinaryProperties( 1, ( 1 << XID_CONTINUE_PROPERTY_) ),
\r
317 new BinaryProperties( 1, ( 1 << XID_START_PROPERTY_) ),
\r
318 new BinaryProperties( SRC_CASE, 0 ), /* UCHAR_CASE_SENSITIVE */
\r
319 new BinaryProperties( 1, ( 1 << S_TERM_PROPERTY_) ),
\r
320 new BinaryProperties( 1, ( 1 << VARIATION_SELECTOR_PROPERTY_) ),
\r
321 new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_NFD_INERT */
\r
322 new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_NFKD_INERT */
\r
323 new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_NFC_INERT */
\r
324 new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_NFKC_INERT */
\r
325 new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_SEGMENT_STARTER */
\r
326 new BinaryProperties( 1, ( 1 << PATTERN_SYNTAX) ),
\r
327 new BinaryProperties( 1, ( 1 << PATTERN_WHITE_SPACE) ),
\r
328 new BinaryProperties( SRC_CHAR_AND_PROPSVEC, 0 ), /* UCHAR_POSIX_ALNUM */
\r
329 new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_BLANK */
\r
330 new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_GRAPH */
\r
331 new BinaryProperties( SRC_CHAR, 0 ), /* UCHAR_POSIX_PRINT */
\r
332 new BinaryProperties( SRC_CHAR, 0 ) /* UCHAR_POSIX_XDIGIT */
\r
337 * <p>Check a binary Unicode property for a code point.</p>
\r
338 * <p>Unicode, especially in version 3.2, defines many more properties
\r
339 * than the original set in UnicodeData.txt.</p>
\r
340 * <p>This API is intended to reflect Unicode properties as defined in
\r
341 * the Unicode Character Database (UCD) and Unicode Technical Reports
\r
343 * <p>For details about the properties see
\r
344 * <a href=http://www.unicode.org/>http://www.unicode.org/</a>.</p>
\r
345 * <p>For names of Unicode properties see the UCD file
\r
346 * PropertyAliases.txt.</p>
\r
347 * <p>This API does not check the validity of the codepoint.</p>
\r
348 * <p>Important: If ICU is built with UCD files from Unicode versions
\r
349 * below 3.2, then properties marked with "new" are not or
\r
350 * not fully available.</p>
\r
351 * @param codepoint Code point to test.
\r
352 * @param property selector constant from com.ibm.icu.lang.UProperty,
\r
353 * identifies which binary property to check.
\r
354 * @return true or false according to the binary Unicode property value
\r
355 * for codepoint. Also false if property is out of bounds or if the
\r
356 * Unicode version does not have data for the property at all, or
\r
357 * not for this code point.
\r
358 * @see com.ibm.icu.lang.UProperty
\r
361 public boolean hasBinaryProperty(int codepoint, int property)
\r
363 if(property <UProperty.BINARY_START || UProperty.BINARY_LIMIT<=property) {
\r
364 // not a known binary property
\r
367 long mask=binProps[property].mask;
\r
368 int column=binProps[property].column;
\r
370 // systematic, directly stored properties
\r
371 return ((UNSIGNED_INT_MASK & getAdditional(codepoint, column)) & mask)!=0;
\r
373 if(column==SRC_CASE) {
\r
374 /* case mapping properties */
\r
377 csp = UCaseProps.getSingleton();
\r
378 } catch (IOException e) {
\r
382 case UProperty.LOWERCASE:
\r
383 return UCaseProps.LOWER==csp.getType(codepoint);
\r
384 case UProperty.UPPERCASE:
\r
385 return UCaseProps.UPPER==csp.getType(codepoint);
\r
386 case UProperty.SOFT_DOTTED:
\r
387 return csp.isSoftDotted(codepoint);
\r
388 case UProperty.CASE_SENSITIVE:
\r
389 return csp.isCaseSensitive(codepoint);
\r
393 } else if(column==SRC_NORM) {
\r
394 /* normalization properties from unorm.icu */
\r
396 case UProperty.FULL_COMPOSITION_EXCLUSION:
\r
397 return NormalizerImpl.isFullCompositionExclusion(codepoint);
\r
398 case UProperty.NFD_INERT:
\r
399 return Normalizer.isNFSkippable(codepoint, Normalizer.NFD);
\r
400 case UProperty.NFKD_INERT:
\r
401 return Normalizer.isNFSkippable(codepoint, Normalizer.NFKD);
\r
402 case UProperty.NFC_INERT:
\r
403 return Normalizer.isNFSkippable(codepoint, Normalizer.NFC);
\r
404 case UProperty.NFKC_INERT:
\r
405 return Normalizer.isNFSkippable(codepoint, Normalizer.NFKC);
\r
406 case UProperty.SEGMENT_STARTER:
\r
407 return NormalizerImpl.isCanonSafeStart(codepoint);
\r
411 } else if(column==SRC_BIDI) {
\r
412 /* bidi/shaping properties */
\r
415 bdp = UBiDiProps.getSingleton();
\r
416 } catch (IOException e) {
\r
420 case UProperty.BIDI_MIRRORED:
\r
421 return bdp.isMirrored(codepoint);
\r
422 case UProperty.BIDI_CONTROL:
\r
423 return bdp.isBidiControl(codepoint);
\r
424 case UProperty.JOIN_CONTROL:
\r
425 return bdp.isJoinControl(codepoint);
\r
429 } else if(column==SRC_CHAR) {
\r
431 case UProperty.POSIX_BLANK:
\r
432 // "horizontal space"
\r
433 if(codepoint<=0x9f) {
\r
434 return codepoint==9 || codepoint==0x20; /* TAB or SPACE */
\r
437 return UCharacter.getType(codepoint)==UCharacter.SPACE_SEPARATOR;
\r
439 case UProperty.POSIX_GRAPH:
\r
440 return isgraphPOSIX(codepoint);
\r
441 case UProperty.POSIX_PRINT:
\r
443 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
\r
445 * The only cntrl character in graph+blank is TAB (in blank).
\r
446 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
\r
448 return (UCharacter.getType(codepoint)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(codepoint);
\r
449 case UProperty.POSIX_XDIGIT:
\r
450 /* check ASCII and Fullwidth ASCII a-fA-F */
\r
452 (codepoint<=0x66 && codepoint>=0x41 && (codepoint<=0x46 || codepoint>=0x61)) ||
\r
453 (codepoint>=0xff21 && codepoint<=0xff46 && (codepoint<=0xff26 || codepoint>=0xff41))
\r
458 return UCharacter.getType(codepoint)==UCharacter.DECIMAL_DIGIT_NUMBER;
\r
462 } else if(column==SRC_CHAR_AND_PROPSVEC) {
\r
464 case UProperty.POSIX_ALNUM:
\r
465 return UCharacter.isUAlphabetic(codepoint) || UCharacter.isDigit(codepoint);
\r
475 public final int getSource(int which) {
\r
476 if(which<UProperty.BINARY_START) {
\r
477 return SRC_NONE; /* undefined */
\r
478 } else if(which<UProperty.BINARY_LIMIT) {
\r
479 if(binProps[which].mask!=0) {
\r
480 return SRC_PROPSVEC;
\r
482 return binProps[which].column;
\r
484 } else if(which<UProperty.INT_START) {
\r
485 return SRC_NONE; /* undefined */
\r
486 } else if(which<UProperty.INT_LIMIT) {
\r
488 case UProperty.GENERAL_CATEGORY:
\r
489 case UProperty.NUMERIC_TYPE:
\r
492 case UProperty.HANGUL_SYLLABLE_TYPE:
\r
495 case UProperty.CANONICAL_COMBINING_CLASS:
\r
496 case UProperty.NFD_QUICK_CHECK:
\r
497 case UProperty.NFKD_QUICK_CHECK:
\r
498 case UProperty.NFC_QUICK_CHECK:
\r
499 case UProperty.NFKC_QUICK_CHECK:
\r
500 case UProperty.LEAD_CANONICAL_COMBINING_CLASS:
\r
501 case UProperty.TRAIL_CANONICAL_COMBINING_CLASS:
\r
504 case UProperty.BIDI_CLASS:
\r
505 case UProperty.JOINING_GROUP:
\r
506 case UProperty.JOINING_TYPE:
\r
510 return SRC_PROPSVEC;
\r
512 } else if(which<UProperty.STRING_START) {
\r
514 case UProperty.GENERAL_CATEGORY_MASK:
\r
515 case UProperty.NUMERIC_VALUE:
\r
521 } else if(which<UProperty.STRING_LIMIT) {
\r
523 case UProperty.AGE:
\r
524 return SRC_PROPSVEC;
\r
526 case UProperty.BIDI_MIRRORING_GLYPH:
\r
529 case UProperty.CASE_FOLDING:
\r
530 case UProperty.LOWERCASE_MAPPING:
\r
531 case UProperty.SIMPLE_CASE_FOLDING:
\r
532 case UProperty.SIMPLE_LOWERCASE_MAPPING:
\r
533 case UProperty.SIMPLE_TITLECASE_MAPPING:
\r
534 case UProperty.SIMPLE_UPPERCASE_MAPPING:
\r
535 case UProperty.TITLECASE_MAPPING:
\r
536 case UProperty.UPPERCASE_MAPPING:
\r
539 case UProperty.ISO_COMMENT:
\r
540 case UProperty.NAME:
\r
541 case UProperty.UNICODE_1_NAME:
\r
548 return SRC_NONE; /* undefined */
\r
553 * Forms a supplementary code point from the argument character<br>
\r
554 * Note this is for internal use hence no checks for the validity of the
\r
555 * surrogate characters are done
\r
556 * @param lead lead surrogate character
\r
557 * @param trail trailing surrogate character
\r
558 * @return code point of the supplementary character
\r
560 public static int getRawSupplementary(char lead, char trail)
\r
562 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
\r
566 * Loads the property data and initializes the UCharacterProperty instance.
\r
567 * @throws MissingResourceException when data is missing or data has been corrupted
\r
569 public static UCharacterProperty getInstance()
\r
571 if(INSTANCE_ == null) {
\r
573 INSTANCE_ = new UCharacterProperty();
\r
575 catch (Exception e) {
\r
576 throw new MissingResourceException(e.getMessage(),"","");
\r
584 * Unicode property names and property value names are compared
\r
585 * "loosely". Property[Value]Aliases.txt say:
\r
587 * "With loose matching of property names, the case distinctions,
\r
588 * whitespace, and '_' are ignored."
\r
592 * This function does just that, for ASCII (char *) name strings.
\r
593 * It is almost identical to ucnv_compareNames() but also ignores
\r
594 * ASCII White_Space characters (U+0009..U+000d).
\r
596 * @param name1 name to compare
\r
597 * @param name2 name to compare
\r
598 * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
\r
599 * if name1 is greater than name2.
\r
601 /* to be implemented in 2.4
\r
602 * public static int comparePropertyNames(String name1, String name2)
\r
610 // Ignore delimiters '-', '_', and ASCII White_Space
\r
611 if (i1 < name1.length()) {
\r
612 ch1 = name1.charAt(i1 ++);
\r
614 while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
\r
615 || ch1 == '\n' // synwee what is || ch1 == '\v'
\r
616 || ch1 == '\f' || ch1=='\r') {
\r
617 if (i1 < name1.length()) {
\r
618 ch1 = name1.charAt(i1 ++);
\r
624 if (i2 < name2.length()) {
\r
625 ch2 = name2.charAt(i2 ++);
\r
627 while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
\r
628 || ch2 == '\n' // synwee what is || ch1 == '\v'
\r
629 || ch2 == '\f' || ch2=='\r') {
\r
630 if (i2 < name2.length()) {
\r
631 ch2 = name2.charAt(i2 ++);
\r
638 // If we reach the ends of both strings then they match
\r
639 if (ch1 == 0 && ch2 == 0) {
\r
643 // Case-insensitive comparison
\r
645 result = Character.toLowerCase(ch1)
\r
646 - Character.toLowerCase(ch2);
\r
656 * Checks if the argument c is to be treated as a white space in ICU
\r
657 * rules. Usually ICU rule white spaces are ignored unless quoted.
\r
658 * Equivalent to test for Pattern_White_Space Unicode property.
\r
659 * Stable set of characters, won't change.
\r
660 * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
\r
661 * @param c codepoint to check
\r
662 * @return true if c is an ICU rule white space
\r
664 public static boolean isRuleWhiteSpace(int c)
\r
666 /* "white space" in the sense of ICU rule parsers
\r
667 This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
\r
668 See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
\r
669 U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
\r
670 Equivalent to test for Pattern_White_Space Unicode property.
\r
672 return (c >= 0x0009 && c <= 0x2029 &&
\r
673 (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
\r
674 c == 0x200E || c == 0x200F || c >= 0x2028));
\r
678 * Get the maximum values for some enum/int properties.
\r
679 * @return maximum values for the integer properties.
\r
681 public int getMaxValues(int column)
\r
683 // return m_maxBlockScriptValue_;
\r
687 return m_maxBlockScriptValue_;
\r
689 return m_maxJTGValue_;
\r
696 * Gets the type mask
\r
697 * @param type character type
\r
700 public static final int getMask(int type)
\r
705 // protected variables -----------------------------------------------
\r
708 * Extra property trie
\r
710 CharTrie m_additionalTrie_;
\r
712 * Extra property vectors, 1st column for age and second for binary
\r
715 int m_additionalVectors_[];
\r
717 * Number of additional columns
\r
719 int m_additionalColumnsCount_;
\r
721 * Maximum values for block, bits used as in vector word
\r
724 int m_maxBlockScriptValue_;
\r
726 * Maximum values for script, bits used as in vector word
\r
729 int m_maxJTGValue_;
\r
730 // private variables -------------------------------------------------
\r
733 * UnicodeData.txt property object
\r
735 private static UCharacterProperty INSTANCE_ = null;
\r
738 * Default name of the datafile
\r
740 private static final String DATA_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/uprops.icu";
\r
743 * Default buffer size of datafile
\r
745 private static final int DATA_BUFFER_SIZE_ = 25000;
\r
748 * Numeric value shift
\r
750 private static final int VALUE_SHIFT_ = 8;
\r
753 * Mask to be applied after shifting to obtain an unsigned numeric value
\r
755 private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
\r
760 //private static final int NUMERIC_TYPE_SHIFT = 5;
\r
763 * To get the last 5 bits out from a data type
\r
765 //private static final int LAST_5_BIT_MASK_ = 0x1F;
\r
768 * Shift value for lead surrogate to form a supplementary character.
\r
770 private static final int LEAD_SURROGATE_SHIFT_ = 10;
\r
772 * Offset to add to combined surrogate pair to avoid masking.
\r
774 private static final int SURROGATE_OFFSET_ =
\r
775 UTF16.SUPPLEMENTARY_MIN_VALUE -
\r
776 (UTF16.SURROGATE_MIN_VALUE <<
\r
777 LEAD_SURROGATE_SHIFT_) -
\r
778 UTF16.TRAIL_SURROGATE_MIN_VALUE;
\r
781 // additional properties ----------------------------------------------
\r
784 * Additional properties used in internal trie data
\r
787 * Properties in vector word 1
\r
788 * Each bit encodes one binary property.
\r
789 * The following constants represent the bit number, use 1<<UPROPS_XYZ.
\r
790 * UPROPS_BINARY_1_TOP<=32!
\r
792 * Keep this list of property enums in sync with
\r
793 * propListNames[] in icu/source/tools/genprops/props2.c!
\r
795 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
\r
797 private static final int WHITE_SPACE_PROPERTY_ = 0;
\r
798 private static final int DASH_PROPERTY_ = 1;
\r
799 private static final int HYPHEN_PROPERTY_ = 2;
\r
800 private static final int QUOTATION_MARK_PROPERTY_ = 3;
\r
801 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
\r
802 private static final int MATH_PROPERTY_ = 5;
\r
803 private static final int HEX_DIGIT_PROPERTY_ = 6;
\r
804 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
\r
805 private static final int ALPHABETIC_PROPERTY_ = 8;
\r
806 private static final int IDEOGRAPHIC_PROPERTY_ = 9;
\r
807 private static final int DIACRITIC_PROPERTY_ = 10;
\r
808 private static final int EXTENDER_PROPERTY_ = 11;
\r
809 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
\r
810 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
\r
811 private static final int GRAPHEME_LINK_PROPERTY_ = 14;
\r
812 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
\r
813 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
\r
814 private static final int RADICAL_PROPERTY_ = 17;
\r
815 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
\r
816 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
\r
817 private static final int DEPRECATED_PROPERTY_ = 20;
\r
818 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
\r
819 private static final int XID_START_PROPERTY_ = 22;
\r
820 private static final int XID_CONTINUE_PROPERTY_ = 23;
\r
821 private static final int ID_START_PROPERTY_ = 24;
\r
822 private static final int ID_CONTINUE_PROPERTY_ = 25;
\r
823 private static final int GRAPHEME_BASE_PROPERTY_ = 26;
\r
824 private static final int S_TERM_PROPERTY_ = 27;
\r
825 private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
\r
826 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
\r
827 private static final int PATTERN_WHITE_SPACE = 30;
\r
830 * First nibble shift
\r
832 private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
\r
834 * Second nibble mask
\r
836 private static final int LAST_NIBBLE_MASK_ = 0xF;
\r
840 private static final int AGE_SHIFT_ = 24;
\r
843 // private constructors --------------------------------------------------
\r
847 * @exception IOException thrown when data reading fails or data corrupted
\r
849 private UCharacterProperty() throws IOException
\r
852 InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
\r
853 BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
\r
854 UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
\r
858 m_trie_.putIndexData(this);
\r
861 // private methods -------------------------------------------------------
\r
864 * Compare additional properties to see if it has argument type
\r
865 * @param property 32 bit properties
\r
866 * @param type character type
\r
867 * @return true if property has type
\r
869 /*private boolean compareAdditionalType(int property, int type)
\r
871 return (property & (1 << type)) != 0;
\r
874 // property starts for UnicodeSet -------------------------------------- ***
\r
876 private static final int TAB = 0x0009;
\r
877 //private static final int LF = 0x000a;
\r
878 //private static final int FF = 0x000c;
\r
879 private static final int CR = 0x000d;
\r
880 private static final int U_A = 0x0041;
\r
881 private static final int U_F = 0x0046;
\r
882 private static final int U_Z = 0x005a;
\r
883 private static final int U_a = 0x0061;
\r
884 private static final int U_f = 0x0066;
\r
885 private static final int U_z = 0x007a;
\r
886 private static final int DEL = 0x007f;
\r
887 private static final int NL = 0x0085;
\r
888 private static final int NBSP = 0x00a0;
\r
889 private static final int CGJ = 0x034f;
\r
890 private static final int FIGURESP= 0x2007;
\r
891 private static final int HAIRSP = 0x200a;
\r
892 //private static final int ZWNJ = 0x200c;
\r
893 //private static final int ZWJ = 0x200d;
\r
894 private static final int RLM = 0x200f;
\r
895 private static final int NNBSP = 0x202f;
\r
896 private static final int WJ = 0x2060;
\r
897 private static final int INHSWAP = 0x206a;
\r
898 private static final int NOMDIG = 0x206f;
\r
899 private static final int U_FW_A = 0xff21;
\r
900 private static final int U_FW_F = 0xff26;
\r
901 private static final int U_FW_Z = 0xff3a;
\r
902 private static final int U_FW_a = 0xff41;
\r
903 private static final int U_FW_f = 0xff46;
\r
904 private static final int U_FW_z = 0xff5a;
\r
905 private static final int ZWNBSP = 0xfeff;
\r
907 /* for Hangul_Syllable_Type */
\r
908 public void uhst_addPropertyStarts(UnicodeSet set) {
\r
909 /* add code points with hardcoded properties, plus the ones following them */
\r
912 * Add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE.
\r
913 * First, we add fixed boundaries for the blocks of Jamos.
\r
914 * Then we check in loops to see where the current Unicode version
\r
915 * actually stops assigning such Jamos. We start each loop
\r
916 * at the end of the per-Jamo-block assignments in Unicode 4 or earlier.
\r
917 * (These have not changed since Unicode 2.)
\r
919 int c, value, value2;
\r
922 value=UCharacter.HangulSyllableType.LEADING_JAMO;
\r
923 for(c=0x115a; c<=0x115f; ++c) {
\r
924 value2= UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
\r
925 if(value!=value2) {
\r
932 value=UCharacter.HangulSyllableType.VOWEL_JAMO;
\r
933 for(c=0x11a3; c<=0x11a7; ++c) {
\r
934 value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
\r
935 if(value!=value2) {
\r
942 value=UCharacter.HangulSyllableType.TRAILING_JAMO;
\r
943 for(c=0x11fa; c<=0x11ff; ++c) {
\r
944 value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE);
\r
945 if(value!=value2) {
\r
951 /* Add Hangul type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE. */
\r
952 for(c=NormalizerImpl.HANGUL_BASE; c<(NormalizerImpl.HANGUL_BASE+NormalizerImpl.HANGUL_COUNT); c+=NormalizerImpl.JAMO_T_COUNT) {
\r
959 public UnicodeSet addPropertyStarts(UnicodeSet set) {
\r
960 /* add the start code point of each same-value range of the main trie */
\r
961 TrieIterator propsIter = new TrieIterator(m_trie_);
\r
962 RangeValueIterator.Element propsResult = new RangeValueIterator.Element();
\r
963 while(propsIter.next(propsResult)){
\r
964 set.add(propsResult.start);
\r
967 /* add code points with hardcoded properties, plus the ones following them */
\r
969 /* add for u_isblank() */
\r
973 /* add for IS_THAT_CONTROL_SPACE() */
\r
974 set.add(CR+1); /* range TAB..CR */
\r
980 /* add for u_isIDIgnorable() what was not added above */
\r
981 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
\r
989 /* add no-break spaces for u_isWhitespace() what was not added above */
\r
993 set.add(FIGURESP+1);
\r
997 /* add for u_charDigitValue() */
\r
998 // TODO remove when UCharacter.getHanNumericValue() is changed to just return
\r
999 // Unicode numeric values
\r
1021 /* add for u_digit() */
\r
1027 set.add(U_FW_z+1);
\r
1029 set.add(U_FW_Z+1);
\r
1031 /* add for u_isxdigit() */
\r
1034 set.add(U_FW_f+1);
\r
1035 set.add(U_FW_F+1);
\r
1037 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT whatever was not added above */
\r
1038 set.add(WJ); /* range WJ..NOMDIG */
\r
1040 set.add(0xfffb+1);
\r
1042 set.add(0xe0fff+1);
\r
1044 /* add for UCHAR_GRAPHEME_BASE and others */
\r
1048 return set; // for chaining
\r
1051 public void upropsvec_addPropertyStarts(UnicodeSet set) {
\r
1052 /* add the start code point of each same-value range of the properties vectors trie */
\r
1053 if(m_additionalColumnsCount_>0) {
\r
1054 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
\r
1055 TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
\r
1056 RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
\r
1057 while(propsVectorsIter.next(propsVectorsResult)){
\r
1058 set.add(propsVectorsResult.start);
\r
1063 /*----------------------------------------------------------------
\r
1065 *----------------------------------------------------------------*/
\r
1068 * Return a set of characters for property enumeration.
\r
1069 * The set implicitly contains 0x110000 as well, which is one more than the highest
\r
1070 * Unicode code point.
\r
1072 * This set is used as an ordered list - its code points are ordered, and
\r
1073 * consecutive code points (in Unicode code point order) in the set define a range.
\r
1074 * For each two consecutive characters (start, limit) in the set,
\r
1075 * all of the UCD/normalization and related properties for
\r
1076 * all code points start..limit-1 are the same,
\r
1077 * except for character names and ISO comments.
\r
1079 * All Unicode code points U+0000..U+10ffff are covered by these ranges.
\r
1080 * The ranges define a partition of the Unicode code space.
\r
1081 * ICU uses the inclusions set to enumerate properties for generating
\r
1082 * UnicodeSets containing all code points that have a certain property value.
\r
1084 * The Inclusion List is generated from the UCD. It is generated
\r
1085 * by enumerating the data tries, and code points for hardcoded properties
\r
1086 * are added as well.
\r
1088 * --------------------------------------------------------------------------
\r
1090 * The following are ideas for getting properties-unique code point ranges,
\r
1091 * with possible optimizations beyond the current implementation.
\r
1092 * These optimizations would require more code and be more fragile.
\r
1093 * The current implementation generates one single list (set) for all properties.
\r
1095 * To enumerate properties efficiently, one needs to know ranges of
\r
1096 * repetitive values, so that the value of only each start code point
\r
1097 * can be applied to the whole range.
\r
1098 * This information is in principle available in the uprops.icu/unorm.icu data.
\r
1100 * There are two obstacles:
\r
1102 * 1. Some properties are computed from multiple data structures,
\r
1103 * making it necessary to get repetitive ranges by intersecting
\r
1104 * ranges from multiple tries.
\r
1106 * 2. It is not economical to write code for getting repetitive ranges
\r
1107 * that are precise for each of some 50 properties.
\r
1109 * Compromise ideas:
\r
1111 * - Get ranges per trie, not per individual property.
\r
1112 * Each range contains the same values for a whole group of properties.
\r
1113 * This would currently generate five range sets, two for uprops.icu tries
\r
1114 * and three for unorm.icu tries.
\r
1116 * - Combine sets of ranges for multiple tries to get sufficient sets
\r
1117 * for properties, e.g., the uprops.icu main and auxiliary tries
\r
1118 * for all non-normalization properties.
\r
1120 * Ideas for representing ranges and combining them:
\r
1122 * - A UnicodeSet could hold just the start code points of ranges.
\r
1123 * Multiple sets are easily combined by or-ing them together.
\r
1125 * - Alternatively, a UnicodeSet could hold each even-numbered range.
\r
1126 * All ranges could be enumerated by using each start code point
\r
1127 * (for the even-numbered ranges) as well as each limit (end+1) code point
\r
1128 * (for the odd-numbered ranges).
\r
1129 * It should be possible to combine two such sets by xor-ing them,
\r
1130 * but no more than two.
\r
1132 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
\r
1133 * but the first one is certainly simpler and applicable for combining more than two range sets.
\r
1136 * It is possible to combine all range sets for all uprops/unorm tries into one
\r
1137 * set that can be used for all properties.
\r
1138 * As an optimization, there could be less-combined range sets for certain
\r
1139 * groups of properties.
\r
1140 * The relationship of which less-combined range set to use for which property
\r
1141 * depends on the implementation of the properties and must be hardcoded
\r
1142 * - somewhat error-prone and higher maintenance but can be tested easily
\r
1143 * by building property sets "the simple way" in test code.
\r
1147 * Do not use a UnicodeSet pattern because that causes infinite recursion;
\r
1148 * UnicodeSet depends on the inclusions set.
\r
1152 * getInclusions() is commented out starting 2005-feb-12 because
\r
1153 * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
\r
1154 * and only for the relevant property source.
\r
1157 public UnicodeSet getInclusions() {
\r
1158 UnicodeSet set = new UnicodeSet();
\r
1159 NormalizerImpl.addPropertyStarts(set);
\r
1160 addPropertyStarts(set);
\r