2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 package com.ibm.icu.lang;
11 * <p>Selection constants for Unicode properties. </p>
12 * <p>These constants are used in functions like
13 * UCharacter.hasBinaryProperty(int) to select one of the Unicode properties.
15 * <p>The properties APIs are intended to reflect Unicode properties as
16 * defined in the Unicode Character Database (UCD) and Unicode Technical
18 * <p>For details about the properties see <a href="http://www.unicode.org">
19 * http://www.unicode.org</a>.</p>
20 * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
22 * <p>Important: If ICU is built with UCD files from Unicode versions below
23 * 3.2, then properties marked with "new" are either unavailable or only
24 * partially available. Check UCharacter.getUnicodeVersion() to be sure.</p>
25 * @author Syn Wee Quek
27 * @see com.ibm.icu.lang.UCharacter
29 public interface UProperty
31 // public data member --------------------------------------------------
34 * Special value indicating undefined property.
36 * @deprecated This API is ICU internal only.
38 public static final int UNDEFINED = -1;
41 * <p>Binary property Alphabetic. </p>
42 * <p>Property for UCharacter.isUAlphabetic(), different from the property
43 * in UCharacter.isalpha().</p>
44 * <p>Lu + Ll + Lt + Lm + Lo + Nl + Other_Alphabetic.</p>
47 public static final int ALPHABETIC = 0;
50 * First constant for binary Unicode properties.
53 public static final int BINARY_START = ALPHABETIC;
56 * Binary property ASCII_Hex_Digit (0-9 A-F a-f).
59 public static final int ASCII_HEX_DIGIT = 1;
62 * <p>Binary property Bidi_Control.</p>
63 * <p>Format controls which have specific functions in the Bidi Algorithm.
67 public static final int BIDI_CONTROL = 2;
70 * <p>Binary property Bidi_Mirrored.</p>
71 * <p>Characters that may change display in RTL text.</p>
72 * <p>Property for UCharacter.isMirrored().</p>
73 * <p>See Bidi Algorithm; UTR 9.</p>
76 public static final int BIDI_MIRRORED = 3;
79 * <p>Binary property Dash.</p>
80 * <p>Variations of dashes.</p>
83 public static final int DASH = 4;
86 * <p>Binary property Default_Ignorable_Code_Point (new).
88 * <p>Property that indicates codepoint is ignorable in most processing.
90 * <p>Codepoints (2060..206F, FFF0..FFFB, E0000..E0FFF) +
91 * Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)</p>
94 public static final int DEFAULT_IGNORABLE_CODE_POINT = 5;
97 * <p>Binary property Deprecated (new).</p>
98 * <p>The usage of deprecated characters is strongly discouraged.</p>
101 public static final int DEPRECATED = 6;
104 * <p>Binary property Diacritic.</p>
105 * <p>Characters that linguistically modify the meaning of another
106 * character to which they apply.</p>
109 public static final int DIACRITIC = 7;
112 * <p>Binary property Extender.</p>
113 * <p>Extend the value or shape of a preceding alphabetic character, e.g.
114 * length and iteration marks.</p>
117 public static final int EXTENDER = 8;
120 * <p>Binary property Full_Composition_Exclusion.</p>
121 * <p>CompositionExclusions.txt + Singleton Decompositions +
122 * Non-Starter Decompositions.</p>
125 public static final int FULL_COMPOSITION_EXCLUSION = 9;
128 * <p>Binary property Grapheme_Base (new).</p>
129 * <p>For programmatic determination of grapheme cluster boundaries.
130 * [0..10FFFF]-Cc-Cf-Cs-Co-Cn-Zl-Zp-Grapheme_Link-Grapheme_Extend-CGJ</p>
133 public static final int GRAPHEME_BASE = 10;
136 * <p>Binary property Grapheme_Extend (new).</p>
137 * <p>For programmatic determination of grapheme cluster boundaries.</p>
138 * <p>Me+Mn+Mc+Other_Grapheme_Extend-Grapheme_Link-CGJ</p>
141 public static final int GRAPHEME_EXTEND = 11;
144 * <p>Binary property Grapheme_Link (new).</p>
145 * <p>For programmatic determination of grapheme cluster boundaries.</p>
148 public static final int GRAPHEME_LINK = 12;
151 * <p>Binary property Hex_Digit.</p>
152 * <p>Characters commonly used for hexadecimal numbers.</p>
155 public static final int HEX_DIGIT = 13;
158 * <p>Binary property Hyphen.</p>
159 * <p>Dashes used to mark connections between pieces of words, plus the
160 * Katakana middle dot.</p>
163 public static final int HYPHEN = 14;
166 * <p>Binary property ID_Continue.</p>
167 * <p>Characters that can continue an identifier.</p>
168 * <p>ID_Start+Mn+Mc+Nd+Pc</p>
171 public static final int ID_CONTINUE = 15;
174 * <p>Binary property ID_Start.</p>
175 * <p>Characters that can start an identifier.</p>
176 * <p>Lu+Ll+Lt+Lm+Lo+Nl</p>
179 public static final int ID_START = 16;
182 * <p>Binary property Ideographic.</p>
183 * <p>CJKV ideographs.</p>
186 public static final int IDEOGRAPHIC = 17;
189 * <p>Binary property IDS_Binary_Operator (new).</p>
190 * <p>For programmatic determination of Ideographic Description Sequences.
194 public static final int IDS_BINARY_OPERATOR = 18;
197 * <p>Binary property IDS_Trinary_Operator (new).</p>
198 * <p>For programmatic determination of Ideographic Description
202 public static final int IDS_TRINARY_OPERATOR = 19;
205 * <p>Binary property Join_Control.</p>
206 * <p>Format controls for cursive joining and ligation.</p>
209 public static final int JOIN_CONTROL = 20;
212 * <p>Binary property Logical_Order_Exception (new).</p>
213 * <p>Characters that do not use logical order and require special
214 * handling in most processing.</p>
217 public static final int LOGICAL_ORDER_EXCEPTION = 21;
220 * <p>Binary property Lowercase.</p>
221 * <p>Same as UCharacter.isULowercase(), different from
222 * UCharacter.islower().</p>
223 * <p>Ll+Other_Lowercase</p>
226 public static final int LOWERCASE = 22;
228 /** <p>Binary property Math.</p>
229 * <p>Sm+Other_Math</p>
232 public static final int MATH = 23;
235 * <p>Binary property Noncharacter_Code_Point.</p>
236 * <p>Code points that are explicitly defined as illegal for the encoding
240 public static final int NONCHARACTER_CODE_POINT = 24;
243 * <p>Binary property Quotation_Mark.</p>
246 public static final int QUOTATION_MARK = 25;
249 * <p>Binary property Radical (new).</p>
250 * <p>For programmatic determination of Ideographic Description
254 public static final int RADICAL = 26;
257 * <p>Binary property Soft_Dotted (new).</p>
258 * <p>Characters with a "soft dot", like i or j.</p>
259 * <p>An accent placed on these characters causes the dot to disappear.</p>
262 public static final int SOFT_DOTTED = 27;
265 * <p>Binary property Terminal_Punctuation.</p>
266 * <p>Punctuation characters that generally mark the end of textual
270 public static final int TERMINAL_PUNCTUATION = 28;
273 * <p>Binary property Unified_Ideograph (new).</p>
274 * <p>For programmatic determination of Ideographic Description
278 public static final int UNIFIED_IDEOGRAPH = 29;
281 * <p>Binary property Uppercase.</p>
282 * <p>Same as UCharacter.isUUppercase(), different from
283 * UCharacter.isUpperCase().</p>
284 * <p>Lu+Other_Uppercase</p>
287 public static final int UPPERCASE = 30;
290 * <p>Binary property White_Space.</p>
291 * <p>Same as UCharacter.isUWhiteSpace(), different from
292 * UCharacter.isSpace() and UCharacter.isWhitespace().</p>
293 * <p>Space characters+TAB+CR+LF-ZWSP-ZWNBSP</p>
296 public static final int WHITE_SPACE = 31;
299 * <p>Binary property XID_Continue.</p>
300 * <p>ID_Continue modified to allow closure under normalization forms
304 public static final int XID_CONTINUE = 32;
307 * <p>Binary property XID_Start.</p>
308 * <p>ID_Start modified to allow closure under normalization forms NFKC
312 public static final int XID_START = 33;
315 * <p>Binary property Case_Sensitive.</p>
316 * <p>Either the source of a case
317 * mapping or _in_ the target of a case mapping. Not the same as
318 * the general category Cased_Letter.</p>
321 public static final int CASE_SENSITIVE = 34;
324 * Binary property STerm (new in Unicode 4.0.1).
325 * Sentence Terminal. Used in UAX #29: Text Boundaries
326 * (http://www.unicode.org/reports/tr29/)
329 public static final int S_TERM = 35;
332 * Binary property Variation_Selector (new in Unicode 4.0.1).
333 * Indicates all those characters that qualify as Variation Selectors.
334 * For details on the behavior of these characters,
335 * see StandardizedVariants.html and 15.6 Variation Selectors.
338 public static final int VARIATION_SELECTOR = 36;
341 * Binary property NFD_Inert.
342 * ICU-specific property for characters that are inert under NFD,
343 * i.e., they do not interact with adjacent characters.
344 * Used for example in normalizing transforms in incremental mode
345 * to find the boundary of safely normalizable text despite possible
348 * There is one such property per normalization form.
349 * These properties are computed as follows - an inert character is:
350 * a) unassigned, or ALL of the following:
351 * b) of combining class 0.
352 * c) not decomposed by this normalization form.
353 * AND if NFC or NFKC,
354 * d) can never compose with a previous character.
355 * e) can never compose with a following character.
356 * f) can never change if another character is added.
357 * Example: a-breve might satisfy all but f, but if you
358 * add an ogonek it changes to a-ogonek + breve
360 * See also com.ibm.text.UCD.NFSkippable in the ICU4J repository,
361 * and icu/source/common/unormimp.h .
364 public static final int NFD_INERT = 37;
367 * Binary property NFKD_Inert.
368 * ICU-specific property for characters that are inert under NFKD,
369 * i.e., they do not interact with adjacent characters.
370 * Used for example in normalizing transforms in incremental mode
371 * to find the boundary of safely normalizable text despite possible
376 public static final int NFKD_INERT = 38;
379 * Binary property NFC_Inert.
380 * ICU-specific property for characters that are inert under NFC,
381 * i.e., they do not interact with adjacent characters.
382 * Used for example in normalizing transforms in incremental mode
383 * to find the boundary of safely normalizable text despite possible
388 public static final int NFC_INERT = 39;
391 * Binary property NFKC_Inert.
392 * ICU-specific property for characters that are inert under NFKC,
393 * i.e., they do not interact with adjacent characters.
394 * Used for example in normalizing transforms in incremental mode
395 * to find the boundary of safely normalizable text despite possible
400 public static final int NFKC_INERT = 40;
403 * Binary Property Segment_Starter.
404 * ICU-specific property for characters that are starters in terms of
405 * Unicode normalization and combining character sequences.
406 * They have ccc=0 and do not occur in non-initial position of the
407 * canonical decomposition of any character
408 * (like " in NFD(a-umlaut) and a Jamo T in an NFD(Hangul LVT)).
409 * ICU uses this property for segmenting a string for generating a set of
410 * canonically equivalent strings, e.g. for canonical closure while
411 * processing collation tailoring rules.
414 public static final int SEGMENT_STARTER = 41;
417 * Binary property Pattern_Syntax (new in Unicode 4.1).
418 * See UAX #31 Identifier and Pattern Syntax
419 * (http://www.unicode.org/reports/tr31/)
422 public static final int PATTERN_SYNTAX = 42;
425 * Binary property Pattern_White_Space (new in Unicode 4.1).
426 * See UAX #31 Identifier and Pattern Syntax
427 * (http://www.unicode.org/reports/tr31/)
430 public static final int PATTERN_WHITE_SPACE = 43;
433 * Binary property alnum (a C/POSIX character class).
434 * Implemented according to the UTS #18 Annex C Standard Recommendation.
435 * See the UCharacter class documentation.
438 public static final int POSIX_ALNUM = 44;
441 * Binary property blank (a C/POSIX character class).
442 * Implemented according to the UTS #18 Annex C Standard Recommendation.
443 * See the UCharacter class documentation.
446 public static final int POSIX_BLANK = 45;
449 * Binary property graph (a C/POSIX character class).
450 * Implemented according to the UTS #18 Annex C Standard Recommendation.
451 * See the UCharacter class documentation.
454 public static final int POSIX_GRAPH = 46;
457 * Binary property print (a C/POSIX character class).
458 * Implemented according to the UTS #18 Annex C Standard Recommendation.
459 * See the UCharacter class documentation.
462 public static final int POSIX_PRINT = 47;
465 * Binary property xdigit (a C/POSIX character class).
466 * Implemented according to the UTS #18 Annex C Standard Recommendation.
467 * See the UCharacter class documentation.
470 public static final int POSIX_XDIGIT = 48;
473 * Binary property Cased.
474 * For Lowercase, Uppercase and Titlecase characters.
477 public static final int CASED=49;
479 * Binary property Case_Ignorable.
480 * Used in context-sensitive case mappings.
483 public static final int CASE_IGNORABLE=50;
485 * Binary property Changes_When_Lowercased.
488 public static final int CHANGES_WHEN_LOWERCASED=51;
490 * Binary property Changes_When_Uppercased.
493 public static final int CHANGES_WHEN_UPPERCASED=52;
495 * Binary property Changes_When_Titlecased.
498 public static final int CHANGES_WHEN_TITLECASED=53;
500 * Binary property Changes_When_Casefolded.
503 public static final int CHANGES_WHEN_CASEFOLDED=54;
505 * Binary property Changes_When_Casemapped.
508 public static final int CHANGES_WHEN_CASEMAPPED=55;
510 * Binary property Changes_When_NFKC_Casefolded.
513 public static final int CHANGES_WHEN_NFKC_CASEFOLDED=56;
516 * One more than the last constant for binary Unicode properties.
519 public static final int BINARY_LIMIT = 57;
522 * Enumerated property Bidi_Class.
523 * Same as UCharacter.getDirection(int), returns UCharacterDirection values.
526 public static final int BIDI_CLASS = 0x1000;
529 * First constant for enumerated/integer Unicode properties.
532 public static final int INT_START = BIDI_CLASS;
535 * Enumerated property Block.
536 * Same as UCharacter.UnicodeBlock.of(int), returns UCharacter.UnicodeBlock
540 public static final int BLOCK = 0x1001;
543 * Enumerated property Canonical_Combining_Class.
544 * Same as UCharacter.getCombiningClass(int), returns 8-bit numeric values.
547 public static final int CANONICAL_COMBINING_CLASS = 0x1002;
550 * Enumerated property Decomposition_Type.
551 * Returns UCharacter.DecompositionType values.
554 public static final int DECOMPOSITION_TYPE = 0x1003;
557 * Enumerated property East_Asian_Width.
558 * See http://www.unicode.org/reports/tr11/
559 * Returns UCharacter.EastAsianWidth values.
562 public static final int EAST_ASIAN_WIDTH = 0x1004;
565 * Enumerated property General_Category.
566 * Same as UCharacter.getType(int), returns UCharacterCategory values.
569 public static final int GENERAL_CATEGORY = 0x1005;
572 * Enumerated property Joining_Group.
573 * Returns UCharacter.JoiningGroup values.
576 public static final int JOINING_GROUP = 0x1006;
579 * Enumerated property Joining_Type.
580 * Returns UCharacter.JoiningType values.
583 public static final int JOINING_TYPE = 0x1007;
586 * Enumerated property Line_Break.
587 * Returns UCharacter.LineBreak values.
590 public static final int LINE_BREAK = 0x1008;
593 * Enumerated property Numeric_Type.
594 * Returns UCharacter.NumericType values.
597 public static final int NUMERIC_TYPE = 0x1009;
600 * Enumerated property Script.
601 * Same as UScript.getScript(int), returns UScript values.
604 public static final int SCRIPT = 0x100A;
607 * Enumerated property Hangul_Syllable_Type, new in Unicode 4.
608 * Returns UCharacter.HangulSyllableType values.
611 public static final int HANGUL_SYLLABLE_TYPE = 0x100B;
614 * Enumerated property NFD_Quick_Check.
615 * Returns numeric values compatible with Normalizer.QuickCheckResult.
618 public static final int NFD_QUICK_CHECK = 0x100C;
621 * Enumerated property NFKD_Quick_Check.
622 * Returns numeric values compatible with Normalizer.QuickCheckResult.
625 public static final int NFKD_QUICK_CHECK = 0x100D;
628 * Enumerated property NFC_Quick_Check.
629 * Returns numeric values compatible with Normalizer.QuickCheckResult.
632 public static final int NFC_QUICK_CHECK = 0x100E;
635 * Enumerated property NFKC_Quick_Check.
636 * Returns numeric values compatible with Normalizer.QuickCheckResult.
639 public static final int NFKC_QUICK_CHECK = 0x100F;
642 * Enumerated property Lead_Canonical_Combining_Class.
643 * ICU-specific property for the ccc of the first code point
644 * of the decomposition, or lccc(c)=ccc(NFD(c)[0]).
645 * Useful for checking for canonically ordered text;
646 * see Normalizer.FCD and http://www.unicode.org/notes/tn5/#FCD .
647 * Returns 8-bit numeric values like CANONICAL_COMBINING_CLASS.
650 public static final int LEAD_CANONICAL_COMBINING_CLASS = 0x1010;
653 * Enumerated property Trail_Canonical_Combining_Class.
654 * ICU-specific property for the ccc of the last code point
655 * of the decomposition, or tccc(c)=ccc(NFD(c)[last]).
656 * Useful for checking for canonically ordered text;
657 * see Normalizer.FCD and http://www.unicode.org/notes/tn5/#FCD .
658 * Returns 8-bit numeric values like CANONICAL_COMBINING_CLASS.
661 public static final int TRAIL_CANONICAL_COMBINING_CLASS = 0x1011;
664 * Enumerated property Grapheme_Cluster_Break (new in Unicode 4.1).
665 * Used in UAX #29: Text Boundaries
666 * (http://www.unicode.org/reports/tr29/)
667 * Returns UCharacter.GraphemeClusterBreak values.
670 public static final int GRAPHEME_CLUSTER_BREAK = 0x1012;
673 * Enumerated property Sentence_Break (new in Unicode 4.1).
674 * Used in UAX #29: Text Boundaries
675 * (http://www.unicode.org/reports/tr29/)
676 * Returns UCharacter.SentenceBreak values.
679 public static final int SENTENCE_BREAK = 0x1013;
682 * Enumerated property Word_Break (new in Unicode 4.1).
683 * Used in UAX #29: Text Boundaries
684 * (http://www.unicode.org/reports/tr29/)
685 * Returns UCharacter.WordBreak values.
688 public static final int WORD_BREAK = 0x1014;
691 * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
692 * Used in UAX #9: Unicode Bidirectional Algorithm
693 * (http://www.unicode.org/reports/tr9/)
694 * Returns UCharacter.BidiPairedBracketType values.
697 public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
700 * One more than the last constant for enumerated/integer Unicode
704 public static final int INT_LIMIT = 0x1016;
707 * Bitmask property General_Category_Mask.
708 * This is the General_Category property returned as a bit mask.
709 * When used in UCharacter.getIntPropertyValue(c),
710 * returns bit masks for UCharacterCategory values where exactly one bit is set.
711 * When used with UCharacter.getPropertyValueName() and UCharacter.getPropertyValueEnum(),
712 * a multi-bit mask is used for sets of categories like "Letters".
715 public static final int GENERAL_CATEGORY_MASK = 0x2000;
718 * First constant for bit-mask Unicode properties.
721 public static final int MASK_START = GENERAL_CATEGORY_MASK;
724 * One more than the last constant for bit-mask Unicode properties.
727 public static final int MASK_LIMIT = 0x2001;
730 * Double property Numeric_Value.
731 * Corresponds to UCharacter.getUnicodeNumericValue(int).
734 public static final int NUMERIC_VALUE = 0x3000;
737 * First constant for double Unicode properties.
740 public static final int DOUBLE_START = NUMERIC_VALUE;
743 * One more than the last constant for double Unicode properties.
746 public static final int DOUBLE_LIMIT = 0x3001;
749 * String property Age.
750 * Corresponds to UCharacter.getAge(int).
753 public static final int AGE = 0x4000;
756 * First constant for string Unicode properties.
759 public static final int STRING_START = AGE;
762 * String property Bidi_Mirroring_Glyph.
763 * Corresponds to UCharacter.getMirror(int).
766 public static final int BIDI_MIRRORING_GLYPH = 0x4001;
769 * String property Case_Folding.
770 * Corresponds to UCharacter.foldCase(String, boolean).
773 public static final int CASE_FOLDING = 0x4002;
776 * Deprecated string property ISO_Comment.
777 * Corresponds to UCharacter.getISOComment(int).
780 public static final int ISO_COMMENT = 0x4003;
783 * String property Lowercase_Mapping.
784 * Corresponds to UCharacter.toLowerCase(String).
787 public static final int LOWERCASE_MAPPING = 0x4004;
790 * String property Name.
791 * Corresponds to UCharacter.getName(int).
794 public static final int NAME = 0x4005;
797 * String property Simple_Case_Folding.
798 * Corresponds to UCharacter.foldCase(int, boolean).
801 public static final int SIMPLE_CASE_FOLDING = 0x4006;
804 * String property Simple_Lowercase_Mapping.
805 * Corresponds to UCharacter.toLowerCase(int).
808 public static final int SIMPLE_LOWERCASE_MAPPING = 0x4007;
811 * String property Simple_Titlecase_Mapping.
812 * Corresponds to UCharacter.toTitleCase(int).
815 public static final int SIMPLE_TITLECASE_MAPPING = 0x4008;
818 * String property Simple_Uppercase_Mapping.
819 * Corresponds to UCharacter.toUpperCase(int).
822 public static final int SIMPLE_UPPERCASE_MAPPING = 0x4009;
825 * String property Titlecase_Mapping.
826 * Corresponds to UCharacter.toTitleCase(String).
829 public static final int TITLECASE_MAPPING = 0x400A;
832 * String property Unicode_1_Name.
833 * This property is of little practical value.
834 * Beginning with ICU 49, ICU APIs return null or an empty string for this property.
835 * Corresponds to UCharacter.getName1_0(int).
838 public static final int UNICODE_1_NAME = 0x400B;
841 * String property Uppercase_Mapping.
842 * Corresponds to UCharacter.toUpperCase(String).
845 public static final int UPPERCASE_MAPPING = 0x400C;
848 * String property Bidi_Paired_Bracket (new in Unicode 6.3).
849 * Corresponds to UCharacter.getBidiPairedBracket.
852 public static final int BIDI_PAIRED_BRACKET = 0x400D;
855 * One more than the last constant for string Unicode properties.
858 public static final int STRING_LIMIT = 0x400E;
861 * Miscellaneous property Script_Extensions (new in Unicode 6.0).
862 * Some characters are commonly used in multiple scripts.
863 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
864 * Corresponds to UScript.hasScript and UScript.getScriptExtensions.
867 public static final int SCRIPT_EXTENSIONS=0x7000;
869 * First constant for Unicode properties with unusual value types.
872 public static final int OTHER_PROPERTY_START=SCRIPT_EXTENSIONS;
874 * One more than the last constant for Unicode properties with unusual value types.
877 public static final int OTHER_PROPERTY_LIMIT=0x7001;
880 * Selector constants for UCharacter.getPropertyName() and
881 * UCharacter.getPropertyValueName(). These selectors are used to
882 * choose which name is returned for a given property or value.
883 * All properties and values have a long name. Most have a short
884 * name, but some do not. Unicode allows for additional names,
885 * beyond the long and short name, which would be indicated by
886 * LONG + i, where i=1, 2,...
888 * @see UCharacter#getPropertyName
889 * @see UCharacter#getPropertyValueName
892 public interface NameChoice {
894 * Selector for the abbreviated name of a property or value.
895 * Most properties and values have a short name; those that do
899 static final int SHORT = 0;
902 * Selector for the long name of a property or value. All
903 * properties and values have a long name.
906 static final int LONG = 1;
909 * The number of predefined property name choices. Individual
910 * properties or values may have more than COUNT aliases.
913 static final int COUNT = 2;