2 *******************************************************************************
3 * Copyright (C) 2001-2013 International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
8 package com.ibm.icu.lang;
10 import java.util.BitSet;
11 import java.util.Locale;
12 import java.util.MissingResourceException;
14 import com.ibm.icu.impl.ICUResourceBundle;
15 import com.ibm.icu.impl.UCharacterProperty;
16 import com.ibm.icu.util.ULocale;
17 import com.ibm.icu.util.UResourceBundle;
20 * A class to reflect UTR #24: Script Names
21 * (based on ISO 15924:2000, "Code for the representation of names of
22 * scripts"). UTR #24 describes the basis for a new Unicode data file,
26 public final class UScript {
31 public static final int INVALID_CODE = -1;
36 public static final int COMMON = 0; /* Zyyy */
41 public static final int INHERITED = 1; /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */
46 public static final int ARABIC = 2; /* Arab */
51 public static final int ARMENIAN = 3; /* Armn */
56 public static final int BENGALI = 4; /* Beng */
61 public static final int BOPOMOFO = 5; /* Bopo */
66 public static final int CHEROKEE = 6; /* Cher */
71 public static final int COPTIC = 7; /* Qaac */
76 public static final int CYRILLIC = 8; /* Cyrl (Cyrs) */
81 public static final int DESERET = 9; /* Dsrt */
86 public static final int DEVANAGARI = 10; /* Deva */
91 public static final int ETHIOPIC = 11; /* Ethi */
96 public static final int GEORGIAN = 12; /* Geor (Geon; Geoa) */
101 public static final int GOTHIC = 13; /* Goth */
106 public static final int GREEK = 14; /* Grek */
111 public static final int GUJARATI = 15; /* Gujr */
116 public static final int GURMUKHI = 16; /* Guru */
121 public static final int HAN = 17; /* Hani */
126 public static final int HANGUL = 18; /* Hang */
131 public static final int HEBREW = 19; /* Hebr */
136 public static final int HIRAGANA = 20; /* Hira */
141 public static final int KANNADA = 21; /* Knda */
146 public static final int KATAKANA = 22; /* Kana */
151 public static final int KHMER = 23; /* Khmr */
156 public static final int LAO = 24; /* Laoo */
161 public static final int LATIN = 25; /* Latn (Latf; Latg) */
166 public static final int MALAYALAM = 26; /* Mlym */
171 public static final int MONGOLIAN = 27; /* Mong */
176 public static final int MYANMAR = 28; /* Mymr */
181 public static final int OGHAM = 29; /* Ogam */
186 public static final int OLD_ITALIC = 30; /* Ital */
191 public static final int ORIYA = 31; /* Orya */
196 public static final int RUNIC = 32; /* Runr */
201 public static final int SINHALA = 33; /* Sinh */
206 public static final int SYRIAC = 34; /* Syrc (Syrj; Syrn; Syre) */
211 public static final int TAMIL = 35; /* Taml */
216 public static final int TELUGU = 36; /* Telu */
221 public static final int THAANA = 37; /* Thaa */
226 public static final int THAI = 38; /* Thai */
231 public static final int TIBETAN = 39; /* Tibt */
233 * Unified Canadian Aboriginal Syllabics
236 public static final int CANADIAN_ABORIGINAL = 40; /* Cans */
238 * Unified Canadian Aboriginal Syllabics (alias)
241 public static final int UCAS = CANADIAN_ABORIGINAL; /* Cans */
246 public static final int YI = 41; /* Yiii */
251 public static final int TAGALOG = 42; /* Tglg */
256 public static final int HANUNOO = 43; /* Hano */
261 public static final int BUHID = 44; /* Buhd */
266 public static final int TAGBANWA = 45; /* Tagb */
269 * Script in Unicode 4
273 public static final int BRAILLE = 46; /* Brai */
276 * Script in Unicode 4
280 public static final int CYPRIOT = 47; /* Cprt */
283 * Script in Unicode 4
287 public static final int LIMBU = 48; /* Limb */
290 * Script in Unicode 4
294 public static final int LINEAR_B = 49; /* Linb */
297 * Script in Unicode 4
301 public static final int OSMANYA = 50; /* Osma */
304 * Script in Unicode 4
308 public static final int SHAVIAN = 51; /* Shaw */
311 * Script in Unicode 4
315 public static final int TAI_LE = 52; /* Tale */
318 * Script in Unicode 4
322 public static final int UGARITIC = 53; /* Ugar */
324 * Script in Unicode 4.0.1
327 public static final int KATAKANA_OR_HIRAGANA = 54; /*Hrkt */
330 * Script in Unicode 4.1
333 public static final int BUGINESE = 55; /* Bugi */
335 * Script in Unicode 4.1
338 public static final int GLAGOLITIC = 56; /* Glag */
340 * Script in Unicode 4.1
343 public static final int KHAROSHTHI = 57; /* Khar */
345 * Script in Unicode 4.1
348 public static final int SYLOTI_NAGRI = 58; /* Sylo */
350 * Script in Unicode 4.1
353 public static final int NEW_TAI_LUE = 59; /* Talu */
355 * Script in Unicode 4.1
358 public static final int TIFINAGH = 60; /* Tfng */
360 * Script in Unicode 4.1
363 public static final int OLD_PERSIAN = 61; /* Xpeo */
367 * ISO 15924 script code
370 public static final int BALINESE = 62; /* Bali */
372 * ISO 15924 script code
375 public static final int BATAK = 63; /* Batk */
377 * ISO 15924 script code
380 public static final int BLISSYMBOLS = 64; /* Blis */
382 * ISO 15924 script code
385 public static final int BRAHMI = 65; /* Brah */
387 * ISO 15924 script code
390 public static final int CHAM = 66; /* Cham */
392 * ISO 15924 script code
395 public static final int CIRTH = 67; /* Cirt */
397 * ISO 15924 script code
400 public static final int OLD_CHURCH_SLAVONIC_CYRILLIC = 68; /* Cyrs */
402 * ISO 15924 script code
405 public static final int DEMOTIC_EGYPTIAN = 69; /* Egyd */
407 * ISO 15924 script code
410 public static final int HIERATIC_EGYPTIAN = 70; /* Egyh */
412 * ISO 15924 script code
415 public static final int EGYPTIAN_HIEROGLYPHS = 71; /* Egyp */
417 * ISO 15924 script code
420 public static final int KHUTSURI = 72; /* Geok */
422 * ISO 15924 script code
425 public static final int SIMPLIFIED_HAN = 73; /* Hans */
427 * ISO 15924 script code
430 public static final int TRADITIONAL_HAN = 74; /* Hant */
432 * ISO 15924 script code
435 public static final int PAHAWH_HMONG = 75; /* Hmng */
437 * ISO 15924 script code
440 public static final int OLD_HUNGARIAN = 76; /* Hung */
442 * ISO 15924 script code
445 public static final int HARAPPAN_INDUS = 77; /* Inds */
447 * ISO 15924 script code
450 public static final int JAVANESE = 78; /* Java */
452 * ISO 15924 script code
455 public static final int KAYAH_LI = 79; /* Kali */
457 * ISO 15924 script code
460 public static final int LATIN_FRAKTUR = 80; /* Latf */
462 * ISO 15924 script code
465 public static final int LATIN_GAELIC = 81; /* Latg */
467 * ISO 15924 script code
470 public static final int LEPCHA = 82; /* Lepc */
472 * ISO 15924 script code
475 public static final int LINEAR_A = 83; /* Lina */
477 * ISO 15924 script code
480 public static final int MANDAIC = 84; /* Mand */
482 * ISO 15924 script code
485 public static final int MANDAEAN = MANDAIC;
487 * ISO 15924 script code
490 public static final int MAYAN_HIEROGLYPHS = 85; /* Maya */
492 * ISO 15924 script code
495 public static final int MEROITIC_HIEROGLYPHS = 86; /* Mero */
497 * ISO 15924 script code
500 public static final int MEROITIC = MEROITIC_HIEROGLYPHS;
502 * ISO 15924 script code
505 public static final int NKO = 87; /* Nkoo */
507 * ISO 15924 script code
510 public static final int ORKHON = 88; /* Orkh */
512 * ISO 15924 script code
515 public static final int OLD_PERMIC = 89; /* Perm */
517 * ISO 15924 script code
520 public static final int PHAGS_PA = 90; /* Phag */
522 * ISO 15924 script code
525 public static final int PHOENICIAN = 91; /* Phnx */
527 * ISO 15924 script code
530 public static final int MIAO = 92; /* Plrd */
532 * ISO 15924 script code
535 public static final int PHONETIC_POLLARD = MIAO;
537 * ISO 15924 script code
540 public static final int RONGORONGO = 93; /* Roro */
542 * ISO 15924 script code
545 public static final int SARATI = 94; /* Sara */
547 * ISO 15924 script code
550 public static final int ESTRANGELO_SYRIAC = 95; /* Syre */
552 * ISO 15924 script code
555 public static final int WESTERN_SYRIAC = 96; /* Syrj */
557 * ISO 15924 script code
560 public static final int EASTERN_SYRIAC = 97; /* Syrn */
562 * ISO 15924 script code
565 public static final int TENGWAR = 98; /* Teng */
567 * ISO 15924 script code
570 public static final int VAI = 99; /* Vaii */
572 * ISO 15924 script code
575 public static final int VISIBLE_SPEECH = 100;/* Visp */
577 * ISO 15924 script code
580 public static final int CUNEIFORM = 101;/* Xsux */
582 * ISO 15924 script code
585 public static final int UNWRITTEN_LANGUAGES = 102;/* Zxxx */
587 * ISO 15924 script code
590 public static final int UNKNOWN = 103;/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */
593 * ISO 15924 script code
596 public static final int CARIAN = 104;/* Cari */
598 * ISO 15924 script code
601 public static final int JAPANESE = 105;/* Jpan */
603 * ISO 15924 script code
606 public static final int LANNA = 106;/* Lana */
608 * ISO 15924 script code
611 public static final int LYCIAN = 107;/* Lyci */
613 * ISO 15924 script code
616 public static final int LYDIAN = 108;/* Lydi */
618 * ISO 15924 script code
621 public static final int OL_CHIKI = 109;/* Olck */
623 * ISO 15924 script code
626 public static final int REJANG = 110;/* Rjng */
628 * ISO 15924 script code
631 public static final int SAURASHTRA = 111;/* Saur */
633 * ISO 15924 script code
636 public static final int SIGN_WRITING = 112;/* Sgnw */
638 * ISO 15924 script code
641 public static final int SUNDANESE = 113;/* Sund */
643 * ISO 15924 script code
646 public static final int MOON = 114;/* Moon */
648 * ISO 15924 script code
651 public static final int MEITEI_MAYEK = 115;/* Mtei */
654 * ISO 15924 script code
657 public static final int IMPERIAL_ARAMAIC = 116;/* Armi */
660 * ISO 15924 script code
663 public static final int AVESTAN = 117;/* Avst */
666 * ISO 15924 script code
669 public static final int CHAKMA = 118;/* Cakm */
672 * ISO 15924 script code
675 public static final int KOREAN = 119;/* Kore */
678 * ISO 15924 script code
681 public static final int KAITHI = 120;/* Kthi */
684 * ISO 15924 script code
687 public static final int MANICHAEAN = 121;/* Mani */
690 * ISO 15924 script code
693 public static final int INSCRIPTIONAL_PAHLAVI = 122;/* Phli */
696 * ISO 15924 script code
699 public static final int PSALTER_PAHLAVI = 123;/* Phlp */
702 * ISO 15924 script code
705 public static final int BOOK_PAHLAVI = 124;/* Phlv */
708 * ISO 15924 script code
711 public static final int INSCRIPTIONAL_PARTHIAN = 125;/* Prti */
714 * ISO 15924 script code
717 public static final int SAMARITAN = 126;/* Samr */
720 * ISO 15924 script code
723 public static final int TAI_VIET = 127;/* Tavt */
726 * ISO 15924 script code
729 public static final int MATHEMATICAL_NOTATION = 128;/* Zmth */
732 * ISO 15924 script code
735 public static final int SYMBOLS = 129;/* Zsym */
738 * ISO 15924 script code
741 public static final int BAMUM = 130;/* Bamu */
743 * ISO 15924 script code
746 public static final int LISU = 131;/* Lisu */
748 * ISO 15924 script code
751 public static final int NAKHI_GEBA = 132;/* Nkgb */
753 * ISO 15924 script code
756 public static final int OLD_SOUTH_ARABIAN = 133;/* Sarb */
759 * ISO 15924 script code
762 public static final int BASSA_VAH = 134;/* Bass */
764 * ISO 15924 script code
767 public static final int DUPLOYAN_SHORTAND = 135;/* Dupl */
769 * ISO 15924 script code
772 public static final int ELBASAN = 136;/* Elba */
774 * ISO 15924 script code
777 public static final int GRANTHA = 137;/* Gran */
779 * ISO 15924 script code
782 public static final int KPELLE = 138;/* Kpel */
784 * ISO 15924 script code
787 public static final int LOMA = 139;/* Loma */
789 * ISO 15924 script code
792 public static final int MENDE = 140;/* Mend */
794 * ISO 15924 script code
797 public static final int MEROITIC_CURSIVE = 141;/* Merc */
799 * ISO 15924 script code
802 public static final int OLD_NORTH_ARABIAN = 142;/* Narb */
804 * ISO 15924 script code
807 public static final int NABATAEAN = 143;/* Nbat */
809 * ISO 15924 script code
812 public static final int PALMYRENE = 144;/* Palm */
814 * ISO 15924 script code
817 public static final int SINDHI = 145;/* Sind */
819 * ISO 15924 script code
822 public static final int WARANG_CITI = 146;/* Wara */
825 * ISO 15924 script code
828 public static final int AFAKA = 147;/* Afak */
830 * ISO 15924 script code
833 public static final int JURCHEN = 148;/* Jurc */
835 * ISO 15924 script code
838 public static final int MRO = 149;/* Mroo */
840 * ISO 15924 script code
843 public static final int NUSHU = 150;/* Nshu */
845 * ISO 15924 script code
848 public static final int SHARADA = 151;/* Shrd */
850 * ISO 15924 script code
853 public static final int SORA_SOMPENG = 152;/* Sora */
855 * ISO 15924 script code
858 public static final int TAKRI = 153;/* Takr */
860 * ISO 15924 script code
863 public static final int TANGUT = 154;/* Tang */
865 * ISO 15924 script code
868 public static final int WOLEAI = 155;/* Wole */
871 * ISO 15924 script code
874 public static final int ANATOLIAN_HIEROGLYPHS = 156;/* Hluw */
876 * ISO 15924 script code
879 public static final int KHOJKI = 157;/* Khoj */
881 * ISO 15924 script code
884 public static final int TIRHUTA = 158;/* Tirh */
886 * ISO 15924 script code
889 public static final int CAUCASIAN_ALBANIAN = 159; /* Aghb */
891 * ISO 15924 script code
894 public static final int MAHAJANI = 160; /* Mahj */
896 /* Private use codes from Qaaa - Qabx are not supported */
899 * One higher than the last ISO 15924 script code integer.
900 * This value will increase as ISO 15924 adds script codes
901 * for which integer constants are added above.
904 public static final int CODE_LIMIT = 161;
906 private static final String kLocaleScript = "LocaleScript";
908 //private static final String INVALID_NAME = "Invalid";
910 * Helper function to find the code from locale.
911 * @param locale The locale.
913 private static int[] findCodeFromLocale(ULocale locale) {
914 ICUResourceBundle rb;
917 rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BASE_NAME, locale);
918 } catch (MissingResourceException e) {
919 /* This part seems to never be called since "UResourceBundle.getBundleInstance"
920 * corrects this by setting to ICUResourceBundle.FROM_DEFAULT
921 * when such an invalid locale is passed.
928 rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BASE_NAME, locale);
930 // if rb is not a strict fallback of the requested locale, return null
931 //if(!LocaleUtility.isFallbackOf(rb.getULocale().toString(), locale.toString())){
934 //non existent locale check
935 if(rb.getLoadingStatus()==ICUResourceBundle.FROM_DEFAULT && ! locale.equals(ULocale.getDefault())){
938 UResourceBundle sub = rb.get(kLocaleScript);
940 int[] result = new int[sub.getSize()];
942 for (int i = 0; i < result.length; ++i) {
943 int code = UCharacter.getPropertyValueEnum(UProperty.SCRIPT,
949 if (w < result.length) {
950 throw new IllegalStateException("bad locale data, listed " +
951 result.length + " scripts but found only " + w);
958 * Gets a script codes associated with the given locale or ISO 15924 abbreviation or name.
959 * Returns MALAYAM given "Malayam" OR "Mlym".
960 * Returns LATIN given "en" OR "en_US"
961 * @param locale Locale
962 * @return The script codes array. null if the the code cannot be found.
965 public static final int[] getCode(Locale locale){
966 return findCodeFromLocale(ULocale.forLocale(locale));
969 * Gets a script codes associated with the given locale or ISO 15924 abbreviation or name.
970 * Returns MALAYAM given "Malayam" OR "Mlym".
971 * Returns LATIN given "en" OR "en_US"
972 * @param locale ULocale
973 * @return The script codes array. null if the the code cannot be found.
976 public static final int[] getCode(ULocale locale){
977 return findCodeFromLocale(locale);
980 * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
981 * Returns MALAYALAM given "Malayalam" OR "Mlym".
982 * Returns LATIN given "en" OR "en_US"
984 * <p>Note: To search by short or long script alias only, use
985 * UCharacter.getPropertyValueEnum(UProperty.SCRIPT, alias)
986 * instead. This does a fast lookup with no access of the locale
988 * @param nameOrAbbrOrLocale name of the script or ISO 15924 code or locale
989 * @return The script codes array. null if the code cannot be found.
992 public static final int[] getCode(String nameOrAbbrOrLocale){
995 UCharacter.getPropertyValueEnum(UProperty.SCRIPT,
998 } catch (IllegalArgumentException e) {
999 return findCodeFromLocale(new ULocale(nameOrAbbrOrLocale));
1004 * Gets the script code associated with the given ISO 15924 abbreviation or name.
1005 * Returns MALAYALAM given "Malayalam" OR "Mlym".
1007 * @param nameOrAbbr name of the script or ISO 15924 code
1008 * @return The script code value or INVALID_CODE if the code cannot be found.
1010 * @deprecated This API is ICU internal only.
1012 public static final int getCodeFromName(String nameOrAbbr) {
1014 return UCharacter.getPropertyValueEnum(UProperty.SCRIPT,
1016 } catch (IllegalArgumentException e) {
1017 return INVALID_CODE;
1022 * Gets the script code associated with the given codepoint.
1023 * Returns UScript.MALAYALAM given 0x0D02
1024 * @param codepoint UChar32 codepoint
1025 * @return The script code
1028 public static final int getScript(int codepoint){
1029 if (codepoint >= UCharacter.MIN_VALUE & codepoint <= UCharacter.MAX_VALUE) {
1030 int scriptX=UCharacterProperty.INSTANCE.getAdditional(codepoint, 0)&UCharacterProperty.SCRIPT_X_MASK;
1031 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
1033 } else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_INHERITED) {
1034 return UScript.COMMON;
1035 } else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_OTHER) {
1036 return UScript.INHERITED;
1038 return UCharacterProperty.INSTANCE.m_scriptExtensions_[scriptX&UCharacterProperty.SCRIPT_MASK_];
1041 throw new IllegalArgumentException(Integer.toString(codepoint));
1046 * Do the Script_Extensions of code point c contain script sc?
1047 * If c does not have explicit Script_Extensions, then this tests whether
1048 * c has the Script property value sc.
1050 * <p>Some characters are commonly used in multiple scripts.
1051 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
1053 * <p>The Script_Extensions property is provisional. It may be modified or removed
1054 * in future versions of the Unicode Standard, and thus in ICU.
1055 * @param c code point
1056 * @param sc script code
1057 * @return true if sc is in Script_Extensions(c)
1060 public static final boolean hasScript(int c, int sc) {
1061 int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
1062 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
1066 char[] scriptExtensions=UCharacterProperty.INSTANCE.m_scriptExtensions_;
1067 int scx=scriptX&UCharacterProperty.SCRIPT_MASK_; // index into scriptExtensions
1068 if(scriptX>=UCharacterProperty.SCRIPT_X_WITH_OTHER) {
1069 scx=scriptExtensions[scx+1];
1072 // Guard against bogus input that would
1073 // make us go past the Script_Extensions terminator.
1076 while(sc>scriptExtensions[scx]) {
1079 return sc==(scriptExtensions[scx]&0x7fff);
1083 * Sets code point c's Script_Extensions as script code integers into the output BitSet.
1085 * <li>If c does have Script_Extensions, then the return value is
1086 * the negative number of Script_Extensions codes (= -set.cardinality());
1087 * in this case, the Script property value
1088 * (normally Common or Inherited) is not included in the set.
1089 * <li>If c does not have Script_Extensions, then the one Script code is put into the set
1090 * and also returned.
1091 * <li>If c is not a valid code point, then the one {@link #UNKNOWN} code is put into the set
1092 * and also returned.
1094 * In other words, if the return value is non-negative, it is c's single Script code
1095 * and the set contains exactly this Script code.
1096 * If the return value is -n, then the set contains c's n>=2 Script_Extensions script codes.
1098 * <p>Some characters are commonly used in multiple scripts.
1099 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
1101 * <p>The Script_Extensions property is provisional. It may be modified or removed
1102 * in future versions of the Unicode Standard, and thus in ICU.
1103 * @param c code point
1104 * @param set set of script code integers; will be cleared, then bits are set
1105 * corresponding to c's Script_Extensions
1106 * @return negative number of script codes in c's Script_Extensions,
1107 * or the non-negative single Script value
1110 public static final int getScriptExtensions(int c, BitSet set) {
1112 int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
1113 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
1118 char[] scriptExtensions=UCharacterProperty.INSTANCE.m_scriptExtensions_;
1119 int scx=scriptX&UCharacterProperty.SCRIPT_MASK_; // index into scriptExtensions
1120 if(scriptX>=UCharacterProperty.SCRIPT_X_WITH_OTHER) {
1121 scx=scriptExtensions[scx+1];
1126 sx=scriptExtensions[scx++];
1130 // length==set.cardinality()
1135 * Gets a script name associated with the given script code.
1136 * Returns "Malayalam" given MALAYALAM
1137 * @param scriptCode int script code
1138 * @return script name as a string in full as given in TR#24
1141 public static final String getName(int scriptCode){
1142 return UCharacter.getPropertyValueName(UProperty.SCRIPT,
1144 UProperty.NameChoice.LONG);
1148 * Gets the abbreviated script name associated with the given script code.
1149 * Returns "Mlym" given MALAYALAM
1150 * @param scriptCode int script code
1151 * @return script abbreviated name as a string as given in TR#24
1154 public static final String getShortName(int scriptCode){
1155 return UCharacter.getPropertyValueName(UProperty.SCRIPT,
1157 UProperty.NameChoice.SHORT);
1161 * Script metadata (script properties).
1162 * See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt
1164 private static final class ScriptMetadata {
1165 // 0 = NOT_ENCODED, no sample character, default false script properties.
1166 // Bits 20.. 0: sample character
1168 // Bits 23..21: usage
1169 private static final int UNKNOWN = 1 << 21;
1170 private static final int EXCLUSION = 2 << 21;
1171 private static final int LIMITED_USE = 3 << 21;
1172 private static final int ASPIRATIONAL = 4 << 21;
1173 private static final int RECOMMENDED = 5 << 21;
1175 // Bits 31..24: Single-bit flags
1176 private static final int RTL = 1 << 24;
1177 private static final int LB_LETTERS = 1 << 25;
1178 private static final int CASED = 1 << 26;
1180 private static final int SCRIPT_PROPS[] = {
1181 // Begin copy-paste output from
1182 // tools/trunk/unicode/py/parsescriptmetadata.py
1183 // or from icu/trunk/source/common/uscript_props.cpp
1184 0x0040 | UNKNOWN, // Zyyy
1185 0x0308 | UNKNOWN, // Zinh
1186 0x0628 | RECOMMENDED | RTL, // Arab
1187 0x0531 | RECOMMENDED | CASED, // Armn
1188 0x0995 | RECOMMENDED, // Beng
1189 0x3105 | RECOMMENDED | LB_LETTERS, // Bopo
1190 0x13C4 | LIMITED_USE, // Cher
1191 0x03E2 | EXCLUSION | CASED, // Copt
1192 0x042F | RECOMMENDED | CASED, // Cyrl
1193 0x10414 | EXCLUSION | CASED, // Dsrt
1194 0x0905 | RECOMMENDED, // Deva
1195 0x12A0 | RECOMMENDED, // Ethi
1196 0x10D3 | RECOMMENDED, // Geor
1197 0x10330 | EXCLUSION, // Goth
1198 0x03A9 | RECOMMENDED | CASED, // Grek
1199 0x0A95 | RECOMMENDED, // Gujr
1200 0x0A15 | RECOMMENDED, // Guru
1201 0x5B57 | RECOMMENDED | LB_LETTERS, // Hani
1202 0xAC00 | RECOMMENDED, // Hang
1203 0x05D0 | RECOMMENDED | RTL, // Hebr
1204 0x304B | RECOMMENDED | LB_LETTERS, // Hira
1205 0x0C95 | RECOMMENDED, // Knda
1206 0x30AB | RECOMMENDED | LB_LETTERS, // Kana
1207 0x1780 | RECOMMENDED | LB_LETTERS, // Khmr
1208 0x0EA5 | RECOMMENDED | LB_LETTERS, // Laoo
1209 0x004C | RECOMMENDED | CASED, // Latn
1210 0x0D15 | RECOMMENDED, // Mlym
1211 0x1826 | ASPIRATIONAL, // Mong
1212 0x1000 | RECOMMENDED | LB_LETTERS, // Mymr
1213 0x168F | EXCLUSION, // Ogam
1214 0x10300 | EXCLUSION, // Ital
1215 0x0B15 | RECOMMENDED, // Orya
1216 0x16A0 | EXCLUSION, // Runr
1217 0x0D85 | RECOMMENDED, // Sinh
1218 0x0710 | LIMITED_USE | RTL, // Syrc
1219 0x0B95 | RECOMMENDED, // Taml
1220 0x0C15 | RECOMMENDED, // Telu
1221 0x078C | RECOMMENDED | RTL, // Thaa
1222 0x0E17 | RECOMMENDED | LB_LETTERS, // Thai
1223 0x0F40 | RECOMMENDED, // Tibt
1224 0x14C0 | ASPIRATIONAL, // Cans
1225 0xA288 | ASPIRATIONAL | LB_LETTERS, // Yiii
1226 0x1703 | EXCLUSION, // Tglg
1227 0x1723 | EXCLUSION, // Hano
1228 0x1743 | EXCLUSION, // Buhd
1229 0x1763 | EXCLUSION, // Tagb
1230 0x2800 | UNKNOWN, // Brai
1231 0x10800 | EXCLUSION | RTL, // Cprt
1232 0x1900 | LIMITED_USE, // Limb
1233 0x10000 | EXCLUSION, // Linb
1234 0x10480 | EXCLUSION, // Osma
1235 0x10450 | EXCLUSION, // Shaw
1236 0x1950 | LIMITED_USE | LB_LETTERS, // Tale
1237 0x10380 | EXCLUSION, // Ugar
1239 0x1A00 | EXCLUSION, // Bugi
1240 0x2C00 | EXCLUSION | CASED, // Glag
1241 0x10A00 | EXCLUSION | RTL, // Khar
1242 0xA800 | LIMITED_USE, // Sylo
1243 0x1980 | LIMITED_USE | LB_LETTERS, // Talu
1244 0x2D30 | ASPIRATIONAL, // Tfng
1245 0x103A0 | EXCLUSION, // Xpeo
1246 0x1B05 | LIMITED_USE | LB_LETTERS, // Bali
1247 0x1BC0 | LIMITED_USE, // Batk
1249 0x11005 | EXCLUSION, // Brah
1250 0xAA00 | LIMITED_USE, // Cham
1255 0x13153 | EXCLUSION, // Egyp
1257 0x5B57 | RECOMMENDED | LB_LETTERS, // Hans
1258 0x5B57 | RECOMMENDED | LB_LETTERS, // Hant
1262 0xA984 | LIMITED_USE | LB_LETTERS, // Java
1263 0xA90A | LIMITED_USE, // Kali
1266 0x1C00 | LIMITED_USE, // Lepc
1268 0x0840 | LIMITED_USE | RTL, // Mand
1270 0x10980 | EXCLUSION | RTL, // Mero
1271 0x07CA | LIMITED_USE | RTL, // Nkoo
1272 0x10C00 | EXCLUSION | RTL, // Orkh
1274 0xA840 | EXCLUSION, // Phag
1275 0x10900 | EXCLUSION | RTL, // Phnx
1276 0x16F00 | ASPIRATIONAL, // Plrd
1283 0xA549 | LIMITED_USE, // Vaii
1285 0x12000 | EXCLUSION, // Xsux
1287 0xFDD0 | UNKNOWN, // Zzzz
1288 0x102A0 | EXCLUSION, // Cari
1289 0x304B | RECOMMENDED | LB_LETTERS, // Jpan
1290 0x1A20 | LIMITED_USE | LB_LETTERS, // Lana
1291 0x10280 | EXCLUSION, // Lyci
1292 0x10920 | EXCLUSION | RTL, // Lydi
1293 0x1C5A | LIMITED_USE, // Olck
1294 0xA930 | EXCLUSION, // Rjng
1295 0xA882 | LIMITED_USE, // Saur
1297 0x1B83 | LIMITED_USE, // Sund
1299 0xABC0 | LIMITED_USE, // Mtei
1300 0x10840 | EXCLUSION | RTL, // Armi
1301 0x10B00 | EXCLUSION | RTL, // Avst
1302 0x11103 | LIMITED_USE, // Cakm
1303 0xAC00 | RECOMMENDED, // Kore
1304 0x11083 | EXCLUSION, // Kthi
1306 0x10B60 | EXCLUSION | RTL, // Phli
1309 0x10B40 | EXCLUSION | RTL, // Prti
1310 0x0800 | EXCLUSION | RTL, // Samr
1311 0xAA80 | LIMITED_USE | LB_LETTERS, // Tavt
1314 0xA6A0 | LIMITED_USE, // Bamu
1315 0xA4D0 | LIMITED_USE, // Lisu
1317 0x10A60 | EXCLUSION | RTL, // Sarb
1325 0x109A0 | EXCLUSION | RTL, // Merc
1335 0x11183 | EXCLUSION, // Shrd
1336 0x110D0 | EXCLUSION, // Sora
1337 0x11680 | EXCLUSION, // Takr
1345 // End copy-paste from parsescriptmetadata.py
1348 private static final int getScriptProps(int script) {
1349 if (0 <= script && script < SCRIPT_PROPS.length) {
1350 return SCRIPT_PROPS[script];
1358 * Script usage constants.
1359 * See UAX #31 Unicode Identifier and Pattern Syntax.
1360 * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers
1363 * @provisional This API might change or be removed in a future release.
1365 public enum ScriptUsage {
1367 * Not encoded in Unicode.
1369 * @provisional This API might change or be removed in a future release.
1373 * Unknown script usage.
1375 * @provisional This API might change or be removed in a future release.
1379 * Candidate for Exclusion from Identifiers.
1381 * @provisional This API might change or be removed in a future release.
1385 * Limited Use script.
1387 * @provisional This API might change or be removed in a future release.
1391 * Aspirational Use script.
1393 * @provisional This API might change or be removed in a future release.
1397 * Recommended script.
1399 * @provisional This API might change or be removed in a future release.
1403 private static final ScriptUsage[] usageValues = ScriptUsage.values();
1406 * Returns the script sample character string.
1407 * This string normally consists of one code point but might be longer.
1408 * The string is empty if the script is not encoded.
1410 * @param script script code
1411 * @return the sample character string
1413 * @provisional This API might change or be removed in a future release.
1415 public static final String getSampleString(int script) {
1416 int sampleChar = ScriptMetadata.getScriptProps(script) & 0x1fffff;
1417 if(sampleChar != 0) {
1418 return new StringBuilder().appendCodePoint(sampleChar).toString();
1424 * Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.
1425 * Returns {@link ScriptUsage#NOT_ENCODED} if the script is not encoded in Unicode.
1427 * @param script script code
1428 * @return script usage
1431 * @provisional This API might change or be removed in a future release.
1433 public static final ScriptUsage getUsage(int script) {
1434 return usageValues[(ScriptMetadata.getScriptProps(script) >> 21) & 7];
1438 * Returns true if the script is written right-to-left.
1439 * For example, Arab and Hebr.
1441 * @param script script code
1442 * @return true if the script is right-to-left
1444 * @provisional This API might change or be removed in a future release.
1446 public static final boolean isRightToLeft(int script) {
1447 return (ScriptMetadata.getScriptProps(script) & ScriptMetadata.RTL) != 0;
1451 * Returns true if the script allows line breaks between letters (excluding hyphenation).
1452 * Such a script typically requires dictionary-based line breaking.
1453 * For example, Hani and Thai.
1455 * @param script script code
1456 * @return true if the script allows line breaks between letters
1458 * @provisional This API might change or be removed in a future release.
1460 public static final boolean breaksBetweenLetters(int script) {
1461 return (ScriptMetadata.getScriptProps(script) & ScriptMetadata.LB_LETTERS) != 0;
1465 * Returns true if in modern (or most recent) usage of the script case distinctions are customary.
1466 * For example, Latn and Cyrl.
1468 * @param script script code
1469 * @return true if the script is cased
1471 * @provisional This API might change or be removed in a future release.
1473 public static final boolean isCased(int script) {
1474 return (ScriptMetadata.getScriptProps(script) & ScriptMetadata.CASED) != 0;
1479 * Private Constructor. Never default construct