2 *******************************************************************************
3 * Copyright (C) 2001-2011 International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 package com.ibm.icu.lang;
10 import java.util.BitSet;
11 import java.util.Locale;
12 import java.util.MissingResourceException;
14 import com.ibm.icu.impl.ICUResourceBundle;
15 import com.ibm.icu.impl.UCharacterProperty;
16 import com.ibm.icu.util.ULocale;
17 import com.ibm.icu.util.UResourceBundle;
20 * A class to reflect UTR #24: Script Names
21 * (based on ISO 15924:2000, "Code for the representation of names of
22 * scripts"). UTR #24 describes the basis for a new Unicode data file,
26 public final class UScript {
31 public static final int INVALID_CODE = -1;
36 public static final int COMMON = 0; /* Zyyy */
41 public static final int INHERITED = 1; /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */
46 public static final int ARABIC = 2; /* Arab */
51 public static final int ARMENIAN = 3; /* Armn */
56 public static final int BENGALI = 4; /* Beng */
61 public static final int BOPOMOFO = 5; /* Bopo */
66 public static final int CHEROKEE = 6; /* Cher */
71 public static final int COPTIC = 7; /* Qaac */
76 public static final int CYRILLIC = 8; /* Cyrl (Cyrs) */
81 public static final int DESERET = 9; /* Dsrt */
86 public static final int DEVANAGARI = 10; /* Deva */
91 public static final int ETHIOPIC = 11; /* Ethi */
96 public static final int GEORGIAN = 12; /* Geor (Geon; Geoa) */
101 public static final int GOTHIC = 13; /* Goth */
106 public static final int GREEK = 14; /* Grek */
111 public static final int GUJARATI = 15; /* Gujr */
116 public static final int GURMUKHI = 16; /* Guru */
121 public static final int HAN = 17; /* Hani */
126 public static final int HANGUL = 18; /* Hang */
131 public static final int HEBREW = 19; /* Hebr */
136 public static final int HIRAGANA = 20; /* Hira */
141 public static final int KANNADA = 21; /* Knda */
146 public static final int KATAKANA = 22; /* Kana */
151 public static final int KHMER = 23; /* Khmr */
156 public static final int LAO = 24; /* Laoo */
161 public static final int LATIN = 25; /* Latn (Latf; Latg) */
166 public static final int MALAYALAM = 26; /* Mlym */
171 public static final int MONGOLIAN = 27; /* Mong */
176 public static final int MYANMAR = 28; /* Mymr */
181 public static final int OGHAM = 29; /* Ogam */
186 public static final int OLD_ITALIC = 30; /* Ital */
191 public static final int ORIYA = 31; /* Orya */
196 public static final int RUNIC = 32; /* Runr */
201 public static final int SINHALA = 33; /* Sinh */
206 public static final int SYRIAC = 34; /* Syrc (Syrj; Syrn; Syre) */
211 public static final int TAMIL = 35; /* Taml */
216 public static final int TELUGU = 36; /* Telu */
221 public static final int THAANA = 37; /* Thaa */
226 public static final int THAI = 38; /* Thai */
231 public static final int TIBETAN = 39; /* Tibt */
233 * Unified Canadian Aboriginal Syllabics
236 public static final int CANADIAN_ABORIGINAL = 40; /* Cans */
238 * Unified Canadian Aboriginal Syllabics (alias)
241 public static final int UCAS = CANADIAN_ABORIGINAL; /* Cans */
246 public static final int YI = 41; /* Yiii */
251 public static final int TAGALOG = 42; /* Tglg */
256 public static final int HANUNOO = 43; /* Hano */
261 public static final int BUHID = 44; /* Buhd */
266 public static final int TAGBANWA = 45; /* Tagb */
269 * Script in Unicode 4
273 public static final int BRAILLE = 46; /* Brai */
276 * Script in Unicode 4
280 public static final int CYPRIOT = 47; /* Cprt */
283 * Script in Unicode 4
287 public static final int LIMBU = 48; /* Limb */
290 * Script in Unicode 4
294 public static final int LINEAR_B = 49; /* Linb */
297 * Script in Unicode 4
301 public static final int OSMANYA = 50; /* Osma */
304 * Script in Unicode 4
308 public static final int SHAVIAN = 51; /* Shaw */
311 * Script in Unicode 4
315 public static final int TAI_LE = 52; /* Tale */
318 * Script in Unicode 4
322 public static final int UGARITIC = 53; /* Ugar */
324 * Script in Unicode 4.0.1
327 public static final int KATAKANA_OR_HIRAGANA = 54; /*Hrkt */
330 * Script in Unicode 4.1
333 public static final int BUGINESE = 55; /* Bugi */
335 * Script in Unicode 4.1
338 public static final int GLAGOLITIC = 56; /* Glag */
340 * Script in Unicode 4.1
343 public static final int KHAROSHTHI = 57; /* Khar */
345 * Script in Unicode 4.1
348 public static final int SYLOTI_NAGRI = 58; /* Sylo */
350 * Script in Unicode 4.1
353 public static final int NEW_TAI_LUE = 59; /* Talu */
355 * Script in Unicode 4.1
358 public static final int TIFINAGH = 60; /* Tfng */
360 * Script in Unicode 4.1
363 public static final int OLD_PERSIAN = 61; /* Xpeo */
367 * ISO 15924 script code
370 public static final int BALINESE = 62; /* Bali */
372 * ISO 15924 script code
375 public static final int BATAK = 63; /* Batk */
377 * ISO 15924 script code
380 public static final int BLISSYMBOLS = 64; /* Blis */
382 * ISO 15924 script code
385 public static final int BRAHMI = 65; /* Brah */
387 * ISO 15924 script code
390 public static final int CHAM = 66; /* Cham */
392 * ISO 15924 script code
395 public static final int CIRTH = 67; /* Cirt */
397 * ISO 15924 script code
400 public static final int OLD_CHURCH_SLAVONIC_CYRILLIC = 68; /* Cyrs */
402 * ISO 15924 script code
405 public static final int DEMOTIC_EGYPTIAN = 69; /* Egyd */
407 * ISO 15924 script code
410 public static final int HIERATIC_EGYPTIAN = 70; /* Egyh */
412 * ISO 15924 script code
415 public static final int EGYPTIAN_HIEROGLYPHS = 71; /* Egyp */
417 * ISO 15924 script code
420 public static final int KHUTSURI = 72; /* Geok */
422 * ISO 15924 script code
425 public static final int SIMPLIFIED_HAN = 73; /* Hans */
427 * ISO 15924 script code
430 public static final int TRADITIONAL_HAN = 74; /* Hant */
432 * ISO 15924 script code
435 public static final int PAHAWH_HMONG = 75; /* Hmng */
437 * ISO 15924 script code
440 public static final int OLD_HUNGARIAN = 76; /* Hung */
442 * ISO 15924 script code
445 public static final int HARAPPAN_INDUS = 77; /* Inds */
447 * ISO 15924 script code
450 public static final int JAVANESE = 78; /* Java */
452 * ISO 15924 script code
455 public static final int KAYAH_LI = 79; /* Kali */
457 * ISO 15924 script code
460 public static final int LATIN_FRAKTUR = 80; /* Latf */
462 * ISO 15924 script code
465 public static final int LATIN_GAELIC = 81; /* Latg */
467 * ISO 15924 script code
470 public static final int LEPCHA = 82; /* Lepc */
472 * ISO 15924 script code
475 public static final int LINEAR_A = 83; /* Lina */
477 * ISO 15924 script code
480 public static final int MANDAIC = 84; /* Mand */
482 * ISO 15924 script code
485 public static final int MANDAEAN = MANDAIC;
487 * ISO 15924 script code
490 public static final int MAYAN_HIEROGLYPHS = 85; /* Maya */
492 * ISO 15924 script code
495 public static final int MEROITIC_HIEROGLYPHS = 86; /* Mero */
497 * ISO 15924 script code
500 public static final int MEROITIC = MEROITIC_HIEROGLYPHS;
502 * ISO 15924 script code
505 public static final int NKO = 87; /* Nkoo */
507 * ISO 15924 script code
510 public static final int ORKHON = 88; /* Orkh */
512 * ISO 15924 script code
515 public static final int OLD_PERMIC = 89; /* Perm */
517 * ISO 15924 script code
520 public static final int PHAGS_PA = 90; /* Phag */
522 * ISO 15924 script code
525 public static final int PHOENICIAN = 91; /* Phnx */
527 * ISO 15924 script code
530 public static final int PHONETIC_POLLARD = 92; /* Plrd */
532 * ISO 15924 script code
535 public static final int RONGORONGO = 93; /* Roro */
537 * ISO 15924 script code
540 public static final int SARATI = 94; /* Sara */
542 * ISO 15924 script code
545 public static final int ESTRANGELO_SYRIAC = 95; /* Syre */
547 * ISO 15924 script code
550 public static final int WESTERN_SYRIAC = 96; /* Syrj */
552 * ISO 15924 script code
555 public static final int EASTERN_SYRIAC = 97; /* Syrn */
557 * ISO 15924 script code
560 public static final int TENGWAR = 98; /* Teng */
562 * ISO 15924 script code
565 public static final int VAI = 99; /* Vaii */
567 * ISO 15924 script code
570 public static final int VISIBLE_SPEECH = 100;/* Visp */
572 * ISO 15924 script code
575 public static final int CUNEIFORM = 101;/* Xsux */
577 * ISO 15924 script code
580 public static final int UNWRITTEN_LANGUAGES = 102;/* Zxxx */
582 * ISO 15924 script code
585 public static final int UNKNOWN = 103;/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */
587 /* Private use codes from Qaaa - Qabx are not supported*/
589 * ISO 15924 script code
592 public static final int CARIAN = 104;/* Cari */
594 * ISO 15924 script code
597 public static final int JAPANESE = 105;/* Jpan */
599 * ISO 15924 script code
602 public static final int LANNA = 106;/* Lana */
604 * ISO 15924 script code
607 public static final int LYCIAN = 107;/* Lyci */
609 * ISO 15924 script code
612 public static final int LYDIAN = 108;/* Lydi */
614 * ISO 15924 script code
617 public static final int OL_CHIKI = 109;/* Olck */
619 * ISO 15924 script code
622 public static final int REJANG = 110;/* Rjng */
624 * ISO 15924 script code
627 public static final int SAURASHTRA = 111;/* Saur */
629 * ISO 15924 script code
632 public static final int SIGN_WRITING = 112;/* Sgnw */
634 * ISO 15924 script code
637 public static final int SUNDANESE = 113;/* Sund */
639 * ISO 15924 script code
642 public static final int MOON = 114;/* Moon */
644 * ISO 15924 script code
647 public static final int MEITEI_MAYEK = 115;/* Mtei */
650 * ISO 15924 script code
653 public static final int IMPERIAL_ARAMAIC = 116;/* Armi */
656 * ISO 15924 script code
659 public static final int AVESTAN = 117;/* Avst */
662 * ISO 15924 script code
665 public static final int CHAKMA = 118;/* Cakm */
668 * ISO 15924 script code
671 public static final int KOREAN = 119;/* Kore */
674 * ISO 15924 script code
677 public static final int KAITHI = 120;/* Kthi */
680 * ISO 15924 script code
683 public static final int MANICHAEAN = 121;/* Mani */
686 * ISO 15924 script code
689 public static final int INSCRIPTIONAL_PAHLAVI = 122;/* Phli */
692 * ISO 15924 script code
695 public static final int PSALTER_PAHLAVI = 123;/* Phlp */
698 * ISO 15924 script code
701 public static final int BOOK_PAHLAVI = 124;/* Phlv */
704 * ISO 15924 script code
707 public static final int INSCRIPTIONAL_PARTHIAN = 125;/* Prti */
710 * ISO 15924 script code
713 public static final int SAMARITAN = 126;/* Samr */
716 * ISO 15924 script code
719 public static final int TAI_VIET = 127;/* Tavt */
722 * ISO 15924 script code
725 public static final int MATHEMATICAL_NOTATION = 128;/* Zmth */
728 * ISO 15924 script code
731 public static final int SYMBOLS = 129;/* Zsym */
734 * ISO 15924 script code
737 public static final int BAMUM = 130;/* Bamu */
739 * ISO 15924 script code
742 public static final int LISU = 131;/* Lisu */
744 * ISO 15924 script code
747 public static final int NAKHI_GEBA = 132;/* Nkgb */
749 * ISO 15924 script code
752 public static final int OLD_SOUTH_ARABIAN = 133;/* Sarb */
755 * ISO 15924 script code
758 public static final int BASSA_VAH = 134;/* Bass */
760 * ISO 15924 script code
763 public static final int DUPLOYAN_SHORTAND = 135;/* Dupl */
765 * ISO 15924 script code
768 public static final int ELBASAN = 136;/* Elba */
770 * ISO 15924 script code
773 public static final int GRANTHA = 137;/* Gran */
775 * ISO 15924 script code
778 public static final int KPELLE = 138;/* Kpel */
780 * ISO 15924 script code
783 public static final int LOMA = 139;/* Loma */
785 * ISO 15924 script code
788 public static final int MENDE = 140;/* Mend */
790 * ISO 15924 script code
793 public static final int MEROITIC_CURSIVE = 141;/* Merc */
795 * ISO 15924 script code
798 public static final int OLD_NORTH_ARABIAN = 142;/* Narb */
800 * ISO 15924 script code
803 public static final int NABATAEAN = 143;/* Nbat */
805 * ISO 15924 script code
808 public static final int PALMYRENE = 144;/* Palm */
810 * ISO 15924 script code
813 public static final int SINDHI = 145;/* Sind */
815 * ISO 15924 script code
818 public static final int WARANG_CITI = 146;/* Wara */
821 * ISO 15924 script code
824 public static final int AFAKA = 147;/* Afak */
826 * ISO 15924 script code
829 public static final int JURCHEN = 148;/* Jurc */
831 * ISO 15924 script code
834 public static final int MRO = 149;/* Mroo */
836 * ISO 15924 script code
839 public static final int NUSHU = 150;/* Nshu */
841 * ISO 15924 script code
844 public static final int SHARADA = 151;/* Shrd */
846 * ISO 15924 script code
849 public static final int SORA_SOMPENG = 152;/* Sora */
851 * ISO 15924 script code
854 public static final int TAKRI = 153;/* Takr */
856 * ISO 15924 script code
859 public static final int TANGUT = 154;/* Tang */
861 * ISO 15924 script code
864 public static final int WOLEAI = 155;/* Wole */
867 * One higher than the last ISO 15924 script code integer.
868 * This value will increase as ISO 15924 adds script codes
869 * for which integer constants are added above.
872 public static final int CODE_LIMIT = 156;
874 private static final String kLocaleScript = "LocaleScript";
876 //private static final String INVALID_NAME = "Invalid";
878 * Helper function to find the code from locale.
879 * @param locale The locale.
881 private static int[] findCodeFromLocale(ULocale locale) {
882 ICUResourceBundle rb;
885 rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BASE_NAME, locale);
886 } catch (MissingResourceException e) {
887 /* This part seems to never be called since "UResourceBundle.getBundleInstance"
888 * corrects this by setting to ICUResourceBundle.FROM_DEFAULT
889 * when such an invalid locale is passed.
896 rb = (ICUResourceBundle)UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_BASE_NAME, locale);
898 // if rb is not a strict fallback of the requested locale, return null
899 //if(!LocaleUtility.isFallbackOf(rb.getULocale().toString(), locale.toString())){
902 //non existent locale check
903 if(rb.getLoadingStatus()==ICUResourceBundle.FROM_DEFAULT && ! locale.equals(ULocale.getDefault())){
906 UResourceBundle sub = rb.get(kLocaleScript);
908 int[] result = new int[sub.getSize()];
910 for (int i = 0; i < result.length; ++i) {
911 int code = UCharacter.getPropertyValueEnum(UProperty.SCRIPT,
917 if (w < result.length) {
918 throw new IllegalStateException("bad locale data, listed " +
919 result.length + " scripts but found only " + w);
926 * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
927 * Returns MALAYALAM given "Malayalam" OR "Mlym".
928 * Returns LATIN given "en" OR "en_US"
929 * @param locale Locale
930 * @return The script codes array. null if the code cannot be found.
933 public static final int[] getCode(Locale locale){
934 return findCodeFromLocale(ULocale.forLocale(locale));
937 * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
938 * Returns MALAYALAM given "Malayalam" OR "Mlym".
939 * Returns LATIN given "en" OR "en_US"
940 * @param locale ULocale
941 * @return The script codes array. null if the code cannot be found.
944 public static final int[] getCode(ULocale locale){
945 return findCodeFromLocale(locale);
948 * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
949 * Returns MALAYALAM given "Malayalam" OR "Mlym".
950 * Returns LATIN given "en" OR "en_US"
952 * <p>Note: To search by short or long script alias only, use
953 * UCharacter.getPropertyValueEnum(UProperty.SCRIPT, alias)
954 * instead. This does a fast lookup with no access of the locale
956 * @param nameOrAbbrOrLocale name of the script or ISO 15924 code or locale
957 * @return The script codes array. null if the code cannot be found.
960 public static final int[] getCode(String nameOrAbbrOrLocale){
963 UCharacter.getPropertyValueEnum(UProperty.SCRIPT,
966 } catch (IllegalArgumentException e) {
967 return findCodeFromLocale(new ULocale(nameOrAbbrOrLocale));
972 * Gets the script code associated with the given ISO 15924 abbreviation or name.
973 * Returns MALAYALAM given "Malayalam" OR "Mlym".
975 * @param nameOrAbbr name of the script or ISO 15924 code
976 * @return The script code value or INVALID_CODE if the code cannot be found.
978 * @deprecated This API is ICU internal only.
980 public static final int getCodeFromName(String nameOrAbbr) {
982 return UCharacter.getPropertyValueEnum(UProperty.SCRIPT,
984 } catch (IllegalArgumentException e) {
990 * Gets the script code associated with the given codepoint.
991 * Returns UScript.MALAYALAM given 0x0D02
992 * @param codepoint UChar32 codepoint
993 * @return The script code
996 public static final int getScript(int codepoint){
997 if (codepoint >= UCharacter.MIN_VALUE & codepoint <= UCharacter.MAX_VALUE) {
998 int scriptX=UCharacterProperty.INSTANCE.getAdditional(codepoint, 0)&UCharacterProperty.SCRIPT_X_MASK;
999 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
1001 } else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_INHERITED) {
1002 return UScript.COMMON;
1003 } else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_OTHER) {
1004 return UScript.INHERITED;
1006 return UCharacterProperty.INSTANCE.m_scriptExtensions_[scriptX&UCharacterProperty.SCRIPT_MASK_];
1009 throw new IllegalArgumentException(Integer.toString(codepoint));
1014 * Is code point c used in script sc?
1015 * That is, does code point c have the Script property value sc,
1016 * or do code point c's Script_Extensions include script code sc?
1018 * Some characters are commonly used in multiple scripts.
1019 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
1021 * The Script_Extensions property is provisional. It may be modified or removed
1022 * in future versions of the Unicode Standard, and thus in ICU.
1023 * @param c code point
1024 * @param sc script code
1025 * @return true if Script(c)==sc or sc is in Script_Extensions(c)
1027 * @provisional This API might change or be removed in a future release.
1029 public static final boolean hasScript(int c, int sc) {
1030 int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
1031 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
1035 char[] scriptExtensions=UCharacterProperty.INSTANCE.m_scriptExtensions_;
1036 int scx=scriptX&UCharacterProperty.SCRIPT_MASK_; // index into scriptExtensions
1038 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_INHERITED) {
1039 script=UScript.COMMON;
1040 } else if(scriptX<UCharacterProperty.SCRIPT_X_WITH_OTHER) {
1041 script=UScript.INHERITED;
1043 script=scriptExtensions[scx];
1044 scx=scriptExtensions[scx+1];
1049 while(sc>scriptExtensions[scx]) {
1052 return sc==(scriptExtensions[scx]&0x7fff);
1056 * Sets code point c's Script_Extensions as script code integers into the output BitSet.
1058 * Some characters are commonly used in multiple scripts.
1059 * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
1061 * The Script_Extensions property is provisional. It may be modified or removed
1062 * in future versions of the Unicode Standard, and thus in ICU.
1063 * @param c code point
1064 * @param set set of script code integers; will be cleared, then bits are set
1065 * corresponding to c's Script_Extensions
1068 * @provisional This API might change or be removed in a future release.
1070 public static final BitSet getScriptExtensions(int c, BitSet set) {
1072 int scriptX=UCharacterProperty.INSTANCE.getAdditional(c, 0)&UCharacterProperty.SCRIPT_X_MASK;
1073 if(scriptX<UCharacterProperty.SCRIPT_X_WITH_COMMON) {
1077 char[] scriptExtensions=UCharacterProperty.INSTANCE.m_scriptExtensions_;
1078 int scx=scriptX&UCharacterProperty.SCRIPT_MASK_; // index into scriptExtensions
1079 if(scriptX>=UCharacterProperty.SCRIPT_X_WITH_OTHER) {
1080 scx=scriptExtensions[scx+1];
1084 sx=scriptExtensions[scx++];
1091 * Gets a script name associated with the given script code.
1092 * Returns "Malayalam" given MALAYALAM
1093 * @param scriptCode int script code
1094 * @return script name as a string in full as given in TR#24
1097 public static final String getName(int scriptCode){
1098 return UCharacter.getPropertyValueName(UProperty.SCRIPT,
1100 UProperty.NameChoice.LONG);
1104 * Gets a script name associated with the given script code.
1105 * Returns "Mlym" given MALAYALAM
1106 * @param scriptCode int script code
1107 * @return script abbreviated name as a string as given in TR#24
1110 public static final String getShortName(int scriptCode){
1111 return UCharacter.getPropertyValueName(UProperty.SCRIPT,
1113 UProperty.NameChoice.SHORT);
1117 * Private Constructor. Never default construct