2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.impl;
9 import java.io.BufferedInputStream;
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.util.Locale;
13 import java.util.MissingResourceException;
15 import com.ibm.icu.lang.UCharacter;
16 import com.ibm.icu.lang.UCharacterCategory;
17 import com.ibm.icu.text.UTF16;
18 import com.ibm.icu.text.UnicodeSet;
21 * Internal class to manage character names.
22 * Since data for names are stored
23 * in an array of char, by default indexes used in this class are referring to
24 * a 2 byte count, unless otherwise stated. Cases where the index is referring
25 * to a byte count, the index is halved and depending on whether the index is
26 * even or odd, the MSB or LSB of the result char at the halved index is
27 * returned. For indexes to an array of int, the index is multiplied by 2,
28 * result char at the multiplied index and its following char is returned as an
30 * <a href=../lang/UCharacter.html>UCharacter</a> acts as a public facade for this class
31 * Note : 0 - 0x1F are control characters without names in Unicode 3.0
32 * @author Syn Wee Quek
36 public final class UCharacterName
38 // public data members ----------------------------------------------
41 * public singleton instance
43 public static final UCharacterName INSTANCE;
47 INSTANCE = new UCharacterName();
48 } catch (IOException e) {
50 throw new MissingResourceException("Could not construct UCharacterName. Missing unames.icu","","");
56 * Number of lines per group
59 public static final int LINES_PER_GROUP_ = 1 << 5;
61 * Maximum number of groups
63 public int m_groupcount_ = 0;
65 // public methods ---------------------------------------------------
68 * Retrieve the name of a Unicode code point.
69 * Depending on <code>choice</code>, the character name written into the
70 * buffer is the "modern" name or the name that was defined in Unicode
72 * The name contains only "invariant" characters
73 * like A-Z, 0-9, space, and '-'.
75 * @param ch the code point for which to get the name.
76 * @param choice Selector for which name to get.
77 * @return if code point is above 0x1fff, null is returned
79 public String getName(int ch, int choice)
81 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE ||
82 choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) {
88 result = getAlgName(ch, choice);
90 // getting normal character name
91 if (result == null || result.length() == 0) {
92 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
93 result = getExtendedName(ch);
95 result = getGroupName(ch, choice);
103 * Find a character by its name and return its code point value
104 * @param choice selector to indicate if argument name is a Unicode 1.0
105 * or the most current version
106 * @param name the name to search for
109 public int getCharFromName(int choice, String name)
111 // checks for illegal arguments
112 if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT ||
113 name == null || name.length() == 0) {
117 // try extended names first
118 int result = getExtendedChar(name.toLowerCase(Locale.ENGLISH), choice);
123 String upperCaseName = name.toUpperCase(Locale.ENGLISH);
124 // try algorithmic names first, if fails then try group names
125 // int result = getAlgorithmChar(choice, uppercasename);
127 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME ||
128 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME
131 if (m_algorithm_ != null) {
132 count = m_algorithm_.length;
134 for (count --; count >= 0; count --) {
135 result = m_algorithm_[count].getChar(upperCaseName);
142 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
143 result = getGroupChar(upperCaseName,
144 UCharacterNameChoice.UNICODE_CHAR_NAME);
146 result = getGroupChar(upperCaseName,
147 UCharacterNameChoice.CHAR_NAME_ALIAS);
151 result = getGroupChar(upperCaseName, choice);
156 // these are all UCharacterNameIterator use methods -------------------
159 * Reads a block of compressed lengths of 32 strings and expands them into
160 * offsets and lengths for each string. Lengths are stored with a
161 * variable-width encoding in consecutive nibbles:
162 * If a nibble<0xc, then it is the length itself (0 = empty string).
163 * If a nibble>=0xc, then it forms a length value with the following
165 * The offsets and lengths arrays must be at least 33 (one more) long
166 * because there is no check here at the end if the last nibble is still
168 * @param index of group string object in array
169 * @param offsets array to store the value of the string offsets
170 * @param lengths array to store the value of the string length
171 * @return next index of the data string immediately after the lengths
172 * in terms of byte address
174 public int getGroupLengths(int index, char offsets[], char lengths[])
176 char length = 0xffff;
180 index = index * m_groupsize_; // byte count offsets of group strings
181 int stringoffset = UCharacterUtility.toInt(
182 m_groupinfo_[index + OFFSET_HIGH_OFFSET_],
183 m_groupinfo_[index + OFFSET_LOW_OFFSET_]);
187 // all 32 lengths must be read to get the offset of the first group
189 for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) {
190 b = m_groupstring_[stringoffset];
195 n = (byte)((b >> shift) & 0x0F);
196 if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
197 length = (char)((n - 12) << 4);
200 if (length != 0xffff) {
201 lengths[i] = (char)((length | n) + 12);
204 lengths[i] = (char)n;
207 if (i < LINES_PER_GROUP_) {
208 offsets[i + 1] = (char)(offsets[i] + lengths[i]);
222 * Gets the name of the argument group index.
223 * UnicodeData.txt uses ';' as a field separator, so no field can contain
224 * ';' as part of its contents. In unames.icu, it is marked as
225 * token[';'] == -1 only if the semicolon is used in the data file - which
226 * is iff we have Unicode 1.0 names or ISO comments or aliases.
227 * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments/aliases
228 * although we know that it will never be part of a name.
229 * Equivalent to ICU4C's expandName.
230 * @param index of the group name string in byte count
231 * @param length of the group name string
232 * @param choice of Unicode 1.0 name or the most current name
233 * @return name of the group
235 public String getGroupName(int index, int length, int choice)
237 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME &&
238 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME
240 if (';' >= m_tokentable_.length || m_tokentable_[';'] == 0xFFFF) {
242 * skip the modern name if it is not requested _and_
243 * if the semicolon byte value is a character, not a token number
245 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice;
247 int oldindex = index;
248 index += UCharacterUtility.skipByteSubString(m_groupstring_,
249 index, length, (byte)';');
250 length -= (index - oldindex);
251 } while(--fieldIndex>0);
254 // the semicolon byte is a token number, therefore only modern
255 // names are stored in unames.dat and there is no such
256 // requested alternate name here
261 synchronized (m_utilStringBuffer_) {
262 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
265 for (int i = 0; i < length;) {
266 b = m_groupstring_[index + i];
269 if (b >= m_tokentable_.length) {
273 m_utilStringBuffer_.append(b); // implicit letter
276 token = m_tokentable_[b & 0x00ff];
277 if (token == 0xFFFE) {
278 // this is a lead byte for a double-byte token
279 token = m_tokentable_[b << 8 |
280 (m_groupstring_[index + i] & 0x00ff)];
283 if (token == 0xFFFF) {
285 // skip the semicolon if we are seeking extended
286 // names and there was no 2.0 name but there
288 if (m_utilStringBuffer_.length() == 0 && choice ==
289 UCharacterNameChoice.EXTENDED_CHAR_NAME) {
295 m_utilStringBuffer_.append((char)(b & 0x00ff));
297 else { // write token word
298 UCharacterUtility.getNullTermByteSubString(
299 m_utilStringBuffer_, m_tokenstring_, token);
304 if (m_utilStringBuffer_.length() > 0) {
305 return m_utilStringBuffer_.toString();
312 * Retrieves the extended name
314 public String getExtendedName(int ch)
316 String result = getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
317 if (result == null) {
318 // TODO: Return Name_Alias/control names for control codes 0..1F & 7F..9F.
319 result = getExtendedOr10Name(ch);
325 * Gets the group index for the codepoint, or the group before it.
326 * @param codepoint The codepoint index.
327 * @return group index containing codepoint or the group before it.
329 public int getGroup(int codepoint)
331 int endGroup = m_groupcount_;
332 int msb = getCodepointMSB(codepoint);
334 // binary search for the group of names that contains the one for
336 // find the group that contains codepoint, or the highest before it
337 while (result < endGroup - 1) {
338 int gindex = (result + endGroup) >> 1;
339 if (msb < getGroupMSB(gindex)) {
350 * Gets the extended and 1.0 name when the most current unicode names
352 * @param ch codepoint
353 * @return name of codepoint extended or 1.0
355 public String getExtendedOr10Name(int ch)
357 String result = null;
358 // TODO: Return Name_Alias/control names for control codes 0..1F & 7F..9F.
359 if (result == null) {
360 int type = getType(ch);
361 // Return unknown if the table of names above is not up to
363 if (type >= TYPE_NAMES_.length) {
364 result = UNKNOWN_TYPE_NAME_;
367 result = TYPE_NAMES_[type];
369 synchronized (m_utilStringBuffer_) {
370 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
371 m_utilStringBuffer_.append('<');
372 m_utilStringBuffer_.append(result);
373 m_utilStringBuffer_.append('-');
374 String chStr = Integer.toHexString(ch).toUpperCase(Locale.ENGLISH);
375 int zeros = 4 - chStr.length();
377 m_utilStringBuffer_.append('0');
380 m_utilStringBuffer_.append(chStr);
381 m_utilStringBuffer_.append('>');
382 result = m_utilStringBuffer_.toString();
389 * Gets the MSB from the group index
390 * @param gindex group index
391 * @return the MSB of the group if gindex is valid, -1 otherwise
393 public int getGroupMSB(int gindex)
395 if (gindex >= m_groupcount_) {
398 return m_groupinfo_[gindex * m_groupsize_];
402 * Gets the MSB of the codepoint
403 * @param codepoint The codepoint value.
404 * @return the MSB of the codepoint
406 public static int getCodepointMSB(int codepoint)
408 return codepoint >> GROUP_SHIFT_;
412 * Gets the maximum codepoint + 1 of the group
413 * @param msb most significant byte of the group
414 * @return limit codepoint of the group
416 public static int getGroupLimit(int msb)
418 return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_;
422 * Gets the minimum codepoint of the group
423 * @param msb most significant byte of the group
424 * @return minimum codepoint of the group
426 public static int getGroupMin(int msb)
428 return msb << GROUP_SHIFT_;
432 * Gets the offset to a group
433 * @param codepoint The codepoint value.
434 * @return offset to a group
436 public static int getGroupOffset(int codepoint)
438 return codepoint & GROUP_MASK_;
442 * Gets the minimum codepoint of a group
443 * @param codepoint The codepoint value.
444 * @return minimum codepoint in the group which codepoint belongs to
447 public static int getGroupMinFromCodepoint(int codepoint)
449 return codepoint & ~GROUP_MASK_;
454 * Get the Algorithm range length
455 * @return Algorithm range length
457 public int getAlgorithmLength()
459 return m_algorithm_.length;
463 * Gets the start of the range
464 * @param index algorithm index
465 * @return algorithm range start
467 public int getAlgorithmStart(int index)
469 return m_algorithm_[index].m_rangestart_;
473 * Gets the end of the range
474 * @param index algorithm index
475 * @return algorithm range end
477 public int getAlgorithmEnd(int index)
479 return m_algorithm_[index].m_rangeend_;
483 * Gets the Algorithmic name of the codepoint
484 * @param index algorithmic range index
485 * @param codepoint The codepoint value.
486 * @return algorithmic name of codepoint
488 public String getAlgorithmName(int index, int codepoint)
490 String result = null;
491 synchronized (m_utilStringBuffer_) {
492 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
493 m_algorithm_[index].appendName(codepoint, m_utilStringBuffer_);
494 result = m_utilStringBuffer_.toString();
500 * Gets the group name of the character
501 * @param ch character to get the group name
502 * @param choice name choice selector to choose a unicode 1.0 or newer name
504 public synchronized String getGroupName(int ch, int choice)
507 int msb = getCodepointMSB(ch);
508 int group = getGroup(ch);
510 // return this if it is an exact match
511 if (msb == m_groupinfo_[group * m_groupsize_]) {
512 int index = getGroupLengths(group, m_groupoffsets_,
514 int offset = ch & GROUP_MASK_;
515 return getGroupName(index + m_groupoffsets_[offset],
516 m_grouplengths_[offset], choice);
522 // these are transliterator use methods ---------------------------------
525 * Gets the maximum length of any codepoint name.
526 * Equivalent to uprv_getMaxCharNameLength.
527 * @return the maximum length of any codepoint name
529 public int getMaxCharNameLength()
531 if (initNameSetsLengths()) {
532 return m_maxNameLength_;
540 * Gets the maximum length of any iso comments.
541 * Equivalent to uprv_getMaxISOCommentLength.
542 * @return the maximum length of any codepoint name
545 public int getMaxISOCommentLength()
547 if (initNameSetsLengths()) {
548 return m_maxISOCommentLength_;
557 * Fills set with characters that are used in Unicode character names.
558 * Equivalent to uprv_getCharNameCharacters.
559 * @param set USet to receive characters. Existing contents are deleted.
561 public void getCharNameCharacters(UnicodeSet set)
563 convert(m_nameSet_, set);
567 * Fills set with characters that are used in Unicode character names.
568 * Equivalent to uprv_getISOCommentCharacters.
569 * @param set USet to receive characters. Existing contents are deleted.
572 public void getISOCommentCharacters(UnicodeSet set)
574 convert(m_ISOCommentSet_, set);
578 // package private inner class --------------------------------------
581 * Algorithmic name class
583 static final class AlgorithmName
585 // package private data members ----------------------------------
588 * Constant type value of the different AlgorithmName
590 static final int TYPE_0_ = 0;
591 static final int TYPE_1_ = 1;
593 // package private constructors ----------------------------------
602 // package private methods ---------------------------------------
605 * Sets the information for accessing the algorithmic names
606 * @param rangestart starting code point that lies within this name group
607 * @param rangeend end code point that lies within this name group
608 * @param type algorithm type. There's 2 kinds of algorithmic type. First
609 * which uses code point as part of its name and the other uses
610 * variant postfix strings
611 * @param variant algorithmic variant
612 * @return true if values are valid
614 boolean setInfo(int rangestart, int rangeend, byte type, byte variant)
616 if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend
617 && rangeend <= UCharacter.MAX_VALUE &&
618 (type == TYPE_0_ || type == TYPE_1_)) {
619 m_rangestart_ = rangestart;
620 m_rangeend_ = rangeend;
622 m_variant_ = variant;
629 * Sets the factor data
630 * @param factor Array of factor
631 * @return true if factors are valid
633 boolean setFactor(char factor[])
635 if (factor.length == m_variant_) {
643 * Sets the name prefix
645 * @return true if prefix is set
647 boolean setPrefix(String prefix)
649 if (prefix != null && prefix.length() > 0) {
657 * Sets the variant factorized name data
658 * @param string variant factorized name data
659 * @return true if values are set
661 boolean setFactorString(byte string[])
663 // factor and variant string can be empty for things like
664 // Hangul code points
665 m_factorstring_ = string;
670 * Checks if code point lies in Algorithm object at index
671 * @param ch code point
673 boolean contains(int ch)
675 return m_rangestart_ <= ch && ch <= m_rangeend_;
679 * Appends algorithm name of code point into StringBuffer.
680 * Note this method does not check for validity of code point in Algorithm,
681 * result is undefined if code point does not belong in Algorithm.
682 * @param ch code point
683 * @param str StringBuffer to append to
685 void appendName(int ch, StringBuffer str)
687 str.append(m_prefix_);
691 // prefix followed by hex digits indicating variants
692 str.append(Utility.hex(ch,m_variant_));
695 // prefix followed by factorized-elements
696 int offset = ch - m_rangestart_;
697 int indexes[] = m_utilIntBuffer_;
700 // write elements according to the factors
701 // the factorized elements are determined by modulo
703 synchronized (m_utilIntBuffer_) {
704 for (int i = m_variant_ - 1; i > 0; i --)
706 factor = m_factor_[i] & 0x00FF;
707 indexes[i] = offset % factor;
711 // we don't need to calculate the last modulus because
712 // start <= code <= end guarantees here that
713 // code <= factors[0]
716 // joining up the factorized strings
717 str.append(getFactorString(indexes, m_variant_));
724 * Gets the character for the argument algorithmic name
725 * @return the algorithmic char or -1 otherwise.
727 int getChar(String name)
729 int prefixlen = m_prefix_.length();
730 if (name.length() < prefixlen ||
731 !m_prefix_.equals(name.substring(0, prefixlen))) {
740 int result = Integer.parseInt(name.substring(prefixlen),
742 // does it fit into the range?
743 if (m_rangestart_ <= result && result <= m_rangeend_) {
747 catch (NumberFormatException e)
753 // repetitive suffix name comparison done here
754 // offset is the character code - start
755 for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++)
757 int offset = ch - m_rangestart_;
758 int indexes[] = m_utilIntBuffer_;
761 // write elements according to the factors
762 // the factorized elements are determined by modulo
764 synchronized (m_utilIntBuffer_) {
765 for (int i = m_variant_ - 1; i > 0; i --)
767 factor = m_factor_[i] & 0x00FF;
768 indexes[i] = offset % factor;
772 // we don't need to calculate the last modulus
773 // because start <= code <= end guarantees here that
774 // code <= factors[0]
777 // joining up the factorized strings
778 if (compareFactorString(indexes, m_variant_, name,
790 * Adds all chars in the set of algorithmic names into the set.
791 * Equivalent to part of calcAlgNameSetsLengths.
792 * @param set int set to add the chars of the algorithm names into
793 * @param maxlength maximum length to compare to
794 * @return the length that is either maxlength or the length of this
795 * algorithm name if it is longer than maxlength
797 int add(int set[], int maxlength)
800 int length = UCharacterName.add(set, m_prefix_);
803 // name = prefix + (range->variant times) hex-digits
805 length += m_variant_;
807 * addString(set, (const char *)(range + 1))
812 // name = prefix factorized-elements
813 // get the set and maximum factor suffix length for each
815 for (int i = m_variant_ - 1; i > 0; i --)
817 int maxfactorlength = 0;
819 for (int factor = m_factor_[i]; factor > 0; -- factor) {
820 synchronized (m_utilStringBuffer_) {
821 m_utilStringBuffer_.delete(0,
822 m_utilStringBuffer_.length());
824 = UCharacterUtility.getNullTermByteSubString(
826 m_factorstring_, count);
827 UCharacterName.add(set, m_utilStringBuffer_);
828 if (m_utilStringBuffer_.length()
832 = m_utilStringBuffer_.length();
836 length += maxfactorlength;
840 if (length > maxlength) {
846 // private data members ------------------------------------------
849 * Algorithmic data information
851 private int m_rangestart_;
852 private int m_rangeend_;
853 private byte m_type_;
854 private byte m_variant_;
855 private char m_factor_[];
856 private String m_prefix_;
857 private byte m_factorstring_[];
859 * Utility StringBuffer
861 private StringBuffer m_utilStringBuffer_ = new StringBuffer();
865 private int m_utilIntBuffer_[] = new int[256];
867 // private methods -----------------------------------------------
870 * Gets the indexth string in each of the argument factor block
871 * @param index array with each index corresponding to each factor block
872 * @param length length of the array index
873 * @return the combined string of the array of indexth factor string in
876 private String getFactorString(int index[], int length)
878 int size = m_factor_.length;
879 if (index == null || length != size) {
883 synchronized (m_utilStringBuffer_) {
884 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
888 for (int i = 0; i <= size; i ++) {
889 factor = m_factor_[i];
890 count = UCharacterUtility.skipNullTermByteSubString(
891 m_factorstring_, count, index[i]);
892 count = UCharacterUtility.getNullTermByteSubString(
893 m_utilStringBuffer_, m_factorstring_,
896 count = UCharacterUtility.skipNullTermByteSubString(
897 m_factorstring_, count,
898 factor - index[i] - 1);
901 return m_utilStringBuffer_.toString();
906 * Compares the indexth string in each of the argument factor block with
907 * the argument string
908 * @param index array with each index corresponding to each factor block
909 * @param length index array length
910 * @param str string to compare with
911 * @param offset of str to start comparison
912 * @return true if string matches
914 private boolean compareFactorString(int index[], int length, String str,
917 int size = m_factor_.length;
918 if (index == null || length != size)
922 int strcount = offset;
925 for (int i = 0; i <= size; i ++)
927 factor = m_factor_[i];
928 count = UCharacterUtility.skipNullTermByteSubString(
929 m_factorstring_, count, index[i]);
930 strcount = UCharacterUtility.compareNullTermByteSubString(str,
931 m_factorstring_, strcount, count);
937 count = UCharacterUtility.skipNullTermByteSubString(
938 m_factorstring_, count, factor - index[i]);
941 if (strcount != str.length()) {
948 // package private data members --------------------------------------
951 * Size of each groups
953 int m_groupsize_ = 0;
955 // package private methods --------------------------------------------
958 * Sets the token data
959 * @param token array of tokens
960 * @param tokenstring array of string values of the tokens
961 * @return false if there is a data error
963 boolean setToken(char token[], byte tokenstring[])
965 if (token != null && tokenstring != null && token.length > 0 &&
966 tokenstring.length > 0) {
967 m_tokentable_ = token;
968 m_tokenstring_ = tokenstring;
975 * Set the algorithm name information array
976 * @param alg Algorithm information array
977 * @return true if the group string offset has been set correctly
979 boolean setAlgorithm(AlgorithmName alg[])
981 if (alg != null && alg.length != 0) {
989 * Sets the number of group and size of each group in number of char
990 * @param count number of groups
991 * @param size size of group in char
992 * @return true if group size is set correctly
994 boolean setGroupCountSize(int count, int size)
996 if (count <= 0 || size <= 0) {
999 m_groupcount_ = count;
1000 m_groupsize_ = size;
1005 * Sets the group name data
1006 * @param group index information array
1007 * @param groupstring name information array
1008 * @return false if there is a data error
1010 boolean setGroup(char group[], byte groupstring[])
1012 if (group != null && groupstring != null && group.length > 0 &&
1013 groupstring.length > 0) {
1014 m_groupinfo_ = group;
1015 m_groupstring_ = groupstring;
1021 // private data members ----------------------------------------------
1024 * Data used in unames.icu
1026 private char m_tokentable_[];
1027 private byte m_tokenstring_[];
1028 private char m_groupinfo_[];
1029 private byte m_groupstring_[];
1030 private AlgorithmName m_algorithm_[];
1033 * Group use. Note - access must be synchronized.
1035 private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1];
1036 private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1];
1039 * Default name of the name datafile
1041 private static final String NAME_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/unames.icu";
1043 * Shift count to retrieve group information
1045 private static final int GROUP_SHIFT_ = 5;
1047 * Mask to retrieve the offset for a particular character within a group
1049 private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
1051 * Default buffer size of datafile
1053 private static final int NAME_BUFFER_SIZE_ = 100000;
1056 * Position of offsethigh in group information array
1058 private static final int OFFSET_HIGH_OFFSET_ = 1;
1061 * Position of offsetlow in group information array
1063 private static final int OFFSET_LOW_OFFSET_ = 2;
1065 * Double nibble indicator, any nibble > this number has to be combined
1066 * with its following nibble
1068 private static final int SINGLE_NIBBLE_MAX_ = 11;
1071 * Maximum length of character names (regular & 1.0).
1073 //private static int MAX_NAME_LENGTH_ = 0;
1075 * Maximum length of ISO comments.
1077 //private static int MAX_ISO_COMMENT_LENGTH_ = 0;
1080 * Set of chars used in character names (regular & 1.0).
1081 * Chars are platform-dependent (can be EBCDIC).
1083 private int m_nameSet_[] = new int[8];
1085 * Set of chars used in ISO comments. (regular & 1.0).
1086 * Chars are platform-dependent (can be EBCDIC).
1088 private int m_ISOCommentSet_[] = new int[8];
1090 * Utility StringBuffer
1092 private StringBuffer m_utilStringBuffer_ = new StringBuffer();
1094 * Utility int buffer
1096 private int m_utilIntBuffer_[] = new int[2];
1098 * Maximum ISO comment length
1100 private int m_maxISOCommentLength_;
1102 * Maximum name length
1104 private int m_maxNameLength_;
1106 * Type names used for extended names
1108 private static final String TYPE_NAMES_[] = {"unassigned",
1116 "combining spacing mark",
1117 "decimal digit number",
1122 "paragraph separator",
1128 "start punctuation",
1130 "connector punctuation",
1131 "other punctuation",
1136 "initial punctuation",
1137 "final punctuation",
1144 private static final String UNKNOWN_TYPE_NAME_ = "unknown";
1146 * Not a character type
1148 private static final int NON_CHARACTER_
1149 = UCharacterCategory.CHAR_CATEGORY_COUNT;
1151 * Lead surrogate type
1153 private static final int LEAD_SURROGATE_
1154 = UCharacterCategory.CHAR_CATEGORY_COUNT + 1;
1156 * Trail surrogate type
1158 private static final int TRAIL_SURROGATE_
1159 = UCharacterCategory.CHAR_CATEGORY_COUNT + 2;
1161 * Extended category count
1163 static final int EXTENDED_CATEGORY_
1164 = UCharacterCategory.CHAR_CATEGORY_COUNT + 3;
1166 // private constructor ------------------------------------------------
1169 * <p>Protected constructor for use in UCharacter.</p>
1170 * @exception IOException thrown when data reading fails
1172 private UCharacterName() throws IOException
1174 InputStream is = ICUData.getRequiredStream(NAME_FILE_NAME_);
1175 BufferedInputStream b = new BufferedInputStream(is, NAME_BUFFER_SIZE_);
1176 UCharacterNameReader reader = new UCharacterNameReader(b);
1181 // private methods ---------------------------------------------------
1184 * Gets the algorithmic name for the argument character
1185 * @param ch character to determine name for
1186 * @param choice name choice
1187 * @return the algorithmic name or null if not found
1189 private String getAlgName(int ch, int choice)
1191 /* Only the normative character name can be algorithmic. */
1192 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME ||
1193 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME
1195 // index in terms integer index
1196 synchronized (m_utilStringBuffer_) {
1197 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
1199 for (int index = m_algorithm_.length - 1; index >= 0; index --)
1201 if (m_algorithm_[index].contains(ch)) {
1202 m_algorithm_[index].appendName(ch, m_utilStringBuffer_);
1203 return m_utilStringBuffer_.toString();
1212 * Getting the character with the tokenized argument name
1213 * @param name of the character
1214 * @return character with the tokenized argument name or -1 if character
1217 private synchronized int getGroupChar(String name, int choice)
1219 for (int i = 0; i < m_groupcount_; i ++) {
1220 // populating the data set of grouptable
1222 int startgpstrindex = getGroupLengths(i, m_groupoffsets_,
1225 // shift out to function
1226 int result = getGroupChar(startgpstrindex, m_grouplengths_, name,
1229 return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_)
1237 * Compares and retrieve character if name is found within the argument
1239 * @param index index where the set of names reside in the group block
1240 * @param length list of lengths of the strings
1241 * @param name character name to search for
1242 * @param choice of either 1.0 or the most current unicode name
1243 * @return relative character in the group which matches name, otherwise if
1244 * not found, -1 will be returned
1246 private int getGroupChar(int index, char length[], String name,
1252 int namelen = name.length();
1256 for (int result = 0; result <= LINES_PER_GROUP_; result ++) {
1258 len = length[result];
1260 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME &&
1261 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME
1264 * skip the modern name if it is not requested _and_
1265 * if the semicolon byte value is a character, not a token number
1267 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice;
1269 int oldindex = index;
1270 index += UCharacterUtility.skipByteSubString(m_groupstring_,
1271 index, len, (byte)';');
1272 len -= (index - oldindex);
1273 } while(--fieldIndex>0);
1276 // number of tokens is > the length of the name
1277 // write each letter directly, and write a token word per token
1278 for (count = 0; count < len && nindex != -1 && nindex < namelen;
1280 b = m_groupstring_[index + count];
1283 if (b >= m_tokentable_.length) {
1284 if (name.charAt(nindex ++) != (b & 0xFF)) {
1289 token = m_tokentable_[b & 0xFF];
1290 if (token == 0xFFFE) {
1291 // this is a lead byte for a double-byte token
1292 token = m_tokentable_[b << 8 |
1293 (m_groupstring_[index + count] & 0x00ff)];
1296 if (token == 0xFFFF) {
1297 if (name.charAt(nindex ++) != (b & 0xFF)) {
1302 // compare token with name
1303 nindex = UCharacterUtility.compareNullTermByteSubString(
1304 name, m_tokenstring_, nindex, token);
1309 if (namelen == nindex &&
1310 (count == len || m_groupstring_[index + count] == ';')) {
1320 * Gets the character extended type
1321 * @param ch character to be tested
1322 * @return extended type it is associated with
1324 private static int getType(int ch)
1326 if (UCharacterUtility.isNonCharacter(ch)) {
1327 // not a character we return a invalid category count
1328 return NON_CHARACTER_;
1330 int result = UCharacter.getType(ch);
1331 if (result == UCharacterCategory.SURROGATE) {
1332 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
1333 result = LEAD_SURROGATE_;
1336 result = TRAIL_SURROGATE_;
1343 * Getting the character with extended name of the form <....>.
1344 * @param name of the character to be found
1345 * @param choice name choice
1346 * @return character associated with the name, -1 if such character is not
1347 * found and -2 if we should continue with the search.
1349 private static int getExtendedChar(String name, int choice)
1351 if (name.charAt(0) == '<') {
1352 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
1353 int endIndex = name.length() - 1;
1354 if (name.charAt(endIndex) == '>') {
1355 int startIndex = name.lastIndexOf('-');
1356 if (startIndex >= 0) { // We've got a category.
1360 result = Integer.parseInt(
1361 name.substring(startIndex, endIndex),
1364 catch (NumberFormatException e) {
1367 // Now validate the category name. We could use a
1368 // binary search, or a trie, if we really wanted to.
1369 String type = name.substring(1, startIndex - 1);
1370 int length = TYPE_NAMES_.length;
1371 for (int i = 0; i < length; ++ i) {
1372 if (type.compareTo(TYPE_NAMES_[i]) == 0) {
1373 if (getType(result) == i) {
1387 // sets of name characters, maximum name lengths -----------------------
1390 * Adds a codepoint into a set of ints.
1391 * Equivalent to SET_ADD.
1392 * @param set set to add to
1393 * @param ch 16 bit char to add
1395 private static void add(int set[], char ch)
1397 set[ch >>> 5] |= 1 << (ch & 0x1f);
1401 * Checks if a codepoint is a part of a set of ints.
1402 * Equivalent to SET_CONTAINS.
1403 * @param set set to check in
1404 * @param ch 16 bit char to check
1405 * @return true if codepoint is part of the set, false otherwise
1407 private static boolean contains(int set[], char ch)
1409 return (set[ch >>> 5] & (1 << (ch & 0x1f))) != 0;
1413 * Adds all characters of the argument str and gets the length
1414 * Equivalent to calcStringSetLength.
1415 * @param set set to add all chars of str to
1416 * @param str string to add
1418 private static int add(int set[], String str)
1420 int result = str.length();
1422 for (int i = result - 1; i >= 0; i --) {
1423 add(set, str.charAt(i));
1429 * Adds all characters of the argument str and gets the length
1430 * Equivalent to calcStringSetLength.
1431 * @param set set to add all chars of str to
1432 * @param str string to add
1434 private static int add(int set[], StringBuffer str)
1436 int result = str.length();
1438 for (int i = result - 1; i >= 0; i --) {
1439 add(set, str.charAt(i));
1445 * Adds all algorithmic names into the name set.
1446 * Equivalent to part of calcAlgNameSetsLengths.
1447 * @param maxlength length to compare to
1448 * @return the maximum length of any possible algorithmic name if it is >
1449 * maxlength, otherwise maxlength is returned.
1451 private int addAlgorithmName(int maxlength)
1454 for (int i = m_algorithm_.length - 1; i >= 0; i --) {
1455 result = m_algorithm_[i].add(m_nameSet_, maxlength);
1456 if (result > maxlength) {
1464 * Adds all extended names into the name set.
1465 * Equivalent to part of calcExtNameSetsLengths.
1466 * @param maxlength length to compare to
1467 * @return the maxlength of any possible extended name.
1469 private int addExtendedName(int maxlength)
1471 for (int i = TYPE_NAMES_.length - 1; i >= 0; i --) {
1472 // for each category, count the length of the category name
1476 // 6 for most hex digits per code point
1477 int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]);
1478 if (length > maxlength) {
1486 * Adds names of a group to the argument set.
1487 * Equivalent to calcNameSetLength.
1488 * @param offset of the group name string in byte count
1489 * @param length of the group name string
1490 * @param tokenlength array to store the length of each token
1491 * @param set to add to
1492 * @return the length of the name string and the length of the group
1495 private int[] addGroupName(int offset, int length, byte tokenlength[],
1498 int resultnlength = 0;
1499 int resultplength = 0;
1500 while (resultplength < length) {
1501 char b = (char)(m_groupstring_[offset + resultplength] & 0xff);
1507 if (b >= m_tokentable_.length) {
1508 add(set, b); // implicit letter
1512 char token = m_tokentable_[b & 0x00ff];
1513 if (token == 0xFFFE) {
1514 // this is a lead byte for a double-byte token
1515 b = (char)(b << 8 | (m_groupstring_[offset + resultplength]
1517 token = m_tokentable_[b];
1520 if (token == 0xFFFF) {
1526 // use cached token length
1527 byte tlength = tokenlength[b];
1529 synchronized (m_utilStringBuffer_) {
1530 m_utilStringBuffer_.delete(0,
1531 m_utilStringBuffer_.length());
1532 UCharacterUtility.getNullTermByteSubString(
1533 m_utilStringBuffer_, m_tokenstring_,
1535 tlength = (byte)add(set, m_utilStringBuffer_);
1537 tokenlength[b] = tlength;
1539 resultnlength += tlength;
1543 m_utilIntBuffer_[0] = resultnlength;
1544 m_utilIntBuffer_[1] = resultplength;
1545 return m_utilIntBuffer_;
1549 * Adds names of all group to the argument set.
1550 * Sets the data member m_max*Length_.
1551 * Method called only once.
1552 * Equivalent to calcGroupNameSetsLength.
1553 * @param maxlength length to compare to
1555 private void addGroupName(int maxlength)
1557 int maxisolength = 0;
1558 char offsets[] = new char[LINES_PER_GROUP_ + 2];
1559 char lengths[] = new char[LINES_PER_GROUP_ + 2];
1560 byte tokenlengths[] = new byte[m_tokentable_.length];
1562 // enumerate all groups
1563 // for (int i = m_groupcount_ - 1; i >= 0; i --) {
1564 for (int i = 0; i < m_groupcount_ ; i ++) {
1565 int offset = getGroupLengths(i, offsets, lengths);
1566 // enumerate all lines in each group
1567 // for (int linenumber = LINES_PER_GROUP_ - 1; linenumber >= 0;
1569 for (int linenumber = 0; linenumber < LINES_PER_GROUP_;
1571 int lineoffset = offset + offsets[linenumber];
1572 int length = lengths[linenumber];
1577 // read regular name
1578 int parsed[] = addGroupName(lineoffset, length, tokenlengths,
1580 if (parsed[0] > maxlength) {
1581 // 0 for name length
1582 maxlength = parsed[0];
1584 lineoffset += parsed[1];
1585 if (parsed[1] >= length) {
1586 // 1 for parsed group string length
1589 length -= parsed[1];
1590 // read Unicode 1.0 name
1591 parsed = addGroupName(lineoffset, length, tokenlengths,
1593 if (parsed[0] > maxlength) {
1594 // 0 for name length
1595 maxlength = parsed[0];
1597 lineoffset += parsed[1];
1598 if (parsed[1] >= length) {
1599 // 1 for parsed group string length
1602 length -= parsed[1];
1604 parsed = addGroupName(lineoffset, length, tokenlengths,
1606 if (parsed[1] > maxisolength) {
1607 maxisolength = length;
1612 // set gMax... - name length last for threading
1613 m_maxISOCommentLength_ = maxisolength;
1614 m_maxNameLength_ = maxlength;
1618 * Sets up the name sets and the calculation of the maximum lengths.
1619 * Equivalent to calcNameSetsLengths.
1621 private boolean initNameSetsLengths()
1623 if (m_maxNameLength_ > 0) {
1627 String extra = "0123456789ABCDEF<>-";
1628 // set hex digits, used in various names, and <>-, used in extended
1630 for (int i = extra.length() - 1; i >= 0; i --) {
1631 add(m_nameSet_, extra.charAt(i));
1634 // set sets and lengths from algorithmic names
1635 m_maxNameLength_ = addAlgorithmName(0);
1636 // set sets and lengths from extended names
1637 m_maxNameLength_ = addExtendedName(m_maxNameLength_);
1638 // set sets and lengths from group names, set global maximum values
1639 addGroupName(m_maxNameLength_);
1644 * Converts the char set cset into a Unicode set uset.
1645 * Equivalent to charSetToUSet.
1646 * @param set Set of 256 bit flags corresponding to a set of chars.
1647 * @param uset USet to receive characters. Existing contents are deleted.
1649 private void convert(int set[], UnicodeSet uset)
1652 if (!initNameSetsLengths()) {
1656 // build a char string with all chars that are used in character names
1657 for (char c = 255; c > 0; c --) {
1658 if (contains(set, c)) {