2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.BufferedInputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
12 import java.util.MissingResourceException;
\r
14 import com.ibm.icu.lang.UCharacter;
\r
15 import com.ibm.icu.lang.UCharacterCategory;
\r
16 import com.ibm.icu.text.UTF16;
\r
17 import com.ibm.icu.text.UnicodeSet;
\r
/**
 * Internal class to manage character names.
 * Since the data for names is stored in an array of char, indexes used in
 * this class refer by default to a 2 byte count, unless otherwise stated.
 * In cases where the index refers to a byte count, the index is halved and,
 * depending on whether the index is even or odd, the MSB or LSB of the
 * result char at the halved index is returned. For indexes to an array of
 * int, the index is multiplied by 2, and the result char at the multiplied
 * index and its following char are returned as an int.
 * <a href=../lang/UCharacter.html>UCharacter</a> acts as a public facade for this class
 * Note : 0 - 0x1F are control characters without names in Unicode 3.0
 * @author Syn Wee Quek
 */
\r
35 public final class UCharacterName
\r
37 // public data members ----------------------------------------------
\r
/**
 * Public singleton instance, created when this class is initialized.
 */
public static final UCharacterName INSTANCE;

static {
    try {
        INSTANCE = new UCharacterName();
    } catch (IOException e) {
        // MissingResourceException has no cause-taking constructor, so the
        // underlying I/O failure is attached explicitly rather than dropped.
        MissingResourceException failure = new MissingResourceException(
            "Could not construct UCharacterName. Missing unames.icu", "", "");
        failure.initCause(e);
        throw failure;
    }
}
\r
55 * Number of lines per group
\r
58 public static final int LINES_PER_GROUP_ = 1 << 5;
\r
60 * Maximum number of groups
\r
62 public int m_groupcount_ = 0;
\r
64 // public methods ---------------------------------------------------
\r
67 * Retrieve the name of a Unicode code point.
\r
68 * Depending on <code>choice</code>, the character name written into the
\r
69 * buffer is the "modern" name or the name that was defined in Unicode
\r
71 * The name contains only "invariant" characters
\r
72 * like A-Z, 0-9, space, and '-'.
\r
74 * @param ch the code point for which to get the name.
\r
75 * @param choice Selector for which name to get.
\r
76 * @return if code point is above 0x1fff, null is returned
\r
78 public String getName(int ch, int choice)
\r
80 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE ||
\r
81 choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) {
\r
85 String result = null;
\r
87 result = getAlgName(ch, choice);
\r
89 // getting normal character name
\r
90 if (result == null || result.length() == 0) {
\r
91 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
92 result = getExtendedName(ch);
\r
94 result = getGroupName(ch, choice);
\r
102 * Find a character by its name and return its code point value
\r
103 * @param choice selector to indicate if argument name is a Unicode 1.0
\r
104 * or the most current version
\r
105 * @param name the name to search for
\r
106 * @return code point
\r
108 public int getCharFromName(int choice, String name)
\r
110 // checks for illegal arguments
\r
111 if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT ||
\r
112 name == null || name.length() == 0) {
\r
116 // try extended names first
\r
117 int result = getExtendedChar(name.toLowerCase(), choice);
\r
118 if (result >= -1) {
\r
122 String upperCaseName = name.toUpperCase();
\r
123 // try algorithmic names first, if fails then try group names
\r
124 // int result = getAlgorithmChar(choice, uppercasename);
\r
126 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME ||
\r
127 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME
\r
130 if (m_algorithm_ != null) {
\r
131 count = m_algorithm_.length;
\r
133 for (count --; count >= 0; count --) {
\r
134 result = m_algorithm_[count].getChar(upperCaseName);
\r
141 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
142 result = getGroupChar(upperCaseName,
\r
143 UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
144 if (result == -1) {
\r
145 result = getGroupChar(upperCaseName,
\r
146 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
148 if (result == -1) {
\r
149 result = getGroupChar(upperCaseName,
\r
150 UCharacterNameChoice.CHAR_NAME_ALIAS);
\r
154 result = getGroupChar(upperCaseName, choice);
\r
159 // these are all UCharacterNameIterator use methods -------------------
\r
162 * Reads a block of compressed lengths of 32 strings and expands them into
\r
163 * offsets and lengths for each string. Lengths are stored with a
\r
164 * variable-width encoding in consecutive nibbles:
\r
165 * If a nibble<0xc, then it is the length itself (0 = empty string).
\r
166 * If a nibble>=0xc, then it forms a length value with the following
\r
168 * The offsets and lengths arrays must be at least 33 (one more) long
\r
169 * because there is no check here at the end if the last nibble is still
\r
171 * @param index of group string object in array
\r
172 * @param offsets array to store the value of the string offsets
\r
173 * @param lengths array to store the value of the string length
\r
174 * @return next index of the data string immediately after the lengths
\r
175 * in terms of byte address
\r
177 public int getGroupLengths(int index, char offsets[], char lengths[])
\r
179 char length = 0xffff;
\r
183 index = index * m_groupsize_; // byte count offsets of group strings
\r
184 int stringoffset = UCharacterUtility.toInt(
\r
185 m_groupinfo_[index + OFFSET_HIGH_OFFSET_],
\r
186 m_groupinfo_[index + OFFSET_LOW_OFFSET_]);
\r
190 // all 32 lengths must be read to get the offset of the first group
\r
192 for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) {
\r
193 b = m_groupstring_[stringoffset];
\r
196 while (shift >= 0) {
\r
198 n = (byte)((b >> shift) & 0x0F);
\r
199 if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
\r
200 length = (char)((n - 12) << 4);
\r
203 if (length != 0xffff) {
\r
204 lengths[i] = (char)((length | n) + 12);
\r
207 lengths[i] = (char)n;
\r
210 if (i < LINES_PER_GROUP_) {
\r
211 offsets[i + 1] = (char)(offsets[i] + lengths[i]);
\r
221 return stringoffset;
\r
225 * Gets the name of the argument group index.
\r
226 * UnicodeData.txt uses ';' as a field separator, so no field can contain
\r
227 * ';' as part of its contents. In unames.icu, it is marked as
\r
228 * token[';'] == -1 only if the semicolon is used in the data file - which
\r
229 * is iff we have Unicode 1.0 names or ISO comments or aliases.
\r
230 * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments/aliases
\r
231 * although we know that it will never be part of a name.
\r
232 * Equivalent to ICU4C's expandName.
\r
233 * @param index of the group name string in byte count
\r
234 * @param length of the group name string
\r
235 * @param choice of Unicode 1.0 name or the most current name
\r
236 * @return name of the group
\r
238 public String getGroupName(int index, int length, int choice)
\r
240 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME &&
\r
241 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME
\r
243 if (';' >= m_tokentable_.length || m_tokentable_[';'] == 0xFFFF) {
\r
245 * skip the modern name if it is not requested _and_
\r
246 * if the semicolon byte value is a character, not a token number
\r
248 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice;
\r
250 int oldindex = index;
\r
251 index += UCharacterUtility.skipByteSubString(m_groupstring_,
\r
252 index, length, (byte)';');
\r
253 length -= (index - oldindex);
\r
254 } while(--fieldIndex>0);
\r
257 // the semicolon byte is a token number, therefore only modern
\r
258 // names are stored in unames.dat and there is no such
\r
259 // requested alternate name here
\r
264 synchronized (m_utilStringBuffer_) {
\r
265 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
268 for (int i = 0; i < length;) {
\r
269 b = m_groupstring_[index + i];
\r
272 if (b >= m_tokentable_.length) {
\r
276 m_utilStringBuffer_.append(b); // implicit letter
\r
279 token = m_tokentable_[b & 0x00ff];
\r
280 if (token == 0xFFFE) {
\r
281 // this is a lead byte for a double-byte token
\r
282 token = m_tokentable_[b << 8 |
\r
283 (m_groupstring_[index + i] & 0x00ff)];
\r
286 if (token == 0xFFFF) {
\r
288 // skip the semicolon if we are seeking extended
\r
289 // names and there was no 2.0 name but there
\r
291 if (m_utilStringBuffer_.length() == 0 && choice ==
\r
292 UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
298 m_utilStringBuffer_.append((char)(b & 0x00ff));
\r
300 else { // write token word
\r
301 UCharacterUtility.getNullTermByteSubString(
\r
302 m_utilStringBuffer_, m_tokenstring_, token);
\r
307 if (m_utilStringBuffer_.length() > 0) {
\r
308 return m_utilStringBuffer_.toString();
\r
315 * Retrieves the extended name
\r
317 public String getExtendedName(int ch)
\r
319 String result = getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
320 if (result == null) {
\r
321 if (getType(ch) == UCharacterCategory.CONTROL) {
\r
322 result = getName(ch,
\r
323 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
325 if (result == null) {
\r
326 result = getExtendedOr10Name(ch);
\r
333 * Gets the group index for the codepoint, or the group before it.
\r
334 * @param codepoint The codepoint index.
\r
335 * @return group index containing codepoint or the group before it.
\r
337 public int getGroup(int codepoint)
\r
339 int endGroup = m_groupcount_;
\r
340 int msb = getCodepointMSB(codepoint);
\r
342 // binary search for the group of names that contains the one for
\r
344 // find the group that contains codepoint, or the highest before it
\r
345 while (result < endGroup - 1) {
\r
346 int gindex = (result + endGroup) >> 1;
\r
347 if (msb < getGroupMSB(gindex)) {
\r
358 * Gets the extended and 1.0 name when the most current unicode names
\r
360 * @param ch codepoint
\r
361 * @return name of codepoint extended or 1.0
\r
363 public String getExtendedOr10Name(int ch)
\r
365 String result = null;
\r
366 if (getType(ch) == UCharacterCategory.CONTROL) {
\r
367 result = getName(ch,
\r
368 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
370 if (result == null) {
\r
371 int type = getType(ch);
\r
372 // Return unknown if the table of names above is not up to
\r
374 if (type >= TYPE_NAMES_.length) {
\r
375 result = UNKNOWN_TYPE_NAME_;
\r
378 result = TYPE_NAMES_[type];
\r
380 synchronized (m_utilStringBuffer_) {
\r
381 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
382 m_utilStringBuffer_.append('<');
\r
383 m_utilStringBuffer_.append(result);
\r
384 m_utilStringBuffer_.append('-');
\r
385 String chStr = Integer.toHexString(ch).toUpperCase();
\r
386 int zeros = 4 - chStr.length();
\r
387 while (zeros > 0) {
\r
388 m_utilStringBuffer_.append('0');
\r
391 m_utilStringBuffer_.append(chStr);
\r
392 m_utilStringBuffer_.append('>');
\r
393 result = m_utilStringBuffer_.toString();
\r
400 * Gets the MSB from the group index
\r
401 * @param gindex group index
\r
402 * @return the MSB of the group if gindex is valid, -1 otherwise
\r
404 public int getGroupMSB(int gindex)
\r
406 if (gindex >= m_groupcount_) {
\r
409 return m_groupinfo_[gindex * m_groupsize_];
\r
413 * Gets the MSB of the codepoint
\r
414 * @param codepoint The codepoint value.
\r
415 * @return the MSB of the codepoint
\r
417 public static int getCodepointMSB(int codepoint)
\r
419 return codepoint >> GROUP_SHIFT_;
\r
423 * Gets the maximum codepoint + 1 of the group
\r
424 * @param msb most significant byte of the group
\r
425 * @return limit codepoint of the group
\r
427 public static int getGroupLimit(int msb)
\r
429 return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_;
\r
433 * Gets the minimum codepoint of the group
\r
434 * @param msb most significant byte of the group
\r
435 * @return minimum codepoint of the group
\r
437 public static int getGroupMin(int msb)
\r
439 return msb << GROUP_SHIFT_;
\r
443 * Gets the offset to a group
\r
444 * @param codepoint The codepoint value.
\r
445 * @return offset to a group
\r
447 public static int getGroupOffset(int codepoint)
\r
449 return codepoint & GROUP_MASK_;
\r
453 * Gets the minimum codepoint of a group
\r
454 * @param codepoint The codepoint value.
\r
455 * @return minimum codepoint in the group which codepoint belongs to
\r
458 public static int getGroupMinFromCodepoint(int codepoint)
\r
460 return codepoint & ~GROUP_MASK_;
\r
465 * Get the Algorithm range length
\r
466 * @return Algorithm range length
\r
468 public int getAlgorithmLength()
\r
470 return m_algorithm_.length;
\r
474 * Gets the start of the range
\r
475 * @param index algorithm index
\r
476 * @return algorithm range start
\r
478 public int getAlgorithmStart(int index)
\r
480 return m_algorithm_[index].m_rangestart_;
\r
484 * Gets the end of the range
\r
485 * @param index algorithm index
\r
486 * @return algorithm range end
\r
488 public int getAlgorithmEnd(int index)
\r
490 return m_algorithm_[index].m_rangeend_;
\r
494 * Gets the Algorithmic name of the codepoint
\r
495 * @param index algorithmic range index
\r
496 * @param codepoint The codepoint value.
\r
497 * @return algorithmic name of codepoint
\r
499 public String getAlgorithmName(int index, int codepoint)
\r
501 String result = null;
\r
502 synchronized (m_utilStringBuffer_) {
\r
503 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
504 m_algorithm_[index].appendName(codepoint, m_utilStringBuffer_);
\r
505 result = m_utilStringBuffer_.toString();
\r
511 * Gets the group name of the character
\r
512 * @param ch character to get the group name
\r
513 * @param choice name choice selector to choose a unicode 1.0 or newer name
\r
515 public synchronized String getGroupName(int ch, int choice)
\r
518 int msb = getCodepointMSB(ch);
\r
519 int group = getGroup(ch);
\r
521 // return this if it is an exact match
\r
522 if (msb == m_groupinfo_[group * m_groupsize_]) {
\r
523 int index = getGroupLengths(group, m_groupoffsets_,
\r
525 int offset = ch & GROUP_MASK_;
\r
526 return getGroupName(index + m_groupoffsets_[offset],
\r
527 m_grouplengths_[offset], choice);
\r
533 // these are transliterator use methods ---------------------------------
\r
536 * Gets the maximum length of any codepoint name.
\r
537 * Equivalent to uprv_getMaxCharNameLength.
\r
538 * @return the maximum length of any codepoint name
\r
540 public int getMaxCharNameLength()
\r
542 if (initNameSetsLengths()) {
\r
543 return m_maxNameLength_;
\r
551 * Gets the maximum length of any iso comments.
\r
552 * Equivalent to uprv_getMaxISOCommentLength.
\r
553 * @return the maximum length of any codepoint name
\r
556 public int getMaxISOCommentLength()
\r
558 if (initNameSetsLengths()) {
\r
559 return m_maxISOCommentLength_;
\r
568 * Fills set with characters that are used in Unicode character names.
\r
569 * Equivalent to uprv_getCharNameCharacters.
\r
570 * @param set USet to receive characters. Existing contents are deleted.
\r
572 public void getCharNameCharacters(UnicodeSet set)
\r
574 convert(m_nameSet_, set);
\r
578 * Fills set with characters that are used in Unicode character names.
\r
579 * Equivalent to uprv_getISOCommentCharacters.
\r
580 * @param set USet to receive characters. Existing contents are deleted.
\r
583 public void getISOCommentCharacters(UnicodeSet set)
\r
585 convert(m_ISOCommentSet_, set);
\r
589 // package private inner class --------------------------------------
\r
592 * Algorithmic name class
\r
594 static final class AlgorithmName
\r
596 // package private data members ----------------------------------
\r
599 * Constant type value of the different AlgorithmName
\r
601 static final int TYPE_0_ = 0;
\r
602 static final int TYPE_1_ = 1;
\r
604 // package private constructors ----------------------------------
\r
613 // package private methods ---------------------------------------
\r
616 * Sets the information for accessing the algorithmic names
\r
617 * @param rangestart starting code point that lies within this name group
\r
618 * @param rangeend end code point that lies within this name group
\r
619 * @param type algorithm type. There's 2 kinds of algorithmic type. First
\r
620 * which uses code point as part of its name and the other uses
\r
621 * variant postfix strings
\r
622 * @param variant algorithmic variant
\r
623 * @return true if values are valid
\r
625 boolean setInfo(int rangestart, int rangeend, byte type, byte variant)
\r
627 if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend
\r
628 && rangeend <= UCharacter.MAX_VALUE &&
\r
629 (type == TYPE_0_ || type == TYPE_1_)) {
\r
630 m_rangestart_ = rangestart;
\r
631 m_rangeend_ = rangeend;
\r
633 m_variant_ = variant;
\r
640 * Sets the factor data
\r
641 * @param factor Array of factor
\r
642 * @return true if factors are valid
\r
644 boolean setFactor(char factor[])
\r
646 if (factor.length == m_variant_) {
\r
647 m_factor_ = factor;
\r
654 * Sets the name prefix
\r
656 * @return true if prefix is set
\r
658 boolean setPrefix(String prefix)
\r
660 if (prefix != null && prefix.length() > 0) {
\r
661 m_prefix_ = prefix;
\r
668 * Sets the variant factorized name data
\r
669 * @param string variant factorized name data
\r
670 * @return true if values are set
\r
672 boolean setFactorString(byte string[])
\r
674 // factor and variant string can be empty for things like
\r
675 // hanggul code points
\r
676 m_factorstring_ = string;
\r
681 * Checks if code point lies in Algorithm object at index
\r
682 * @param ch code point
\r
684 boolean contains(int ch)
\r
686 return m_rangestart_ <= ch && ch <= m_rangeend_;
\r
690 * Appends algorithm name of code point into StringBuffer.
\r
691 * Note this method does not check for validity of code point in Algorithm,
\r
692 * result is undefined if code point does not belong in Algorithm.
\r
693 * @param ch code point
\r
694 * @param str StringBuffer to append to
\r
696 void appendName(int ch, StringBuffer str)
\r
698 str.append(m_prefix_);
\r
702 // prefix followed by hex digits indicating variants
\r
703 str.append(Utility.hex(ch,m_variant_));
\r
706 // prefix followed by factorized-elements
\r
707 int offset = ch - m_rangestart_;
\r
708 int indexes[] = m_utilIntBuffer_;
\r
711 // write elements according to the factors
\r
712 // the factorized elements are determined by modulo
\r
714 synchronized (m_utilIntBuffer_) {
\r
715 for (int i = m_variant_ - 1; i > 0; i --)
\r
717 factor = m_factor_[i] & 0x00FF;
\r
718 indexes[i] = offset % factor;
\r
722 // we don't need to calculate the last modulus because
\r
723 // start <= code <= end guarantees here that
\r
724 // code <= factors[0]
\r
725 indexes[0] = offset;
\r
727 // joining up the factorized strings
\r
728 str.append(getFactorString(indexes, m_variant_));
\r
735 * Gets the character for the argument algorithmic name
\r
736 * @return the algorithmic char or -1 otherwise.
\r
738 int getChar(String name)
\r
740 int prefixlen = m_prefix_.length();
\r
741 if (name.length() < prefixlen ||
\r
742 !m_prefix_.equals(name.substring(0, prefixlen))) {
\r
751 int result = Integer.parseInt(name.substring(prefixlen),
\r
753 // does it fit into the range?
\r
754 if (m_rangestart_ <= result && result <= m_rangeend_) {
\r
758 catch (NumberFormatException e)
\r
764 // repetitative suffix name comparison done here
\r
765 // offset is the character code - start
\r
766 for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++)
\r
768 int offset = ch - m_rangestart_;
\r
769 int indexes[] = m_utilIntBuffer_;
\r
772 // write elements according to the factors
\r
773 // the factorized elements are determined by modulo
\r
775 synchronized (m_utilIntBuffer_) {
\r
776 for (int i = m_variant_ - 1; i > 0; i --)
\r
778 factor = m_factor_[i] & 0x00FF;
\r
779 indexes[i] = offset % factor;
\r
783 // we don't need to calculate the last modulus
\r
784 // because start <= code <= end guarantees here that
\r
785 // code <= factors[0]
\r
786 indexes[0] = offset;
\r
788 // joining up the factorized strings
\r
789 if (compareFactorString(indexes, m_variant_, name,
\r
801 * Adds all chars in the set of algorithmic names into the set.
\r
802 * Equivalent to part of calcAlgNameSetsLengths.
\r
803 * @param set int set to add the chars of the algorithm names into
\r
804 * @param maxlength maximum length to compare to
\r
805 * @return the length that is either maxlength of the length of this
\r
806 * algorithm name if it is longer than maxlength
\r
808 int add(int set[], int maxlength)
\r
811 int length = UCharacterName.add(set, m_prefix_);
\r
814 // name = prefix + (range->variant times) hex-digits
\r
816 length += m_variant_;
\r
818 * addString(set, (const char *)(range + 1))
\r
819 + range->variant;*/
\r
823 // name = prefix factorized-elements
\r
824 // get the set and maximum factor suffix length for each
\r
826 for (int i = m_variant_ - 1; i > 0; i --)
\r
828 int maxfactorlength = 0;
\r
830 for (int factor = m_factor_[i]; factor > 0; -- factor) {
\r
831 synchronized (m_utilStringBuffer_) {
\r
832 m_utilStringBuffer_.delete(0,
\r
833 m_utilStringBuffer_.length());
\r
835 = UCharacterUtility.getNullTermByteSubString(
\r
836 m_utilStringBuffer_,
\r
837 m_factorstring_, count);
\r
838 UCharacterName.add(set, m_utilStringBuffer_);
\r
839 if (m_utilStringBuffer_.length()
\r
843 = m_utilStringBuffer_.length();
\r
847 length += maxfactorlength;
\r
851 if (length > maxlength) {
\r
857 // private data members ------------------------------------------
\r
860 * Algorithmic data information
\r
862 private int m_rangestart_;
\r
863 private int m_rangeend_;
\r
864 private byte m_type_;
\r
865 private byte m_variant_;
\r
866 private char m_factor_[];
\r
867 private String m_prefix_;
\r
868 private byte m_factorstring_[];
\r
870 * Utility StringBuffer
\r
872 private StringBuffer m_utilStringBuffer_ = new StringBuffer();
\r
874 * Utility int buffer
\r
876 private int m_utilIntBuffer_[] = new int[256];
\r
878 // private methods -----------------------------------------------
\r
881 * Gets the indexth string in each of the argument factor block
\r
882 * @param index array with each index corresponding to each factor block
\r
883 * @param length length of the array index
\r
884 * @return the combined string of the array of indexth factor string in
\r
887 private String getFactorString(int index[], int length)
\r
889 int size = m_factor_.length;
\r
890 if (index == null || length != size) {
\r
894 synchronized (m_utilStringBuffer_) {
\r
895 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
899 for (int i = 0; i <= size; i ++) {
\r
900 factor = m_factor_[i];
\r
901 count = UCharacterUtility.skipNullTermByteSubString(
\r
902 m_factorstring_, count, index[i]);
\r
903 count = UCharacterUtility.getNullTermByteSubString(
\r
904 m_utilStringBuffer_, m_factorstring_,
\r
907 count = UCharacterUtility.skipNullTermByteSubString(
\r
908 m_factorstring_, count,
\r
909 factor - index[i] - 1);
\r
912 return m_utilStringBuffer_.toString();
\r
917 * Compares the indexth string in each of the argument factor block with
\r
918 * the argument string
\r
919 * @param index array with each index corresponding to each factor block
\r
920 * @param length index array length
\r
921 * @param str string to compare with
\r
922 * @param offset of str to start comparison
\r
923 * @return true if string matches
\r
925 private boolean compareFactorString(int index[], int length, String str,
\r
928 int size = m_factor_.length;
\r
929 if (index == null || length != size)
\r
933 int strcount = offset;
\r
936 for (int i = 0; i <= size; i ++)
\r
938 factor = m_factor_[i];
\r
939 count = UCharacterUtility.skipNullTermByteSubString(
\r
940 m_factorstring_, count, index[i]);
\r
941 strcount = UCharacterUtility.compareNullTermByteSubString(str,
\r
942 m_factorstring_, strcount, count);
\r
943 if (strcount < 0) {
\r
948 count = UCharacterUtility.skipNullTermByteSubString(
\r
949 m_factorstring_, count, factor - index[i]);
\r
952 if (strcount != str.length()) {
\r
959 // package private data members --------------------------------------
\r
962 * Size of each groups
\r
964 int m_groupsize_ = 0;
\r
966 // package private methods --------------------------------------------
\r
969 * Sets the token data
\r
970 * @param token array of tokens
\r
971 * @param tokenstring array of string values of the tokens
\r
972 * @return false if there is a data error
\r
974 boolean setToken(char token[], byte tokenstring[])
\r
976 if (token != null && tokenstring != null && token.length > 0 &&
\r
977 tokenstring.length > 0) {
\r
978 m_tokentable_ = token;
\r
979 m_tokenstring_ = tokenstring;
\r
986 * Set the algorithm name information array
\r
987 * @param alg Algorithm information array
\r
988 * @return true if the group string offset has been set correctly
\r
990 boolean setAlgorithm(AlgorithmName alg[])
\r
992 if (alg != null && alg.length != 0) {
\r
993 m_algorithm_ = alg;
\r
1000 * Sets the number of group and size of each group in number of char
\r
1001 * @param count number of groups
\r
1002 * @param size size of group in char
\r
1003 * @return true if group size is set correctly
\r
1005 boolean setGroupCountSize(int count, int size)
\r
1007 if (count <= 0 || size <= 0) {
\r
1010 m_groupcount_ = count;
\r
1011 m_groupsize_ = size;
\r
1016 * Sets the group name data
\r
1017 * @param group index information array
\r
1018 * @param groupstring name information array
\r
1019 * @return false if there is a data error
\r
1021 boolean setGroup(char group[], byte groupstring[])
\r
1023 if (group != null && groupstring != null && group.length > 0 &&
\r
1024 groupstring.length > 0) {
\r
1025 m_groupinfo_ = group;
\r
1026 m_groupstring_ = groupstring;
\r
1032 // private data members ----------------------------------------------
\r
/**
 * Data used in unames.icu
 */
private char m_tokentable_[];
private byte m_tokenstring_[];
private char m_groupinfo_[];
private byte m_groupstring_[];
private AlgorithmName m_algorithm_[];

/**
 * Group use. Note - access must be synchronized.
 */
private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1];
private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1];

/**
 * Default name of the name datafile
 */
private static final String NAME_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/unames.icu";
/**
 * Shift count to retrieve group information
 */
private static final int GROUP_SHIFT_ = 5;
/**
 * Mask to retrieve the offset for a particular character within a group
 */
private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
/**
 * Default buffer size of datafile
 */
private static final int NAME_BUFFER_SIZE_ = 100000;

/**
 * Position of offsethigh in group information array
 */
private static final int OFFSET_HIGH_OFFSET_ = 1;

/**
 * Position of offsetlow in group information array
 */
private static final int OFFSET_LOW_OFFSET_ = 2;
/**
 * Double nibble indicator, any nibble > this number has to be combined
 * with its following nibble
 */
private static final int SINGLE_NIBBLE_MAX_ = 11;

/*
 * Maximum length of character names (regular & 1.0).
 */
//private static int MAX_NAME_LENGTH_ = 0;
/*
 * Maximum length of ISO comments.
 */
//private static int MAX_ISO_COMMENT_LENGTH_ = 0;

/**
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
private int m_nameSet_[] = new int[8];
/**
 * Set of chars used in ISO comments. (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
private int m_ISOCommentSet_[] = new int[8];
/**
 * Utility StringBuffer
 */
private StringBuffer m_utilStringBuffer_ = new StringBuffer();
/**
 * Utility int buffer
 */
private int m_utilIntBuffer_[] = new int[2];
/**
 * Maximum ISO comment length
 */
private int m_maxISOCommentLength_;
/**
 * Maximum name length
 */
private int m_maxNameLength_;
/**
 * Type names used for extended names.
 * NOTE(review): the table is indexed by the value returned from getType;
 * presumably the entries follow the UCharacterCategory constants in
 * order, followed by the three extended types declared below - verify
 * the order against UCharacterCategory.
 */
private static final String TYPE_NAMES_[] = {"unassigned",
                                             "uppercase letter",
                                             "lowercase letter",
                                             "titlecase letter",
                                             "modifier letter",
                                             "other letter",
                                             "non spacing mark",
                                             "enclosing mark",
                                             "combining spacing mark",
                                             "decimal digit number",
                                             "letter number",
                                             "other number",
                                             "space separator",
                                             "line separator",
                                             "paragraph separator",
                                             "control",
                                             "format",
                                             "private use area",
                                             "surrogate",
                                             "dash punctuation",
                                             "start punctuation",
                                             "end punctuation",
                                             "connector punctuation",
                                             "other punctuation",
                                             "math symbol",
                                             "currency symbol",
                                             "modifier symbol",
                                             "other symbol",
                                             "initial punctuation",
                                             "final punctuation",
                                             "noncharacter",
                                             "lead surrogate",
                                             "trail surrogate"};
/**
 * Unknown type name
 */
private static final String UNKNOWN_TYPE_NAME_ = "unknown";
/**
 * Not a character type
 */
private static final int NON_CHARACTER_
                                = UCharacterCategory.CHAR_CATEGORY_COUNT;
/**
 * Lead surrogate type
 */
private static final int LEAD_SURROGATE_
                              = UCharacterCategory.CHAR_CATEGORY_COUNT + 1;
/**
 * Trail surrogate type
 */
private static final int TRAIL_SURROGATE_
                              = UCharacterCategory.CHAR_CATEGORY_COUNT + 2;
/**
 * Extended category count
 */
static final int EXTENDED_CATEGORY_
                              = UCharacterCategory.CHAR_CATEGORY_COUNT + 3;
\r
1177 // private constructor ------------------------------------------------
\r
1180 * <p>Protected constructor for use in UCharacter.</p>
\r
1181 * @exception IOException thrown when data reading fails
\r
1183 private UCharacterName() throws IOException
\r
1185 InputStream is = ICUData.getRequiredStream(NAME_FILE_NAME_);
\r
1186 BufferedInputStream b = new BufferedInputStream(is, NAME_BUFFER_SIZE_);
\r
1187 UCharacterNameReader reader = new UCharacterNameReader(b);
\r
1188 reader.read(this);
\r
1192 // private methods ---------------------------------------------------
\r
1195 * Gets the algorithmic name for the argument character
\r
1196 * @param ch character to determine name for
\r
1197 * @param choice name choice
\r
1198 * @return the algorithmic name or null if not found
\r
1200 private String getAlgName(int ch, int choice)
\r
1202 /* Only the normative character name can be algorithmic. */
\r
1203 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME ||
\r
1204 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME
\r
1206 // index in terms integer index
\r
1207 synchronized (m_utilStringBuffer_) {
\r
1208 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
1210 for (int index = m_algorithm_.length - 1; index >= 0; index --)
\r
1212 if (m_algorithm_[index].contains(ch)) {
\r
1213 m_algorithm_[index].appendName(ch, m_utilStringBuffer_);
\r
1214 return m_utilStringBuffer_.toString();
\r
1223 * Getting the character with the tokenized argument name
\r
1224 * @param name of the character
\r
1225 * @return character with the tokenized argument name or -1 if character
\r
1228 private synchronized int getGroupChar(String name, int choice)
\r
1230 for (int i = 0; i < m_groupcount_; i ++) {
\r
1231 // populating the data set of grouptable
\r
1233 int startgpstrindex = getGroupLengths(i, m_groupoffsets_,
\r
1236 // shift out to function
\r
1237 int result = getGroupChar(startgpstrindex, m_grouplengths_, name,
\r
1239 if (result != -1) {
\r
1240 return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_)
\r
1248 * Compares and retrieve character if name is found within the argument
\r
1250 * @param index index where the set of names reside in the group block
\r
1251 * @param length list of lengths of the strings
\r
1252 * @param name character name to search for
\r
1253 * @param choice of either 1.0 or the most current unicode name
\r
1254 * @return relative character in the group which matches name, otherwise if
\r
1255 * not found, -1 will be returned
\r
1257 private int getGroupChar(int index, char length[], String name,
\r
1263 int namelen = name.length();
\r
1267 for (int result = 0; result <= LINES_PER_GROUP_; result ++) {
\r
1269 len = length[result];
\r
1271 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME &&
\r
1272 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME
\r
1275 * skip the modern name if it is not requested _and_
\r
1276 * if the semicolon byte value is a character, not a token number
\r
1278 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice;
\r
1280 int oldindex = index;
\r
1281 index += UCharacterUtility.skipByteSubString(m_groupstring_,
\r
1282 index, len, (byte)';');
\r
1283 len -= (index - oldindex);
\r
1284 } while(--fieldIndex>0);
\r
1287 // number of tokens is > the length of the name
\r
1288 // write each letter directly, and write a token word per token
\r
1289 for (count = 0; count < len && nindex != -1 && nindex < namelen;
\r
1291 b = m_groupstring_[index + count];
\r
1294 if (b >= m_tokentable_.length) {
\r
1295 if (name.charAt(nindex ++) != (b & 0xFF)) {
\r
1300 token = m_tokentable_[b & 0xFF];
\r
1301 if (token == 0xFFFE) {
\r
1302 // this is a lead byte for a double-byte token
\r
1303 token = m_tokentable_[b << 8 |
\r
1304 (m_groupstring_[index + count] & 0x00ff)];
\r
1307 if (token == 0xFFFF) {
\r
1308 if (name.charAt(nindex ++) != (b & 0xFF)) {
\r
1313 // compare token with name
\r
1314 nindex = UCharacterUtility.compareNullTermByteSubString(
\r
1315 name, m_tokenstring_, nindex, token);
\r
1320 if (namelen == nindex &&
\r
1321 (count == len || m_groupstring_[index + count] == ';')) {
\r
1331 * Gets the character extended type
\r
1332 * @param ch character to be tested
\r
1333 * @return extended type it is associated with
\r
1335 private static int getType(int ch)
\r
1337 if (UCharacterUtility.isNonCharacter(ch)) {
\r
1338 // not a character we return a invalid category count
\r
1339 return NON_CHARACTER_;
\r
1341 int result = UCharacter.getType(ch);
\r
1342 if (result == UCharacterCategory.SURROGATE) {
\r
1343 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
1344 result = LEAD_SURROGATE_;
\r
1347 result = TRAIL_SURROGATE_;
\r
1354 * Getting the character with extended name of the form <....>.
\r
1355 * @param name of the character to be found
\r
1356 * @param choice name choice
\r
1357 * @return character associated with the name, -1 if such character is not
\r
1358 * found and -2 if we should continue with the search.
\r
1360 private static int getExtendedChar(String name, int choice)
\r
1362 if (name.charAt(0) == '<') {
\r
1363 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
1364 int endIndex = name.length() - 1;
\r
1365 if (name.charAt(endIndex) == '>') {
\r
1366 int startIndex = name.lastIndexOf('-');
\r
1367 if (startIndex >= 0) { // We've got a category.
\r
1371 result = Integer.parseInt(
\r
1372 name.substring(startIndex, endIndex),
\r
1375 catch (NumberFormatException e) {
\r
1378 // Now validate the category name. We could use a
\r
1379 // binary search, or a trie, if we really wanted to.
\r
1380 String type = name.substring(1, startIndex - 1);
\r
1381 int length = TYPE_NAMES_.length;
\r
1382 for (int i = 0; i < length; ++ i) {
\r
1383 if (type.compareTo(TYPE_NAMES_[i]) == 0) {
\r
1384 if (getType(result) == i) {
\r
1398 // sets of name characters, maximum name lengths -----------------------
\r
1401 * Adds a codepoint into a set of ints.
\r
1402 * Equivalent to SET_ADD.
\r
1403 * @param set set to add to
\r
1404 * @param ch 16 bit char to add
\r
1406 private static void add(int set[], char ch)
\r
1408 set[ch >>> 5] |= 1 << (ch & 0x1f);
\r
1412 * Checks if a codepoint is a part of a set of ints.
\r
1413 * Equivalent to SET_CONTAINS.
\r
1414 * @param set set to check in
\r
1415 * @param ch 16 bit char to check
\r
1416 * @return true if codepoint is part of the set, false otherwise
\r
1418 private static boolean contains(int set[], char ch)
\r
1420 return (set[ch >>> 5] & (1 << (ch & 0x1f))) != 0;
\r
1424 * Adds all characters of the argument str and gets the length
\r
1425 * Equivalent to calcStringSetLength.
\r
1426 * @param set set to add all chars of str to
\r
1427 * @param str string to add
\r
1429 private static int add(int set[], String str)
\r
1431 int result = str.length();
\r
1433 for (int i = result - 1; i >= 0; i --) {
\r
1434 add(set, str.charAt(i));
\r
1440 * Adds all characters of the argument str and gets the length
\r
1441 * Equivalent to calcStringSetLength.
\r
1442 * @param set set to add all chars of str to
\r
1443 * @param str string to add
\r
1445 private static int add(int set[], StringBuffer str)
\r
1447 int result = str.length();
\r
1449 for (int i = result - 1; i >= 0; i --) {
\r
1450 add(set, str.charAt(i));
\r
1456 * Adds all algorithmic names into the name set.
\r
1457 * Equivalent to part of calcAlgNameSetsLengths.
\r
1458 * @param maxlength length to compare to
\r
1459 * @return the maximum length of any possible algorithmic name if it is >
\r
1460 * maxlength, otherwise maxlength is returned.
\r
1462 private int addAlgorithmName(int maxlength)
\r
1465 for (int i = m_algorithm_.length - 1; i >= 0; i --) {
\r
1466 result = m_algorithm_[i].add(m_nameSet_, maxlength);
\r
1467 if (result > maxlength) {
\r
1468 maxlength = result;
\r
1475 * Adds all extended names into the name set.
\r
1476 * Equivalent to part of calcExtNameSetsLengths.
\r
1477 * @param maxlength length to compare to
\r
1478 * @return the maxlength of any possible extended name.
\r
1480 private int addExtendedName(int maxlength)
\r
1482 for (int i = TYPE_NAMES_.length - 1; i >= 0; i --) {
\r
1483 // for each category, count the length of the category name
\r
1487 // 6 for most hex digits per code point
\r
1488 int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]);
\r
1489 if (length > maxlength) {
\r
1490 maxlength = length;
\r
1497 * Adds names of a group to the argument set.
\r
1498 * Equivalent to calcNameSetLength.
\r
1499 * @param offset of the group name string in byte count
\r
1500 * @param length of the group name string
\r
1501 * @param tokenlength array to store the length of each token
\r
1502 * @param set to add to
\r
1503 * @return the length of the name string and the length of the group
\r
1506 private int[] addGroupName(int offset, int length, byte tokenlength[],
\r
1509 int resultnlength = 0;
\r
1510 int resultplength = 0;
\r
1511 while (resultplength < length) {
\r
1512 char b = (char)(m_groupstring_[offset + resultplength] & 0xff);
\r
1518 if (b >= m_tokentable_.length) {
\r
1519 add(set, b); // implicit letter
\r
1523 char token = m_tokentable_[b & 0x00ff];
\r
1524 if (token == 0xFFFE) {
\r
1525 // this is a lead byte for a double-byte token
\r
1526 b = (char)(b << 8 | (m_groupstring_[offset + resultplength]
\r
1528 token = m_tokentable_[b];
\r
1531 if (token == 0xFFFF) {
\r
1536 // count token word
\r
1537 // use cached token length
\r
1538 byte tlength = tokenlength[b];
\r
1539 if (tlength == 0) {
\r
1540 synchronized (m_utilStringBuffer_) {
\r
1541 m_utilStringBuffer_.delete(0,
\r
1542 m_utilStringBuffer_.length());
\r
1543 UCharacterUtility.getNullTermByteSubString(
\r
1544 m_utilStringBuffer_, m_tokenstring_,
\r
1546 tlength = (byte)add(set, m_utilStringBuffer_);
\r
1548 tokenlength[b] = tlength;
\r
1550 resultnlength += tlength;
\r
1554 m_utilIntBuffer_[0] = resultnlength;
\r
1555 m_utilIntBuffer_[1] = resultplength;
\r
1556 return m_utilIntBuffer_;
\r
1560 * Adds names of all group to the argument set.
\r
1561 * Sets the data member m_max*Length_.
\r
1562 * Method called only once.
\r
1563 * Equivalent to calcGroupNameSetsLength.
\r
1564 * @param maxlength length to compare to
\r
1566 private void addGroupName(int maxlength)
\r
1568 int maxisolength = 0;
\r
1569 char offsets[] = new char[LINES_PER_GROUP_ + 2];
\r
1570 char lengths[] = new char[LINES_PER_GROUP_ + 2];
\r
1571 byte tokenlengths[] = new byte[m_tokentable_.length];
\r
1573 // enumerate all groups
\r
1574 // for (int i = m_groupcount_ - 1; i >= 0; i --) {
\r
1575 for (int i = 0; i < m_groupcount_ ; i ++) {
\r
1576 int offset = getGroupLengths(i, offsets, lengths);
\r
1577 // enumerate all lines in each group
\r
1578 // for (int linenumber = LINES_PER_GROUP_ - 1; linenumber >= 0;
\r
1579 // linenumber --) {
\r
1580 for (int linenumber = 0; linenumber < LINES_PER_GROUP_;
\r
1582 int lineoffset = offset + offsets[linenumber];
\r
1583 int length = lengths[linenumber];
\r
1584 if (length == 0) {
\r
1588 // read regular name
\r
1589 int parsed[] = addGroupName(lineoffset, length, tokenlengths,
\r
1591 if (parsed[0] > maxlength) {
\r
1592 // 0 for name length
\r
1593 maxlength = parsed[0];
\r
1595 lineoffset += parsed[1];
\r
1596 if (parsed[1] >= length) {
\r
1597 // 1 for parsed group string length
\r
1600 length -= parsed[1];
\r
1601 // read Unicode 1.0 name
\r
1602 parsed = addGroupName(lineoffset, length, tokenlengths,
\r
1604 if (parsed[0] > maxlength) {
\r
1605 // 0 for name length
\r
1606 maxlength = parsed[0];
\r
1608 lineoffset += parsed[1];
\r
1609 if (parsed[1] >= length) {
\r
1610 // 1 for parsed group string length
\r
1613 length -= parsed[1];
\r
1614 // read ISO comment
\r
1615 parsed = addGroupName(lineoffset, length, tokenlengths,
\r
1616 m_ISOCommentSet_);
\r
1617 if (parsed[1] > maxisolength) {
\r
1618 maxisolength = length;
\r
1623 // set gMax... - name length last for threading
\r
1624 m_maxISOCommentLength_ = maxisolength;
\r
1625 m_maxNameLength_ = maxlength;
\r
1629 * Sets up the name sets and the calculation of the maximum lengths.
\r
1630 * Equivalent to calcNameSetsLengths.
\r
1632 private boolean initNameSetsLengths()
\r
1634 if (m_maxNameLength_ > 0) {
\r
1638 String extra = "0123456789ABCDEF<>-";
\r
1639 // set hex digits, used in various names, and <>-, used in extended
\r
1641 for (int i = extra.length() - 1; i >= 0; i --) {
\r
1642 add(m_nameSet_, extra.charAt(i));
\r
1645 // set sets and lengths from algorithmic names
\r
1646 m_maxNameLength_ = addAlgorithmName(0);
\r
1647 // set sets and lengths from extended names
\r
1648 m_maxNameLength_ = addExtendedName(m_maxNameLength_);
\r
1649 // set sets and lengths from group names, set global maximum values
\r
1650 addGroupName(m_maxNameLength_);
\r
1655 * Converts the char set cset into a Unicode set uset.
\r
1656 * Equivalent to charSetToUSet.
\r
1657 * @param set Set of 256 bit flags corresponding to a set of chars.
\r
1658 * @param uset USet to receive characters. Existing contents are deleted.
\r
1660 private void convert(int set[], UnicodeSet uset)
\r
1663 if (!initNameSetsLengths()) {
\r
1667 // build a char string with all chars that are used in character names
\r
1668 for (char c = 255; c > 0; c --) {
\r
1669 if (contains(set, c)) {
\r