2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.InputStream;
\r
10 import java.io.BufferedInputStream;
\r
11 import java.io.IOException;
\r
12 import java.util.MissingResourceException;
\r
14 import com.ibm.icu.text.UTF16;
\r
15 import com.ibm.icu.text.UnicodeSet;
\r
16 import com.ibm.icu.lang.UCharacter;
\r
17 import com.ibm.icu.lang.UCharacterCategory;
\r
20 * Internal class to manage character names.
\r
21 * Since data for names are stored
\r
22 * in an array of char, by default indexes used in this class refer to
\r
23 * a 2 byte count, unless otherwise stated. In cases where the index refers
\r
24 * to a byte count, the index is halved and depending on whether the index is
\r
25 * even or odd, the MSB or LSB of the result char at the halved index is
\r
26 * returned. For indexes to an array of int, the index is multiplied by 2,
\r
27 * result char at the multiplied index and its following char is returned as an
\r
29 * <a href=../lang/UCharacter.html>UCharacter</a> acts as a public facade for this class
\r
30 * Note : 0 - 0x1F are control characters without names in Unicode 3.0
\r
31 * @author Syn Wee Quek
\r
35 public final class UCharacterName
\r
37 // public data members ----------------------------------------------
\r
40 * Number of lines per group
\r
43 public static final int LINES_PER_GROUP_ = 1 << 5;
\r
45 * Maximum number of groups
\r
47 public int m_groupcount_ = 0;
\r
49 // public methods ---------------------------------------------------
\r
52 * Gets the only instance of UCharacterName
\r
53 * @return only instance of UCharacterName
\r
54 * @exception MissingResourceException thrown when reading of name data fails
\r
56 public static UCharacterName getInstance()
\r
58 if (INSTANCE_ == null) {
\r
60 INSTANCE_ = new UCharacterName();
\r
61 }catch(IOException e){
\r
62 throw new MissingResourceException("Could not construct UCharacterName. Missing unames.icu","","");
\r
64 catch (Exception e) {
\r
65 throw new MissingResourceException(e.getMessage(),"","");
\r
72 * Retrieve the name of a Unicode code point.
\r
73 * Depending on <code>choice</code>, the character name written into the
\r
74 * buffer is the "modern" name or the name that was defined in Unicode
\r
76 * The name contains only "invariant" characters
\r
77 * like A-Z, 0-9, space, and '-'.
\r
79 * @param ch the code point for which to get the name.
\r
80 * @param choice Selector for which name to get.
\r
81 * @return if code point is above 0x1fff, null is returned
\r
83 public String getName(int ch, int choice)
\r
85 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE ||
\r
86 choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) {
\r
90 String result = null;
\r
92 result = getAlgName(ch, choice);
\r
94 // getting normal character name
\r
95 if (result == null || result.length() == 0) {
\r
96 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
97 result = getExtendedName(ch);
\r
99 result = getGroupName(ch, choice);
\r
107 * Find a character by its name and return its code point value
\r
108 * @param choice selector to indicate if argument name is a Unicode 1.0
\r
109 * or the most current version
\r
110 * @param name the name to search for
\r
111 * @return code point
\r
113 public int getCharFromName(int choice, String name)
\r
115 // checks for illegal arguments
\r
116 if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT ||
\r
117 name == null || name.length() == 0) {
\r
121 // try extended names first
\r
122 int result = getExtendedChar(name.toLowerCase(), choice);
\r
123 if (result >= -1) {
\r
127 String upperCaseName = name.toUpperCase();
\r
128 // try algorithmic names first, if fails then try group names
\r
129 // int result = getAlgorithmChar(choice, uppercasename);
\r
131 if (choice != UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
\r
133 if (m_algorithm_ != null) {
\r
134 count = m_algorithm_.length;
\r
136 for (count --; count >= 0; count --) {
\r
137 result = m_algorithm_[count].getChar(upperCaseName);
\r
144 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
145 result = getGroupChar(upperCaseName,
\r
146 UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
147 if (result == -1) {
\r
148 result = getGroupChar(upperCaseName,
\r
149 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
153 result = getGroupChar(upperCaseName, choice);
\r
158 // these are all UCharacterNameIterator use methods -------------------
\r
161 * Reads a block of compressed lengths of 32 strings and expands them into
\r
162 * offsets and lengths for each string. Lengths are stored with a
\r
163 * variable-width encoding in consecutive nibbles:
\r
164 * If a nibble<0xc, then it is the length itself (0 = empty string).
\r
165 * If a nibble>=0xc, then it forms a length value with the following
\r
167 * The offsets and lengths arrays must be at least 33 (one more) long
\r
168 * because there is no check here at the end if the last nibble is still
\r
170 * @param index of group string object in array
\r
171 * @param offsets array to store the value of the string offsets
\r
172 * @param lengths array to store the value of the string length
\r
173 * @return next index of the data string immediately after the lengths
\r
174 * in terms of byte address
\r
176 public int getGroupLengths(int index, char offsets[], char lengths[])
\r
178 char length = 0xffff;
\r
182 index = index * m_groupsize_; // byte count offsets of group strings
\r
183 int stringoffset = UCharacterUtility.toInt(
\r
184 m_groupinfo_[index + OFFSET_HIGH_OFFSET_],
\r
185 m_groupinfo_[index + OFFSET_LOW_OFFSET_]);
\r
189 // all 32 lengths must be read to get the offset of the first group
\r
191 for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) {
\r
192 b = m_groupstring_[stringoffset];
\r
195 while (shift >= 0) {
\r
197 n = (byte)((b >> shift) & 0x0F);
\r
198 if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
\r
199 length = (char)((n - 12) << 4);
\r
202 if (length != 0xffff) {
\r
203 lengths[i] = (char)((length | n) + 12);
\r
206 lengths[i] = (char)n;
\r
209 if (i < LINES_PER_GROUP_) {
\r
210 offsets[i + 1] = (char)(offsets[i] + lengths[i]);
\r
220 return stringoffset;
\r
224 * Gets the name of the argument group index.
\r
225 * UnicodeData.txt uses ';' as a field separator, so no field can contain
\r
226 * ';' as part of its contents. In unames.icu, it is marked as
\r
227 * token[';'] == -1 only if the semicolon is used in the data file - which
\r
228 * is iff we have Unicode 1.0 names or ISO comments.
\r
229 * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments
\r
230 * although we know that it will never be part of a name.
\r
231 * Equivalent to ICU4C's expandName.
\r
232 * @param index of the group name string in byte count
\r
233 * @param length of the group name string
\r
234 * @param choice of Unicode 1.0 name or the most current name
\r
235 * @return name of the group
\r
237 public String getGroupName(int index, int length, int choice)
\r
239 if (choice == UCharacterNameChoice.UNICODE_10_CHAR_NAME
\r
240 || choice == UCharacterNameChoice.ISO_COMMENT_) {
\r
241 if (';' >= m_tokentable_.length || m_tokentable_[';'] == 0xFFFF) {
\r
242 // skip the modern name
\r
243 int oldindex = index;
\r
244 index += UCharacterUtility.skipByteSubString(m_groupstring_,
\r
245 index, length, (byte)';');
\r
246 length -= (index - oldindex);
\r
247 if (choice == UCharacterNameChoice.ISO_COMMENT_) {
\r
248 // skips the 1.0 Name to the iso comment part
\r
250 index += UCharacterUtility.skipByteSubString(m_groupstring_,
\r
251 index, length, (byte)';');
\r
252 length -= (index - oldindex);
\r
256 // the semicolon byte is a token number, therefore only modern
\r
257 // names are stored in unames.dat and there is no such
\r
258 // requested Unicode 1.0 name here
\r
263 synchronized (m_utilStringBuffer_) {
\r
264 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
267 for (int i = 0; i < length;) {
\r
268 b = m_groupstring_[index + i];
\r
271 if (b >= m_tokentable_.length) {
\r
275 m_utilStringBuffer_.append(b); // implicit letter
\r
278 token = m_tokentable_[b & 0x00ff];
\r
279 if (token == 0xFFFE) {
\r
280 // this is a lead byte for a double-byte token
\r
281 token = m_tokentable_[b << 8 |
\r
282 (m_groupstring_[index + i] & 0x00ff)];
\r
285 if (token == 0xFFFF) {
\r
287 // skip the semicolon if we are seeking extended
\r
288 // names and there was no 2.0 name but there
\r
290 if (m_utilStringBuffer_.length() == 0 && choice ==
\r
291 UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
297 m_utilStringBuffer_.append((char)(b & 0x00ff));
\r
299 else { // write token word
\r
300 UCharacterUtility.getNullTermByteSubString(
\r
301 m_utilStringBuffer_, m_tokenstring_, token);
\r
306 if (m_utilStringBuffer_.length() > 0) {
\r
307 return m_utilStringBuffer_.toString();
\r
314 * Retrieves the extended name
\r
316 public String getExtendedName(int ch)
\r
318 String result = getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME);
\r
319 if (result == null) {
\r
320 if (getType(ch) == UCharacterCategory.CONTROL) {
\r
321 result = getName(ch,
\r
322 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
324 if (result == null) {
\r
325 result = getExtendedOr10Name(ch);
\r
332 * Gets the group index for the codepoint, or the group before it.
\r
334 * @return group index containing codepoint or the group before it.
\r
336 public int getGroup(int codepoint)
\r
338 int endGroup = m_groupcount_;
\r
339 int msb = getCodepointMSB(codepoint);
\r
341 // binary search for the group of names that contains the one for
\r
343 // find the group that contains codepoint, or the highest before it
\r
344 while (result < endGroup - 1) {
\r
345 int gindex = (result + endGroup) >> 1;
\r
346 if (msb < getGroupMSB(gindex)) {
\r
357 * Gets the extended and 1.0 name when the most current unicode names
\r
359 * @param ch codepoint
\r
360 * @return name of codepoint extended or 1.0
\r
362 public String getExtendedOr10Name(int ch)
\r
364 String result = null;
\r
365 if (getType(ch) == UCharacterCategory.CONTROL) {
\r
366 result = getName(ch,
\r
367 UCharacterNameChoice.UNICODE_10_CHAR_NAME);
\r
369 if (result == null) {
\r
370 int type = getType(ch);
\r
371 // Return unknown if the table of names above is not up to
\r
373 if (type >= TYPE_NAMES_.length) {
\r
374 result = UNKNOWN_TYPE_NAME_;
\r
377 result = TYPE_NAMES_[type];
\r
379 synchronized (m_utilStringBuffer_) {
\r
380 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
381 m_utilStringBuffer_.append('<');
\r
382 m_utilStringBuffer_.append(result);
\r
383 m_utilStringBuffer_.append('-');
\r
384 String chStr = Integer.toHexString(ch).toUpperCase();
\r
385 int zeros = 4 - chStr.length();
\r
386 while (zeros > 0) {
\r
387 m_utilStringBuffer_.append('0');
\r
390 m_utilStringBuffer_.append(chStr);
\r
391 m_utilStringBuffer_.append('>');
\r
392 result = m_utilStringBuffer_.toString();
\r
399 * Gets the MSB from the group index
\r
400 * @param gindex group index
\r
401 * @return the MSB of the group if gindex is valid, -1 otherwise
\r
403 public int getGroupMSB(int gindex)
\r
405 if (gindex >= m_groupcount_) {
\r
408 return m_groupinfo_[gindex * m_groupsize_];
\r
412 * Gets the MSB of the codepoint
\r
414 * @return the MSB of the codepoint
\r
416 public static int getCodepointMSB(int codepoint)
\r
418 return codepoint >> GROUP_SHIFT_;
\r
422 * Gets the maximum codepoint + 1 of the group
\r
423 * @param msb most significant byte of the group
\r
424 * @return limit codepoint of the group
\r
426 public static int getGroupLimit(int msb)
\r
428 return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_;
\r
432 * Gets the minimum codepoint of the group
\r
433 * @param msb most significant byte of the group
\r
434 * @return minimum codepoint of the group
\r
436 public static int getGroupMin(int msb)
\r
438 return msb << GROUP_SHIFT_;
\r
442 * Gets the offset to a group
\r
444 * @return offset to a group
\r
446 public static int getGroupOffset(int codepoint)
\r
448 return codepoint & GROUP_MASK_;
\r
452 * Gets the minimum codepoint of a group
\r
454 * @return minimum codepoint in the group which codepoint belongs to
\r
457 public static int getGroupMinFromCodepoint(int codepoint)
\r
459 return codepoint & ~GROUP_MASK_;
\r
464 * Get the Algorithm range length
\r
465 * @return Algorithm range length
\r
467 public int getAlgorithmLength()
\r
469 return m_algorithm_.length;
\r
473 * Gets the start of the range
\r
474 * @param index algorithm index
\r
475 * @return algorithm range start
\r
477 public int getAlgorithmStart(int index)
\r
479 return m_algorithm_[index].m_rangestart_;
\r
483 * Gets the end of the range
\r
484 * @param index algorithm index
\r
485 * @return algorithm range end
\r
487 public int getAlgorithmEnd(int index)
\r
489 return m_algorithm_[index].m_rangeend_;
\r
493 * Gets the Algorithmic name of the codepoint
\r
494 * @param index algorithmic range index
\r
496 * @return algorithmic name of codepoint
\r
498 public String getAlgorithmName(int index, int codepoint)
\r
500 String result = null;
\r
501 synchronized (m_utilStringBuffer_) {
\r
502 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
503 m_algorithm_[index].appendName(codepoint, m_utilStringBuffer_);
\r
504 result = m_utilStringBuffer_.toString();
\r
510 * Gets the group name of the character
\r
511 * @param ch character to get the group name
\r
512 * @param choice name choice selector to choose a unicode 1.0 or newer name
\r
514 public synchronized String getGroupName(int ch, int choice)
\r
517 int msb = getCodepointMSB(ch);
\r
518 int group = getGroup(ch);
\r
520 // return this if it is an exact match
\r
521 if (msb == m_groupinfo_[group * m_groupsize_]) {
\r
522 int index = getGroupLengths(group, m_groupoffsets_,
\r
524 int offset = ch & GROUP_MASK_;
\r
525 return getGroupName(index + m_groupoffsets_[offset],
\r
526 m_grouplengths_[offset], choice);
\r
532 // these are transliterator use methods ---------------------------------
\r
535 * Gets the maximum length of any codepoint name.
\r
536 * Equivalent to uprv_getMaxCharNameLength.
\r
537 * @return the maximum length of any codepoint name
\r
539 public int getMaxCharNameLength()
\r
541 if (initNameSetsLengths()) {
\r
542 return m_maxNameLength_;
\r
550 * Gets the maximum length of any iso comments.
\r
551 * Equivalent to uprv_getMaxISOCommentLength.
\r
552 * @return the maximum length of any codepoint name
\r
555 public int getMaxISOCommentLength()
\r
557 if (initNameSetsLengths()) {
\r
558 return m_maxISOCommentLength_;
\r
567 * Fills set with characters that are used in Unicode character names.
\r
568 * Equivalent to uprv_getCharNameCharacters.
\r
569 * @param set USet to receive characters. Existing contents are deleted.
\r
571 public void getCharNameCharacters(UnicodeSet set)
\r
573 convert(m_nameSet_, set);
\r
577 * Fills set with characters that are used in ISO comments.
\r
578 * Equivalent to uprv_getISOCommentCharacters.
\r
579 * @param set USet to receive characters. Existing contents are deleted.
\r
582 public void getISOCommentCharacters(UnicodeSet set)
\r
584 convert(m_ISOCommentSet_, set);
\r
588 // package private inner class --------------------------------------
\r
591 * Algorithmic name class
\r
593 static final class AlgorithmName
\r
595 // package private data members ----------------------------------
\r
598 * Constant type value of the different AlgorithmName
\r
600 static final int TYPE_0_ = 0;
\r
601 static final int TYPE_1_ = 1;
\r
603 // package private constructors ----------------------------------
\r
612 // package private methods ---------------------------------------
\r
615 * Sets the information for accessing the algorithmic names
\r
616 * @param rangestart starting code point that lies within this name group
\r
617 * @param rangeend end code point that lies within this name group
\r
618 * @param type algorithm type. There's 2 kinds of algorithmic type. First
\r
619 * which uses code point as part of its name and the other uses
\r
620 * variant postfix strings
\r
621 * @param variant algorithmic variant
\r
622 * @return true if values are valid
\r
624 boolean setInfo(int rangestart, int rangeend, byte type, byte variant)
\r
626 if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend
\r
627 && rangeend <= UCharacter.MAX_VALUE &&
\r
628 (type == TYPE_0_ || type == TYPE_1_)) {
\r
629 m_rangestart_ = rangestart;
\r
630 m_rangeend_ = rangeend;
\r
632 m_variant_ = variant;
\r
639 * Sets the factor data
\r
640 * @param factor Array of factor
\r
641 * @return true if factors are valid
\r
643 boolean setFactor(char factor[])
\r
645 if (factor.length == m_variant_) {
\r
646 m_factor_ = factor;
\r
653 * Sets the name prefix
\r
655 * @return true if prefix is set
\r
657 boolean setPrefix(String prefix)
\r
659 if (prefix != null && prefix.length() > 0) {
\r
660 m_prefix_ = prefix;
\r
667 * Sets the variant factorized name data
\r
668 * @param string variant factorized name data
\r
669 * @return true if values are set
\r
671 boolean setFactorString(byte string[])
\r
673 // factor and variant string can be empty for things like
\r
674 // Hangul code points
\r
675 m_factorstring_ = string;
\r
680 * Checks if code point lies in Algorithm object at index
\r
681 * @param ch code point
\r
683 boolean contains(int ch)
\r
685 return m_rangestart_ <= ch && ch <= m_rangeend_;
\r
689 * Appends algorithm name of code point into StringBuffer.
\r
690 * Note this method does not check for validity of code point in Algorithm,
\r
691 * result is undefined if code point does not belong in Algorithm.
\r
692 * @param ch code point
\r
693 * @param str StringBuffer to append to
\r
695 void appendName(int ch, StringBuffer str)
\r
697 str.append(m_prefix_);
\r
701 // prefix followed by hex digits indicating variants
\r
702 Utility.hex(ch, m_variant_, str);
\r
705 // prefix followed by factorized-elements
\r
706 int offset = ch - m_rangestart_;
\r
707 int indexes[] = m_utilIntBuffer_;
\r
710 // write elements according to the factors
\r
711 // the factorized elements are determined by modulo
\r
713 synchronized (m_utilIntBuffer_) {
\r
714 for (int i = m_variant_ - 1; i > 0; i --)
\r
716 factor = m_factor_[i] & 0x00FF;
\r
717 indexes[i] = offset % factor;
\r
721 // we don't need to calculate the last modulus because
\r
722 // start <= code <= end guarantees here that
\r
723 // code <= factors[0]
\r
724 indexes[0] = offset;
\r
726 // joining up the factorized strings
\r
727 str.append(getFactorString(indexes, m_variant_));
\r
734 * Gets the character for the argument algorithmic name
\r
735 * @return the algorithmic char or -1 otherwise.
\r
737 int getChar(String name)
\r
739 int prefixlen = m_prefix_.length();
\r
740 if (name.length() < prefixlen ||
\r
741 !m_prefix_.equals(name.substring(0, prefixlen))) {
\r
750 int result = Integer.parseInt(name.substring(prefixlen),
\r
752 // does it fit into the range?
\r
753 if (m_rangestart_ <= result && result <= m_rangeend_) {
\r
757 catch (NumberFormatException e)
\r
763 // repetitive suffix name comparison done here
\r
764 // offset is the character code - start
\r
765 for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++)
\r
767 int offset = ch - m_rangestart_;
\r
768 int indexes[] = m_utilIntBuffer_;
\r
771 // write elements according to the factors
\r
772 // the factorized elements are determined by modulo
\r
774 synchronized (m_utilIntBuffer_) {
\r
775 for (int i = m_variant_ - 1; i > 0; i --)
\r
777 factor = m_factor_[i] & 0x00FF;
\r
778 indexes[i] = offset % factor;
\r
782 // we don't need to calculate the last modulus
\r
783 // because start <= code <= end guarantees here that
\r
784 // code <= factors[0]
\r
785 indexes[0] = offset;
\r
787 // joining up the factorized strings
\r
788 if (compareFactorString(indexes, m_variant_, name,
\r
800 * Adds all chars in the set of algorithmic names into the set.
\r
801 * Equivalent to part of calcAlgNameSetsLengths.
\r
802 * @param set int set to add the chars of the algorithm names into
\r
803 * @param maxlength maximum length to compare to
\r
804 * @return the length that is either maxlength of the length of this
\r
805 * algorithm name if it is longer than maxlength
\r
807 int add(int set[], int maxlength)
\r
810 int length = UCharacterName.add(set, m_prefix_);
\r
813 // name = prefix + (range->variant times) hex-digits
\r
815 length += m_variant_;
\r
817 * addString(set, (const char *)(range + 1))
\r
818 + range->variant;*/
\r
822 // name = prefix factorized-elements
\r
823 // get the set and maximum factor suffix length for each
\r
825 for (int i = m_variant_ - 1; i > 0; i --)
\r
827 int maxfactorlength = 0;
\r
829 for (int factor = m_factor_[i]; factor > 0; -- factor) {
\r
830 synchronized (m_utilStringBuffer_) {
\r
831 m_utilStringBuffer_.delete(0,
\r
832 m_utilStringBuffer_.length());
\r
834 = UCharacterUtility.getNullTermByteSubString(
\r
835 m_utilStringBuffer_,
\r
836 m_factorstring_, count);
\r
837 UCharacterName.add(set, m_utilStringBuffer_);
\r
838 if (m_utilStringBuffer_.length()
\r
842 = m_utilStringBuffer_.length();
\r
846 length += maxfactorlength;
\r
850 if (length > maxlength) {
\r
856 // private data members ------------------------------------------
\r
859 * Algorithmic data information
\r
861 private int m_rangestart_;
\r
862 private int m_rangeend_;
\r
863 private byte m_type_;
\r
864 private byte m_variant_;
\r
865 private char m_factor_[];
\r
866 private String m_prefix_;
\r
867 private byte m_factorstring_[];
\r
869 * Utility StringBuffer
\r
871 private StringBuffer m_utilStringBuffer_ = new StringBuffer();
\r
873 * Utility int buffer
\r
875 private int m_utilIntBuffer_[] = new int[256];
\r
877 // private methods -----------------------------------------------
\r
880 * Gets the indexth string in each of the argument factor block
\r
881 * @param index array with each index corresponding to each factor block
\r
882 * @param length length of the array index
\r
883 * @return the combined string of the array of indexth factor string in
\r
886 private String getFactorString(int index[], int length)
\r
888 int size = m_factor_.length;
\r
889 if (index == null || length != size) {
\r
893 synchronized (m_utilStringBuffer_) {
\r
894 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
898 for (int i = 0; i <= size; i ++) {
\r
899 factor = m_factor_[i];
\r
900 count = UCharacterUtility.skipNullTermByteSubString(
\r
901 m_factorstring_, count, index[i]);
\r
902 count = UCharacterUtility.getNullTermByteSubString(
\r
903 m_utilStringBuffer_, m_factorstring_,
\r
906 count = UCharacterUtility.skipNullTermByteSubString(
\r
907 m_factorstring_, count,
\r
908 factor - index[i] - 1);
\r
911 return m_utilStringBuffer_.toString();
\r
916 * Compares the indexth string in each of the argument factor block with
\r
917 * the argument string
\r
918 * @param index array with each index corresponding to each factor block
\r
919 * @param length index array length
\r
920 * @param str string to compare with
\r
921 * @param offset of str to start comparison
\r
922 * @return true if string matches
\r
924 private boolean compareFactorString(int index[], int length, String str,
\r
927 int size = m_factor_.length;
\r
928 if (index == null || length != size)
\r
932 int strcount = offset;
\r
935 for (int i = 0; i <= size; i ++)
\r
937 factor = m_factor_[i];
\r
938 count = UCharacterUtility.skipNullTermByteSubString(
\r
939 m_factorstring_, count, index[i]);
\r
940 strcount = UCharacterUtility.compareNullTermByteSubString(str,
\r
941 m_factorstring_, strcount, count);
\r
942 if (strcount < 0) {
\r
947 count = UCharacterUtility.skipNullTermByteSubString(
\r
948 m_factorstring_, count, factor - index[i]);
\r
951 if (strcount != str.length()) {
\r
958 // package private data members --------------------------------------
\r
961 * Size of each groups
\r
963 int m_groupsize_ = 0;
\r
965 // package private methods --------------------------------------------
\r
968 * Sets the token data
\r
969 * @param token array of tokens
\r
970 * @param tokenstring array of string values of the tokens
\r
971 * @return false if there is a data error
\r
973 boolean setToken(char token[], byte tokenstring[])
\r
975 if (token != null && tokenstring != null && token.length > 0 &&
\r
976 tokenstring.length > 0) {
\r
977 m_tokentable_ = token;
\r
978 m_tokenstring_ = tokenstring;
\r
985 * Set the algorithm name information array
\r
986 * @param alg Algorithm information array
\r
987 * @return true if the group string offset has been set correctly
\r
989 boolean setAlgorithm(AlgorithmName alg[])
\r
991 if (alg != null && alg.length != 0) {
\r
992 m_algorithm_ = alg;
\r
999 * Sets the number of group and size of each group in number of char
\r
1000 * @param count number of groups
\r
1001 * @param size size of group in char
\r
1002 * @return true if group size is set correctly
\r
1004 boolean setGroupCountSize(int count, int size)
\r
1006 if (count <= 0 || size <= 0) {
\r
1009 m_groupcount_ = count;
\r
1010 m_groupsize_ = size;
\r
1015 * Sets the group name data
\r
1016 * @param group index information array
\r
1017 * @param groupstring name information array
\r
1018 * @return false if there is a data error
\r
1020 boolean setGroup(char group[], byte groupstring[])
\r
1022 if (group != null && groupstring != null && group.length > 0 &&
\r
1023 groupstring.length > 0) {
\r
1024 m_groupinfo_ = group;
\r
1025 m_groupstring_ = groupstring;
\r
1031 // private data members ----------------------------------------------
\r
1034 * Data used in unames.icu
\r
1036 private char m_tokentable_[];
\r
1037 private byte m_tokenstring_[];
\r
1038 private char m_groupinfo_[];
\r
1039 private byte m_groupstring_[];
\r
1040 private AlgorithmName m_algorithm_[];
\r
1043 * Group use. Note - access must be synchronized.
\r
1045 private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1];
\r
1046 private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1];
\r
1049 * Default name of the name datafile
\r
1051 private static final String NAME_FILE_NAME_ = ICUResourceBundle.ICU_BUNDLE+"/unames.icu";
\r
1053 * Shift count to retrieve group information
\r
1055 private static final int GROUP_SHIFT_ = 5;
\r
1057 * Mask to retrieve the offset for a particular character within a group
\r
1059 private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
\r
1061 * Default buffer size of datafile
\r
1063 private static final int NAME_BUFFER_SIZE_ = 100000;
\r
1066 * Position of offsethigh in group information array
\r
1068 private static final int OFFSET_HIGH_OFFSET_ = 1;
\r
1071 * Position of offsetlow in group information array
\r
1073 private static final int OFFSET_LOW_OFFSET_ = 2;
\r
1075 * Double nibble indicator, any nibble > this number has to be combined
\r
1076 * with its following nibble
\r
1078 private static final int SINGLE_NIBBLE_MAX_ = 11;
\r
1081 * Maximum length of character names (regular & 1.0).
\r
1083 //private static int MAX_NAME_LENGTH_ = 0;
\r
1085 * Maximum length of ISO comments.
\r
1087 //private static int MAX_ISO_COMMENT_LENGTH_ = 0;
\r
1090 * Set of chars used in character names (regular & 1.0).
\r
1091 * Chars are platform-dependent (can be EBCDIC).
\r
1093 private int m_nameSet_[] = new int[8];
\r
1095 * Set of chars used in ISO comments. (regular & 1.0).
\r
1096 * Chars are platform-dependent (can be EBCDIC).
\r
1098 private int m_ISOCommentSet_[] = new int[8];
\r
1100 * Utility StringBuffer
\r
1102 private StringBuffer m_utilStringBuffer_ = new StringBuffer();
\r
1104 * Utility int buffer
\r
1106 private int m_utilIntBuffer_[] = new int[2];
\r
1108 * Maximum ISO comment length
\r
1110 private int m_maxISOCommentLength_;
\r
1112 * Maximum name length
\r
1114 private int m_maxNameLength_;
\r
1116 * Singleton instance
\r
1118 private static UCharacterName INSTANCE_ = null;
\r
1120 * Type names used for extended names
\r
1122 private static final String TYPE_NAMES_[] = {"unassigned",
\r
1123 "uppercase letter",
\r
1124 "lowercase letter",
\r
1125 "titlecase letter",
\r
1126 "modifier letter",
\r
1128 "non spacing mark",
\r
1130 "combining spacing mark",
\r
1131 "decimal digit number",
\r
1134 "space separator",
\r
1136 "paragraph separator",
\r
1139 "private use area",
\r
1141 "dash punctuation",
\r
1142 "start punctuation",
\r
1143 "end punctuation",
\r
1144 "connector punctuation",
\r
1145 "other punctuation",
\r
1147 "currency symbol",
\r
1148 "modifier symbol",
\r
1150 "initial punctuation",
\r
1151 "final punctuation",
\r
1154 "trail surrogate"};
\r
1156 * Unknown type name
\r
1158 private static final String UNKNOWN_TYPE_NAME_ = "unknown";
\r
1160 * Not a character type
\r
1162 private static final int NON_CHARACTER_
\r
1163 = UCharacterCategory.CHAR_CATEGORY_COUNT;
\r
1165 * Lead surrogate type
\r
1167 private static final int LEAD_SURROGATE_
\r
1168 = UCharacterCategory.CHAR_CATEGORY_COUNT + 1;
\r
1170 * Trail surrogate type
\r
1172 private static final int TRAIL_SURROGATE_
\r
1173 = UCharacterCategory.CHAR_CATEGORY_COUNT + 2;
\r
1175 * Extended category count
\r
1177 static final int EXTENDED_CATEGORY_
\r
1178 = UCharacterCategory.CHAR_CATEGORY_COUNT + 3;
\r
1180 // private constructor ------------------------------------------------
\r
1183 * <p>Protected constructor for use in UCharacter.</p>
\r
1184 * @exception IOException thrown when data reading fails
\r
1186 private UCharacterName() throws IOException
\r
1188 InputStream is = ICUData.getRequiredStream(NAME_FILE_NAME_);
\r
1189 BufferedInputStream b = new BufferedInputStream(is, NAME_BUFFER_SIZE_);
\r
1190 UCharacterNameReader reader = new UCharacterNameReader(b);
\r
1191 reader.read(this);
\r
1195 // private methods ---------------------------------------------------
\r
1198 * Gets the algorithmic name for the argument character
\r
1199 * @param ch character to determine name for
\r
1200 * @param choice name choice
\r
1201 * @return the algorithmic name or null if not found
\r
1203 private String getAlgName(int ch, int choice)
\r
1205 // Do not write algorithmic Unicode 1.0 names because Unihan names are
\r
1206 // the same as the modern ones, extension A was only introduced with
\r
1207 // Unicode 3.0, and the Hangul syllable block was moved and changed
\r
1208 // around Unicode 1.1.5.
\r
1209 if (choice != UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
\r
1210 // index in terms integer index
\r
1211 synchronized (m_utilStringBuffer_) {
\r
1212 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
\r
1214 for (int index = m_algorithm_.length - 1; index >= 0; index --)
\r
1216 if (m_algorithm_[index].contains(ch)) {
\r
1217 m_algorithm_[index].appendName(ch, m_utilStringBuffer_);
\r
1218 return m_utilStringBuffer_.toString();
\r
1227 * Getting the character with the tokenized argument name
\r
1228 * @param name of the character
\r
1229 * @return character with the tokenized argument name or -1 if character
\r
1232 private synchronized int getGroupChar(String name, int choice)
\r
1234 for (int i = 0; i < m_groupcount_; i ++) {
\r
1235 // populating the data set of grouptable
\r
1237 int startgpstrindex = getGroupLengths(i, m_groupoffsets_,
\r
1240 // shift out to function
\r
1241 int result = getGroupChar(startgpstrindex, m_grouplengths_, name,
\r
1243 if (result != -1) {
\r
1244 return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_)
\r
1252 * Compares and retrieve character if name is found within the argument
\r
1254 * @param index index where the set of names reside in the group block
\r
1255 * @param length list of lengths of the strings
\r
1256 * @param name character name to search for
\r
1257 * @param choice of either 1.0 or the most current unicode name
\r
1258 * @return relative character in the group which matches name, otherwise if
\r
1259 * not found, -1 will be returned
\r
// Compares name against each stored line of one group. A stored line is a
// byte sequence in m_groupstring_ in which a byte is either a literal
// character or an index into m_tokentable_ that selects a word in
// m_tokenstring_. The loop variable "result" is the line number inside the
// group and is what a successful match reports.
1261 private int getGroupChar(int index, char length[], String name,
\r
1267 int namelen = name.length();
\r
// NOTE(review): the bound is inclusive, so length[LINES_PER_GROUP_] is read
// as well; confirm the lengths array has LINES_PER_GROUP_ + 1 entries or more.
1271 for (int result = 0; result <= LINES_PER_GROUP_; result ++) {
\r
1273 len = length[result];
\r
// For the Unicode 1.0 name, first step over the leading field up to the ';'
// separator and reduce len by the number of bytes stepped over.
// (assumes skipByteSubString returns the number of bytes skipped -- confirm)
1275 if (choice == UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
\r
1276 int oldindex = index;
\r
1277 index += UCharacterUtility.skipByteSubString(m_groupstring_,
\r
1278 index, len, (byte)';');
\r
1279 len -= (index - oldindex);
\r
1282 // number of tokens is > the length of the name
\r
1283 // write each letter directly, and write a token word per token
\r
// the scan stops at the end of the stored line, at the end of name, or as
// soon as nindex has become -1
1284 for (count = 0; count < len && nindex != -1 && nindex < namelen;
\r
1286 b = m_groupstring_[index + count];
\r
// A value outside the token table stands for itself and is compared with
// the next character of name as a literal.
// NOTE(review): the declaration of b is not visible here. If b is a signed
// byte, values >= 0x80 are negative in this comparison and in the "b << 8"
// below, unlike the "& 0xFF" uses -- confirm this is intended.
1289 if (b >= m_tokentable_.length) {
\r
1290 if (name.charAt(nindex ++) != (b & 0xFF)) {
\r
1295 token = m_tokentable_[b & 0xFF];
\r
1296 if (token == 0xFFFE) {
\r
1297 // this is a lead byte for a double-byte token
\r
1298 token = m_tokentable_[b << 8 |
\r
1299 (m_groupstring_[index + count] & 0x00ff)];
\r
// 0xFFFF entry: the byte is compared as a literal character, as above
1302 if (token == 0xFFFF) {
\r
1303 if (name.charAt(nindex ++) != (b & 0xFF)) {
\r
1308 // compare token with name
\r
1309 nindex = UCharacterUtility.compareNullTermByteSubString(
\r
1310 name, m_tokenstring_, nindex, token);
\r
// A match requires that all of name was consumed and that the stored name
// ended too: either the line is exhausted or the next byte is the ';'
// field separator.
1315 if (namelen == nindex &&
\r
1316 (count == len || m_groupstring_[index + count] == ';')) {
\r
1326 * Gets the character extended type
\r
1327 * @param ch character to be tested
\r
1328 * @return extended type it is associated with
\r
// Extends UCharacter.getType with three extra values: NON_CHARACTER_ for
// noncharacters, and LEAD_SURROGATE_ / TRAIL_SURROGATE_ in place of the
// single SURROGATE category.
// NOTE(review): the final return is not visible in this excerpt; presumably
// "result" is returned unchanged for every other category.
1330 private static int getType(int ch)
\r
1332 if (UCharacterUtility.isNonCharacter(ch)) {
\r
1333 // not a character, so we return an invalid category count
\r
1334 return NON_CHARACTER_;
\r
1336 int result = UCharacter.getType(ch);
\r
1337 if (result == UCharacterCategory.SURROGATE) {
\r
// surrogates up to and including LEAD_SURROGATE_MAX_VALUE count as lead
// surrogates; the assignment below covers the remaining ones
1338 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
\r
1339 result = LEAD_SURROGATE_;
\r
1342 result = TRAIL_SURROGATE_;
\r
1349 * Getting the character with extended name of the form <....>.
\r
1350 * @param name of the character to be found
\r
1351 * @param choice name choice
\r
1352 * @return character associated with the name, -1 if such character is not
\r
1353 * found and -2 if we should continue with the search.
\r
// Parses names of the shape "<type-number>": the text after the last '-' is
// parsed as the code point and the text before it is looked up in
// TYPE_NAMES_; the code point is accepted only when getType() of it equals
// the index of the matching type name.
// NOTE(review): name.charAt(0) throws StringIndexOutOfBoundsException for an
// empty string; confirm callers never pass "".
1355 private static int getExtendedChar(String name, int choice)
\r
1357 if (name.charAt(0) == '<') {
\r
// bracketed names are only interpreted for the extended name choice
1358 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
\r
1359 int endIndex = name.length() - 1;
\r
1360 if (name.charAt(endIndex) == '>') {
\r
1361 int startIndex = name.lastIndexOf('-');
\r
1362 if (startIndex >= 0) { // We've got a category.
\r
// NOTE(review): startIndex is presumably advanced past the '-' on a line not
// visible here, since the type name below ends at startIndex - 1. The radix
// argument of parseInt is not visible either.
1366 result = Integer.parseInt(
\r
1367 name.substring(startIndex, endIndex),
\r
1370 catch (NumberFormatException e) {
\r
1373 // Now validate the category name. We could use a
\r
1374 // binary search, or a trie, if we really wanted to.
\r
1375 String type = name.substring(1, startIndex - 1);
\r
1376 int length = TYPE_NAMES_.length;
\r
// linear scan; the index of the matching name is the expected type value
1377 for (int i = 0; i < length; ++ i) {
\r
1378 if (type.compareTo(TYPE_NAMES_[i]) == 0) {
\r
1379 if (getType(result) == i) {
\r
1393 // sets of name characters, maximum name lengths -----------------------
\r
1396 * Adds a codepoint into a set of ints.
\r
1397 * Equivalent to SET_ADD.
\r
1398 * @param set set to add to
\r
1399 * @param ch 16 bit char to add
\r
// The set is a bit field spread over an int array: 32 flags per element.
// NOTE(review): there is no bounds check, so ch must be smaller than
// 32 * set.length or the array access throws.
1401 private static void add(int set[], char ch)
\r
// element index = ch / 32 (ch >>> 5), bit inside the element = ch % 32
1403 set[ch >>> 5] |= 1 << (ch & 0x1f);
\r
1407 * Checks if a codepoint is a part of a set of ints.
\r
1408 * Equivalent to SET_CONTAINS.
\r
1409 * @param set set to check in
\r
1410 * @param ch 16 bit char to check
\r
1411 * @return true if codepoint is part of the set, false otherwise
\r
// Reads one flag of the bit field written by add(int[], char): 32 flags per
// array element.
// NOTE(review): no bounds check; ch must be smaller than 32 * set.length.
1413 private static boolean contains(int set[], char ch)
\r
// element index = ch / 32 (ch >>> 5), bit inside the element = ch % 32
1415 return (set[ch >>> 5] & (1 << (ch & 0x1f))) != 0;
\r
1419 * Adds all characters of the argument str and gets the length
\r
1420 * Equivalent to calcStringSetLength.
\r
1421 * @param set set to add all chars of str to
\r
1422 * @param str string to add
\r
// Flags every char of str in set through add(int[], char), walking from the
// last char to the first.
// NOTE(review): the return statement is not visible here; presumably
// "result" (the length of str) is what is returned.
1424 private static int add(int set[], String str)
\r
1426 int result = str.length();
\r
1428 for (int i = result - 1; i >= 0; i --) {
\r
1429 add(set, str.charAt(i));
\r
1435 * Adds all characters of the argument str and gets the length
\r
1436 * Equivalent to calcStringSetLength.
\r
1437 * @param set set to add all chars of str to
\r
1438 * @param str string to add
\r
// StringBuffer variant of add(int[], String): flags every char of str in
// set through add(int[], char), walking from the last char to the first.
// NOTE(review): the return statement is not visible here; presumably
// "result" (the length of str) is what is returned.
1440 private static int add(int set[], StringBuffer str)
\r
1442 int result = str.length();
\r
1444 for (int i = result - 1; i >= 0; i --) {
\r
1445 add(set, str.charAt(i));
\r
1451 * Adds all algorithmic names into the name set.
\r
1452 * Equivalent to part of calcAlgNameSetsLengths.
\r
1453 * @param maxlength length to compare to
\r
1454 * @return the maximum length of any possible algorithmic name if it is >
\r
1455 * maxlength, otherwise maxlength is returned.
\r
// Asks every entry of m_algorithm_ (last to first) to add its characters to
// m_nameSet_ and keeps the largest value any of them reports in maxlength.
1457 private int addAlgorithmName(int maxlength)
\r
1460 for (int i = m_algorithm_.length - 1; i >= 0; i --) {
\r
// the running maximum is passed in, so each range sees the best so far
1461 result = m_algorithm_[i].add(m_nameSet_, maxlength);
\r
1462 if (result > maxlength) {
\r
1463 maxlength = result;
\r
1470 * Adds all extended names into the name set.
\r
1471 * Equivalent to part of calcExtNameSetsLengths.
\r
1472 * @param maxlength length to compare to
\r
1473 * @return the maxlength of any possible extended name.
\r
// Adds the characters of every entry of TYPE_NAMES_ to m_nameSet_ and raises
// maxlength to the longest "type name + fixed overhead" seen.
1475 private int addExtendedName(int maxlength)
\r
1477 for (int i = TYPE_NAMES_.length - 1; i >= 0; i --) {
\r
1478 // for each category, count the length of the category name
\r
1482 // 6 for most hex digits per code point
\r
// NOTE(review): the overhead is 9; the visible comment accounts for 6 hex
// digits, the other 3 presumably cover '<', '-' and '>' -- confirm.
1483 int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]);
\r
1484 if (length > maxlength) {
\r
1485 maxlength = length;
\r
1492 * Adds names of a group to the argument set.
\r
1493 * Equivalent to calcNameSetLength.
\r
1494 * @param offset of the group name string in byte count
\r
1495 * @param length of the group name string
\r
1496 * @param tokenlength array to store the length of each token
\r
1497 * @param set to add to
\r
1498 * @return the length of the name string and the length of the group
\r
// Walks one stored name in m_groupstring_, starting at offset, and flags the
// characters it expands to in set. Two counters are kept: resultnlength is
// the length of the expanded name, resultplength is the number of bytes of
// the stored form consumed so far.
// The result is returned in the shared field m_utilIntBuffer_
// ([0] = resultnlength, [1] = resultplength), so a caller must read both
// values before calling this method again.
1501 private int[] addGroupName(int offset, int length, byte tokenlength[],
\r
1504 int resultnlength = 0;
\r
1505 int resultplength = 0;
\r
1506 while (resultplength < length) {
\r
// the stored byte is widened without sign extension
1507 char b = (char)(m_groupstring_[offset + resultplength] & 0xff);
\r
// a value outside the token table is a literal character of the name
1513 if (b >= m_tokentable_.length) {
\r
1514 add(set, b); // implicit letter
\r
1518 char token = m_tokentable_[b & 0x00ff];
\r
1519 if (token == 0xFFFE) {
\r
1520 // this is a lead byte for a double-byte token
\r
1521 b = (char)(b << 8 | (m_groupstring_[offset + resultplength]
\r
1523 token = m_tokentable_[b];
\r
1526 if (token == 0xFFFF) {
\r
1531 // count token word
\r
1532 // use cached token length
\r
1533 byte tlength = tokenlength[b];
\r
// 0 means "not measured yet": expand the token word once into the scratch
// buffer, flag its characters and remember its length for later lines
1534 if (tlength == 0) {
\r
1535 synchronized (m_utilStringBuffer_) {
\r
1536 m_utilStringBuffer_.delete(0,
\r
1537 m_utilStringBuffer_.length());
\r
1538 UCharacterUtility.getNullTermByteSubString(
\r
1539 m_utilStringBuffer_, m_tokenstring_,
\r
// NOTE(review): the length is narrowed to byte; assumes no token word is
// longer than 127 characters -- confirm.
1541 tlength = (byte)add(set, m_utilStringBuffer_);
\r
1543 tokenlength[b] = tlength;
\r
1545 resultnlength += tlength;
\r
1549 m_utilIntBuffer_[0] = resultnlength;
\r
1550 m_utilIntBuffer_[1] = resultplength;
\r
1551 return m_utilIntBuffer_;
\r
1555 * Adds names of all group to the argument set.
\r
1556 * Sets the data member m_max*Length_.
\r
1557 * Method called only once.
\r
1558 * Equivalent to calcGroupNameSetsLength.
\r
1559 * @param maxlength length to compare to
\r
// For every line of every group this parses up to three consecutive fields
// of the stored string -- the regular name, the Unicode 1.0 name and the ISO
// comment -- through addGroupName(int, int, byte[], int[]). In the returned
// pair, [0] is the expanded length and [1] the number of stored bytes
// consumed. Finally the two maxima are published in m_maxISOCommentLength_
// and m_maxNameLength_.
1561 private void addGroupName(int maxlength)
\r
1563 int maxisolength = 0;
\r
// local work arrays for getGroupLengths, plus one token-length cache that is
// shared by all lines and groups processed in this call
1564 char offsets[] = new char[LINES_PER_GROUP_ + 2];
\r
1565 char lengths[] = new char[LINES_PER_GROUP_ + 2];
\r
1566 byte tokenlengths[] = new byte[m_tokentable_.length];
\r
1568 // enumerate all groups
\r
1569 // for (int i = m_groupcount_ - 1; i >= 0; i --) {
\r
1570 for (int i = 0; i < m_groupcount_ ; i ++) {
\r
1571 int offset = getGroupLengths(i, offsets, lengths);
\r
1572 // enumerate all lines in each group
\r
1573 // for (int linenumber = LINES_PER_GROUP_ - 1; linenumber >= 0;
\r
1574 // linenumber --) {
\r
1575 for (int linenumber = 0; linenumber < LINES_PER_GROUP_;
\r
1577 int lineoffset = offset + offsets[linenumber];
\r
1578 int length = lengths[linenumber];
\r
// a zero length marks a line with no stored data
1579 if (length == 0) {
\r
1583 // read regular name
\r
1584 int parsed[] = addGroupName(lineoffset, length, tokenlengths,
\r
1586 if (parsed[0] > maxlength) {
\r
1587 // 0 for name length
\r
1588 maxlength = parsed[0];
\r
// advance past the bytes just parsed; if they used up the whole line there
// are no further fields
1590 lineoffset += parsed[1];
\r
1591 if (parsed[1] >= length) {
\r
1592 // 1 for parsed group string length
\r
1595 length -= parsed[1];
\r
1596 // read Unicode 1.0 name
\r
1597 parsed = addGroupName(lineoffset, length, tokenlengths,
\r
1599 if (parsed[0] > maxlength) {
\r
1600 // 0 for name length
\r
1601 maxlength = parsed[0];
\r
1603 lineoffset += parsed[1];
\r
1604 if (parsed[1] >= length) {
\r
1605 // 1 for parsed group string length
\r
1608 length -= parsed[1];
\r
1609 // read ISO comment
\r
1610 parsed = addGroupName(lineoffset, length, tokenlengths,
\r
1611 m_ISOCommentSet_);
\r
// NOTE(review): unlike the two name fields above, which compare and store
// parsed[0] (the expanded length), this step tests parsed[1] (stored bytes
// consumed) and stores the remaining stored length. Confirm whether
// parsed[0] was intended on both lines.
1612 if (parsed[1] > maxisolength) {
\r
1613 maxisolength = length;
\r
1618 // set gMax... - name length last for threading
\r
1619 m_maxISOCommentLength_ = maxisolength;
\r
1620 m_maxNameLength_ = maxlength;
\r
1624 * Sets up the name sets and the calculation of the maximum lengths.
\r
1625 * Equivalent to calcNameSetsLengths.
\r
// Fills m_nameSet_ and computes the maximum name length in three stages:
// algorithmic names, extended names, then the group names.
// NOTE(review): the return statements are not visible in this excerpt, so
// the meaning of the boolean result cannot be confirmed from here.
1627 private boolean initNameSetsLengths()
\r
// a positive maximum means the sets have already been computed
1629 if (m_maxNameLength_ > 0) {
\r
1633 String extra = "0123456789ABCDEF<>-";
\r
1634 // set hex digits, used in various names, and <>-, used in extended
\r
1636 for (int i = extra.length() - 1; i >= 0; i --) {
\r
1637 add(m_nameSet_, extra.charAt(i));
\r
1640 // set sets and lengths from algorithmic names
\r
1641 m_maxNameLength_ = addAlgorithmName(0);
\r
1642 // set sets and lengths from extended names
\r
1643 m_maxNameLength_ = addExtendedName(m_maxNameLength_);
\r
1644 // set sets and lengths from group names, set global maximum values
\r
// addGroupName(int) stores m_maxNameLength_ and m_maxISOCommentLength_ itself
1645 addGroupName(m_maxNameLength_);
\r
1650 * Converts the char set cset into a Unicode set uset.
\r
1651 * Equivalent to charSetToUSet.
\r
1652 * @param set Set of 256 bit flags corresponding to a set of chars.
\r
1653 * @param uset USet to receive characters. Existing contents are deleted.
\r
1655 private void convert(int set[], UnicodeSet uset)
\r
1658 if (!initNameSetsLengths()) {
\r
1662 // build a char string with all chars that are used in character names
\r
1663 for (char c = 255; c > 0; c --) {
\r
1664 if (contains(set, c)) {
\r