2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
10 * <p>A <code>CollationKey</code> represents a <code>String</code>
\r
11 * under the rules of a specific <code>Collator</code>
\r
12 * object. Comparing two <code>CollationKey</code>s returns the
\r
13 * relative order of the <code>String</code>s they represent.</p>
\r
15 * <p>Since the rule set of <code>Collator</code>s can differ, the
\r
16 * sort orders of the same string under two different
\r
17 * <code>Collator</code>s might differ. Hence comparing
\r
18 * <code>CollationKey</code>s generated from different
\r
19 * <code>Collator</code>s can give incorrect results.</p>
\r
21 * <p>Both the method
\r
22 * <code>CollationKey.compareTo(CollationKey)</code> and the method
\r
23 * <code>Collator.compare(String, String)</code> compare two strings
\r
24 * and returns their relative order. The performance characterictics
\r
25 * of these two approaches can differ.</p>
\r
27 * <p>During the construction of a <code>CollationKey</code>, the
\r
28 * entire source string is examined and processed into a series of
\r
29 * bits terminated by a null, that are stored in the <code>CollationKey</code>.
\r
30 * When <code>CollationKey.compareTo(CollationKey)</code> executes, it
\r
31 * performs bitwise comparison on the bit sequences. This can incurs
\r
32 * startup cost when creating the <code>CollationKey</code>, but once
\r
33 * the key is created, binary comparisons are fast. This approach is
\r
34 * recommended when the same strings are to be compared over and over
\r
37 * <p>On the other hand, implementations of
\r
38 * <code>Collator.compare(String, String)</code> can examine and
\r
39 * process the strings only until the first characters differing in
\r
40 * order. This approach is recommended if the strings are to be
\r
41 * compared only once.</p>
\r
43 * <p>More information about the composition of the bit sequence can
\r
45 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
\r
46 * user guide</a>.</p>
\r
48 * <p>The following example shows how <code>CollationKey</code>s can be used
\r
49 * to sort a list of <code>String</code>s.</p>
\r
52 * // Create an array of CollationKeys for the Strings to be sorted.
\r
53 * Collator myCollator = Collator.getInstance();
\r
54 * CollationKey[] keys = new CollationKey[3];
\r
55 * keys[0] = myCollator.getCollationKey("Tom");
\r
56 * keys[1] = myCollator.getCollationKey("Dick");
\r
57 * keys[2] = myCollator.getCollationKey("Harry");
\r
62 * // Inside body of sort routine, compare keys this way
\r
63 * if( keys[i].compareTo( keys[j] ) > 0 )
\r
64 * // swap keys[i] and keys[j]
\r
68 * // Finally, when we've returned from sort.
\r
69 * System.out.println( keys[0].getSourceString() );
\r
70 * System.out.println( keys[1].getSourceString() );
\r
71 * System.out.println( keys[2].getSourceString() );
\r
76 * This class is not subclassable
\r
79 * @see RuleBasedCollator
\r
80 * @author Syn Wee Quek
\r
83 public final class CollationKey implements Comparable<CollationKey>
\r
85 // public inner classes -------------------------------------------------
\r
88 * Options that used in the API CollationKey.getBound() for getting a
\r
89 * CollationKey based on the bound mode requested.
\r
92 public static final class BoundMode
\r
95 * do not change the values assigned to the members of this enum.
\r
96 * Underlying code depends on them having these numbers
\r
103 public static final int LOWER = 0;
\r
106 * Upper bound that will match strings of exact size
\r
109 public static final int UPPER = 1;
\r
112 * Upper bound that will match all the strings that have the same
\r
113 * initial substring as the given string
\r
116 public static final int UPPER_LONG = 2;
\r
119 * Number of bound mode
\r
122 public static final int COUNT = 3;
\r
125 * Private Constructor
\r
128 private BoundMode(){}
\r
132 // public constructor ---------------------------------------------------
\r
135 * CollationKey constructor.
\r
136 * This constructor is given public access, unlike the JDK version, to
\r
137 * allow access to users extending the Collator class. See
\r
138 * {@link Collator#getCollationKey(String)}.
\r
139 * @param source string this CollationKey is to represent
\r
140 * @param key array of bytes that represent the collation order of argument
\r
141 * source terminated by a null
\r
145 public CollationKey(String source, byte key[])
\r
147 m_source_ = source;
\r
154 * CollationKey constructor that forces key to release its internal byte
\r
155 * array for adoption. key will have a null byte array after this
\r
157 * @param source string this CollationKey is to represent
\r
158 * @param key RawCollationKey object that represents the collation order of
\r
159 * argument source.
\r
161 * @see RawCollationKey
\r
164 public CollationKey(String source, RawCollationKey key)
\r
166 m_source_ = source;
\r
167 m_key_ = key.releaseBytes();
\r
172 // public getters -------------------------------------------------------
\r
175 * Return the source string that this CollationKey represents.
\r
176 * @return source string that this CollationKey represents
\r
179 public String getSourceString()
\r
185 * <p>Duplicates and returns the value of this CollationKey as a sequence
\r
186 * of big-endian bytes terminated by a null.</p>
\r
188 * <p>If two CollationKeys can be legitimately compared, then one can
\r
189 * compare the byte arrays of each to obtain the same result, e.g.
\r
191 * byte key1[] = collationkey1.toByteArray();
\r
192 * byte key2[] = collationkey2.toByteArray();
\r
193 * int key, targetkey;
\r
196 * key = key1[i] & 0xFF;
\r
197 * targetkey = key2[i] & 0xFF;
\r
198 * if (key < targetkey) {
\r
199 * System.out.println("String 1 is less than string 2");
\r
202 * if (targetkey < key) {
\r
203 * System.out.println("String 1 is more than string 2");
\r
206 * } while (key != 0 && targetKey != 0);
\r
208 * System.out.println("Strings are equal.");
\r
211 * @return CollationKey value in a sequence of big-endian byte bytes
\r
212 * terminated by a null.
\r
215 public byte[] toByteArray()
\r
219 if (m_key_[length] == 0) {
\r
225 byte result[] = new byte[length];
\r
226 System.arraycopy(m_key_, 0, result, 0, length);
\r
230 // public other methods -------------------------------------------------
\r
233 * <p>Compare this CollationKey to another CollationKey. The
\r
234 * collation rules of the Collator that created this key are
\r
237 * <p><strong>Note:</strong> Comparison between CollationKeys
\r
238 * created by different Collators might return incorrect
\r
239 * results. See class documentation.</p>
\r
241 * @param target target CollationKey
\r
242 * @return an integer value. If the value is less than zero this CollationKey
\r
243 * is less than than target, if the value is zero they are equal, and
\r
244 * if the value is greater than zero this CollationKey is greater
\r
246 * @exception NullPointerException is thrown if argument is null.
\r
247 * @see Collator#compare(String, String)
\r
250 public int compareTo(CollationKey target)
\r
252 for (int i = 0;; ++i) {
\r
253 int l = m_key_[i]&0xff;
\r
254 int r = target.m_key_[i]&0xff;
\r
257 } else if (l > r) {
\r
259 } else if (l == 0) {
\r
266 * <p>Compare this CollationKey and the specified Object for
\r
267 * equality. The collation rules of the Collator that created
\r
268 * this key are applied.</p>
\r
270 * <p>See note in compareTo(CollationKey) for warnings about
\r
271 * possible incorrect results.</p>
\r
273 * @param target the object to compare to.
\r
274 * @return true if the two keys compare as equal, false otherwise.
\r
275 * @see #compareTo(CollationKey)
\r
276 * @exception ClassCastException is thrown when the argument is not
\r
277 * a CollationKey. NullPointerException is thrown when the argument
\r
281 public boolean equals(Object target)
\r
283 if (!(target instanceof CollationKey)) {
\r
287 return equals((CollationKey)target);
\r
292 * Compare this CollationKey and the argument target CollationKey for
\r
295 * rules of the Collator object which created these objects are applied.
\r
298 * See note in compareTo(CollationKey) for warnings of incorrect results
\r
300 * @param target the CollationKey to compare to.
\r
301 * @return true if two objects are equal, false otherwise.
\r
302 * @exception NullPointerException is thrown when the argument is null.
\r
305 public boolean equals(CollationKey target)
\r
307 if (this == target) {
\r
310 if (target == null) {
\r
313 CollationKey other = target;
\r
316 if (m_key_[i] != other.m_key_[i]) {
\r
319 if (m_key_[i] == 0) {
\r
328 * <p>Returns a hash code for this CollationKey. The hash value is calculated
\r
329 * on the key itself, not the String from which the key was created. Thus
\r
330 * if x and y are CollationKeys, then x.hashCode(x) == y.hashCode()
\r
331 * if x.equals(y) is true. This allows language-sensitive comparison in a
\r
334 * @return the hash value.
\r
337 public int hashCode()
\r
339 if (m_hashCode_ == 0) {
\r
340 if (m_key_ == null) {
\r
344 int size = m_key_.length >> 1;
\r
345 StringBuilder key = new StringBuilder(size);
\r
347 while (m_key_[i] != 0 && m_key_[i + 1] != 0) {
\r
348 key.append((char)((m_key_[i] << 8) | m_key_[i + 1]));
\r
351 if (m_key_[i] != 0) {
\r
352 key.append((char)(m_key_[i] << 8));
\r
354 m_hashCode_ = key.toString().hashCode();
\r
357 return m_hashCode_;
\r
362 * Produce a bound for the sort order of a given collation key and a
\r
363 * strength level. This API does not attempt to find a bound for the
\r
364 * CollationKey String representation, hence null will be returned in its
\r
368 * Resulting bounds can be used to produce a range of strings that are
\r
369 * between upper and lower bounds. For example, if bounds are produced
\r
370 * for a sortkey of string "smith", strings between upper and lower
\r
371 * bounds with primary strength would include "Smith", "SMITH", "sMiTh".
\r
374 * There are two upper bounds that can be produced. If BoundMode.UPPER
\r
375 * is produced, strings matched would be as above. However, if a bound
\r
376 * is produced using BoundMode.UPPER_LONG is used, the above example will
\r
377 * also match "Smithsonian" and similar.
\r
380 * For more on usage, see example in test procedure
\r
381 * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/collator/CollationAPITest.java">
\r
382 * src/com/ibm/icu/dev/test/collator/CollationAPITest/TestBounds.
\r
386 * Collation keys produced may be compared using the <TT>compare</TT> API.
\r
388 * @param boundType Mode of bound required. It can be BoundMode.LOWER, which
\r
389 * produces a lower inclusive bound, BoundMode.UPPER, that
\r
390 * produces upper bound that matches strings of the same
\r
391 * length or BoundMode.UPPER_LONG that matches strings that
\r
392 * have the same starting substring as the source string.
\r
393 * @param noOfLevels Strength levels required in the resulting bound
\r
394 * (for most uses, the recommended value is PRIMARY). This
\r
395 * strength should be less than the maximum strength of
\r
396 * this CollationKey.
\r
397 * See users guide for explanation on the strength levels a
\r
398 * collation key can have.
\r
399 * @return the result bounded CollationKey with a valid sort order but
\r
400 * a null String representation.
\r
401 * @exception IllegalArgumentException thrown when the strength level
\r
402 * requested is higher than or equal to the strength in this
\r
404 * In the case of an Exception, information
\r
405 * about the maximum strength to use will be returned in the
\r
406 * Exception. The user can then call getBound() again with the
\r
407 * appropriate strength.
\r
408 * @see CollationKey
\r
409 * @see CollationKey.BoundMode
\r
410 * @see Collator#PRIMARY
\r
411 * @see Collator#SECONDARY
\r
412 * @see Collator#TERTIARY
\r
413 * @see Collator#QUATERNARY
\r
414 * @see Collator#IDENTICAL
\r
417 public CollationKey getBound(int boundType, int noOfLevels)
\r
419 // Scan the string until we skip enough of the key OR reach the end of
\r
422 int keystrength = Collator.PRIMARY;
\r
424 if (noOfLevels > Collator.PRIMARY) {
\r
425 while (offset < m_key_.length && m_key_[offset] != 0) {
\r
426 if (m_key_[offset ++]
\r
427 == RuleBasedCollator.SORT_LEVEL_TERMINATOR_) {
\r
430 if (noOfLevels == Collator.PRIMARY
\r
431 || offset == m_key_.length || m_key_[offset] == 0) {
\r
439 if (noOfLevels > 0) {
\r
440 throw new IllegalArgumentException(
\r
441 "Source collation key has only "
\r
443 + " strength level. Call getBound() again "
\r
444 + " with noOfLevels < " + keystrength);
\r
447 // READ ME: this code assumes that the values for BoundMode variables
\r
448 // will not changes. They are set so that the enum value corresponds to
\r
449 // the number of extra bytes each bound type needs.
\r
450 byte resultkey[] = new byte[offset + boundType + 1];
\r
451 System.arraycopy(m_key_, 0, resultkey, 0, offset);
\r
452 switch (boundType) {
\r
453 case BoundMode.LOWER: // = 0
\r
454 // Lower bound just gets terminated. No extra bytes
\r
456 case BoundMode.UPPER: // = 1
\r
457 // Upper bound needs one extra byte
\r
458 resultkey[offset ++] = 2;
\r
460 case BoundMode.UPPER_LONG: // = 2
\r
461 // Upper long bound needs two extra bytes
\r
462 resultkey[offset ++] = (byte)0xFF;
\r
463 resultkey[offset ++] = (byte)0xFF;
\r
466 throw new IllegalArgumentException(
\r
467 "Illegal boundType argument");
\r
469 resultkey[offset ++] = 0;
\r
470 return new CollationKey(null, resultkey);
\r
477 * Merges this CollationKey with another. Only the sorting order of the
\r
478 * CollationKeys will be merged. This API does not attempt to merge the
\r
479 * String representations of the CollationKeys, hence null will be returned
\r
480 * as the String representation.
\r
483 * The strength levels are merged with their corresponding counterparts
\r
484 * (PRIMARIES with PRIMARIES, SECONDARIES with SECONDARIES etc.).
\r
487 * The merged String representation of the result CollationKey will be a
\r
488 * concatenation of the String representations of the 2 source
\r
492 * Between the values from the same level a separator is inserted.
\r
493 * example (uncompressed):
\r
495 * 191B1D 01 050505 01 910505 00 and 1F2123 01 050505 01 910505 00
\r
496 * will be merged as
\r
497 * 191B1D 02 1F212301 050505 02 050505 01 910505 02 910505 00
\r
501 * This allows for concatenating of first and last names for sorting, among
\r
505 * @param source CollationKey to merge with
\r
506 * @return a CollationKey that contains the valid merged sorting order
\r
507 * with a null String representation,
\r
508 * i.e. <tt>new CollationKey(null, merge_sort_order)</tt>
\r
509 * @exception IllegalArgumentException thrown if source CollationKey
\r
510 * argument is null or of 0 length.
\r
513 public CollationKey merge(CollationKey source)
\r
516 if (source == null || source.getLength() == 0) {
\r
517 throw new IllegalArgumentException(
\r
518 "CollationKey argument can not be null or of 0 length");
\r
521 getLength(); // gets the length of this sort key
\r
522 int sourcelength = source.getLength();
\r
523 // 1 extra for the last strength that has no seperators
\r
524 byte result[] = new byte[m_length_ + sourcelength + 2];
\r
526 // merge the sort keys with the same number of levels
\r
529 int sourceindex = 0;
\r
531 // while both have another level
\r
532 // copy level from src1 not including 00 or 01
\r
534 while (m_key_[index] < 0 || m_key_[index] >= MERGE_SEPERATOR_) {
\r
535 result[rindex ++] = m_key_[index ++];
\r
538 // add a 02 merge separator
\r
539 result[rindex ++] = MERGE_SEPERATOR_;
\r
541 // copy level from src2 not including 00 or 01
\r
542 while (source.m_key_[sourceindex] < 0
\r
543 || source.m_key_[sourceindex] >= MERGE_SEPERATOR_) {
\r
544 result[rindex ++] = source.m_key_[sourceindex ++];
\r
547 // if both sort keys have another level, then add a 01 level
\r
548 // separator and continue
\r
549 if (m_key_[index] == RuleBasedCollator.SORT_LEVEL_TERMINATOR_
\r
550 && source.m_key_[sourceindex]
\r
551 == RuleBasedCollator.SORT_LEVEL_TERMINATOR_) {
\r
554 result[rindex ++] = RuleBasedCollator.SORT_LEVEL_TERMINATOR_;
\r
561 // here, at least one sort key is finished now, but the other one
\r
562 // might have some contents left from containing more levels;
\r
563 // that contents is just appended to the result
\r
564 if (m_key_[index] != 0) {
\r
565 System.arraycopy(m_key_, index, result, rindex,
\r
566 m_length_ - index);
\r
568 else if (source.m_key_[sourceindex] != 0) {
\r
569 System.arraycopy(source.m_key_, sourceindex, result, rindex,
\r
570 source.m_length_ - sourceindex);
\r
572 result[result.length - 1] = 0;
\r
574 // trust that neither sort key contained illegally embedded zero bytes
\r
575 return new CollationKey(null, result);
\r
578 // private data members -------------------------------------------------
\r
581 * Sequence of bytes that represents the sort key
\r
583 private byte m_key_[];
\r
586 * Source string this CollationKey represents
\r
588 private String m_source_;
\r
591 * Hash code for the key
\r
593 private int m_hashCode_;
\r
595 * Gets the length of this CollationKey
\r
597 private int m_length_;
\r
599 * Collation key merge seperator
\r
601 private static final int MERGE_SEPERATOR_ = 2;
\r
603 // private methods ------------------------------------------------------
\r
606 * Gets the length of the CollationKey
\r
607 * @return length of the CollationKey
\r
609 private int getLength()
\r
611 if (m_length_ >= 0) {
\r
614 int length = m_key_.length;
\r
615 for (int index = 0; index < length; index ++) {
\r
616 if (m_key_[index] == 0) {
\r
621 m_length_ = length;
\r