2 *******************************************************************************
3 * Copyright (C) 1996-2013, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 *******************************************************************************
10 package com.ibm.icu.text;
13 * import java.text.StringCharacterIterator;
14 * import java.text.CharacterIterator;
16 import java.text.CharacterIterator;
17 import java.util.MissingResourceException;
19 import com.ibm.icu.impl.CharacterIteratorWrapper;
20 import com.ibm.icu.impl.ICUDebug;
21 import com.ibm.icu.impl.Norm2AllModes;
22 import com.ibm.icu.impl.Normalizer2Impl;
23 import com.ibm.icu.impl.StringUCharacterIterator;
24 import com.ibm.icu.impl.UCharacterProperty;
25 import com.ibm.icu.lang.UCharacter;
28 * <p><code>CollationElementIterator</code> is an iterator created by
29 * a RuleBasedCollator to walk through a string. The return result of
30 * each iteration is a 32-bit collation element that defines the
31 * ordering priority of the next character or sequence of characters
32 * in the source string.</p>
34 * <p>For illustration, consider the following in Spanish:
37 * "ca" -> the first collation element is collation_element('c') and second
38 * collation element is collation_element('a').
40 * Since "ch" in Spanish sorts as one entity, the below example returns one
41 * collation element for the two characters 'c' and 'h'
43 * "cha" -> the first collation element is collation_element('ch') and second
44 * collation element is collation_element('a').
50 * Since the character 'æ' is a composed character of 'a' and 'e', the
51 * iterator returns two collation elements for the single character 'æ'
53 * "æb" -> the first collation element is collation_element('a'), the
54 * second collation element is collation_element('e'), and the
55 * third collation element is collation_element('b').
60 * <p>For collation ordering comparison, the collation element results
61 * cannot be compared simply by using basic arithmetic operators,
62 * e.g. <, == or >, further processing has to be done. Details
63 * can be found in the ICU
64 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
65 * user guide</a>. An example of using the CollationElementIterator
66 * for collation ordering comparison is the class
67 * <a href="StringSearch.html">com.ibm.icu.text.StringSearch</a>.</p>
69 * <p>To construct a CollationElementIterator object, users
70 * call the method getCollationElementIterator() on a
71 * RuleBasedCollator that defines the desired sorting order.</p>
76 * String testString = "This is a test";
77 * RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
78 * CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
79 * int primaryOrder = iterator.IGNORABLE;
80 * while (primaryOrder != iterator.NULLORDER) {
81 * int order = iterator.next();
82 * if (order != iterator.IGNORABLE &&
83 * order != iterator.NULLORDER) {
84 * // order is valid, not ignorable and we have not passed the end
85 * // of the iteration, we do something
86 * primaryOrder = CollationElementIterator.primaryOrder(order);
87 * System.out.println("Next primary order 0x" +
88 * Integer.toHexString(primaryOrder));
95 * The method next() returns the collation order of the next character based on
96 * the comparison level of the collator. The method previous() returns the
97 * collation order of the previous character based on the comparison level of
98 * the collator. The Collation Element Iterator moves only in one direction
99 * between calls to reset(), setOffset(), or setText(). That is, next() and
100 * previous() cannot be intermixed. Whenever previous() is to be called after
101 * next() or vice versa, reset(), setOffset() or setText() has to be called first
102 * to reset the status, shifting current position to either the end or the start of
103 * the string (reset() or setText()), or the specified position (setOffset()).
104 * Hence at the next call of next() or previous(), the first or last collation order,
105 * or collation order at the specified position will be returned. If a change of
106 * direction is done without one of these calls, the result is undefined.
109 * This class is not subclassable.
112 * @see RuleBasedCollator
114 * @author Syn Wee Quek
117 public final class CollationElementIterator
121 // public data members --------------------------------------------------
124 * <p>This constant is returned by the iterator in the methods
125 * next() and previous() when the end or the beginning of the
126 * source string has been reached, and there are no more valid
127 * collation elements to return.</p>
129 * <p>See class documentation for an example of use.</p>
133 public final static int NULLORDER = 0xffffffff;
136 * <p>This constant is returned by the iterator in the methods
137 * next() and previous() when a collation element result is to be
140 * <p>See class documentation for an example of use.</p>
144 public static final int IGNORABLE = 0;
146 // public methods -------------------------------------------------------
148 // public getters -------------------------------------------------------
151 * <p>Returns the character offset in the source string
152 * corresponding to the next collation element. I.e., getOffset()
153 * returns the position in the source string corresponding to the
154 * collation element that will be returned by the next call to
155 * next() or previous(). This value could be any of:
157 * <li> The index of the <b>first</b> character corresponding to
158 * the next collation element. (This means that if
159 * <code>setOffset(offset)</code> sets the index in the middle of
160 * a contraction, <code>getOffset()</code> returns the index of
161 * the first character in the contraction, which may not be equal
162 * to the original offset that was set. Hence calling getOffset()
163 * immediately after setOffset(offset) does not guarantee that the
164 * original offset set will be returned.)
165 * <li> If normalization is on, the index of the <b>immediate</b>
166 * subsequent character, or composite character with the first
167 * character, having a combining class of 0.
168 * <li> The length of the source string, if iteration has reached
172 * @return The character offset in the source string corresponding to the
173 * collation element that will be returned by the next call to
174 * next() or previous().
177 public int getOffset()
// m_bufferOffset_ is -1 while the iterator is not positioned inside
// m_buffer_; any other value means it is reading from that buffer.
179 if (m_bufferOffset_ != -1) {
// Reports the current index of the underlying source iterator.
185 return m_source_.getIndex();
190 * <p> Returns the maximum length of any expansion sequence that ends with
191 * the specified collation element. If there is no expansion with this
192 * collation element as the last element, returns 1.
194 * @param ce a collation element returned by previous() or next().
195 * @return the maximum length of any expansion sequence ending
196 * with the specified collation element.
199 public int getMaxExpansion(int ce)
202 int limit = m_collator_.m_expansionEndCE_.length;
// Mask into a long so that ce is compared as an unsigned 32-bit value.
203 long unsignedce = ce & 0xFFFFFFFFl;
// Binary search over m_expansionEndCE_ (presumably sorted by unsigned
// value -- TODO confirm against the code that builds the table).
204 while (start < limit - 1) {
// Midpoint is computed as start plus half the remaining span.
205 int mid = start + ((limit - start) >> 1);
206 long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
207 if (unsignedce <= midce) {
// An exact match at the start index yields the maximum expansion size
// recorded for that ending CE in the parallel m_expansionEndCEMaxSize_ array.
215 if (m_collator_.m_expansionEndCE_[start] == ce) {
216 result = m_collator_.m_expansionEndCEMaxSize_[start];
// limit may still equal the array length, so it is bounds-checked before
// being used as an index.
218 else if (limit < m_collator_.m_expansionEndCE_.length &&
219 m_collator_.m_expansionEndCE_[limit] == ce) {
220 result = m_collator_.m_expansionEndCEMaxSize_[limit];
// Separate case for CEs whose low 16 bits equal 0x00C0.
222 else if ((ce & 0xFFFF) == 0x00C0) {
228 // public other methods -------------------------------------------------
231 * <p> Resets the cursor to the beginning of the string. The next
232 * call to next() or previous() will return the first and last
233 * collation element in the string, respectively.</p>
235 * <p>If the RuleBasedCollator used by this iterator has had its
236 * attributes changed, calling reset() will reinitialize the
237 * iterator to use the new attributes.</p>
243 m_source_.setToStart();
244 updateInternalState();
245 m_direction = 0; // initial state
249 * <p>Get the next collation element in the source string.</p>
251 * <p>This iterator iterates over a sequence of collation elements
252 * that were built from the string. Because there isn't
253 * necessarily a one-to-one mapping from characters to collation
254 * elements, this doesn't mean the same thing as "return the
255 * collation element [or ordering priority] of the next character
256 * in the string".</p>
258 * <p>This function returns the collation element that the
259 * iterator is currently pointing to, and then updates the
260 * internal pointer to point to the next element.</p>
262 * @return the next collation element or NULLORDER if the end of the
263 * iteration has been reached.
268 assert m_direction >= 0;
271 m_isForwards_ = true;
272 if (m_CEBufferSize_ > 0) {
273 if (m_CEBufferOffset_ < m_CEBufferSize_) {
274 // if there are expansions left in the buffer, we return it
275 return m_CEBuffer_[m_CEBufferOffset_ ++];
278 m_CEBufferOffset_ = 0;
281 int result = NULLORDER;
284 int ch_int = nextChar();
285 if (ch_int == UCharacterIterator.DONE) {
289 if (m_collator_.m_isHiragana4_) {
290 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
291 * based on whether the previous codepoint was Hiragana or Katakana.
293 m_isCodePointHiragana_ = (m_isCodePointHiragana_ && (ch >= 0x3099 && ch <= 0x309C)) ||
294 ((ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d));
298 // For latin-1 characters we never need to fall back to the UCA
299 // table because all of the UCA data is replicated in the
300 // latinOneMapping array.
301 // Except: Special CEs can result in CE_NOT_FOUND_,
302 // for example if the default entry for a prefix-special is "not found",
303 // and we do need to fall back to the UCA in such a case.
304 // TODO: It would be better if tailoring specials never resulted in "not found"
305 // unless the corresponding UCA result is also "not found".
306 // That would require a change in the ICU4J collator-from-rule builder.
307 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
309 result = m_collator_.m_trie_.getLeadValue(ch);
311 if (!RuleBasedCollator.isSpecial(result)) {
314 if (result != CE_NOT_FOUND_) {
315 result = nextSpecial(m_collator_, result, ch);
317 if (result == CE_NOT_FOUND_) {
318 // couldn't find a good CE in the tailoring
319 if (RuleBasedCollator.UCA_ != null) {
320 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
321 if (RuleBasedCollator.isSpecial(result)) {
322 // UCA also gives us a special CE
323 result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
326 if(result == CE_NOT_FOUND_) {
327 // maybe there is no UCA, unlikely in Java, but ported for consistency
328 result = nextImplicit(ch);
331 } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
337 * <p>Get the previous collation element in the source string.</p>
339 * <p>This iterator iterates over a sequence of collation elements
340 * that were built from the string. Because there isn't
341 * necessarily a one-to-one mapping from characters to collation
342 * elements, this doesn't mean the same thing as "return the
343 * collation element [or ordering priority] of the previous
344 * character in the string".</p>
346 * <p>This function updates the iterator's internal pointer to
347 * point to the collation element preceding the one it's currently
348 * pointing to and then returns that element, while next() returns
349 * the current element and then updates the pointer.</p>
351 * @return the previous collation element, or NULLORDER when the start of
352 * the iteration has been reached.
355 public int previous()
// Direction guard: m_direction is -1 (backward), 0 (initial) or 1 (forward);
// backward iteration must not follow forward iteration without a reset.
357 assert m_direction <= 0;
360 if (m_source_.getIndex() <= 0 && m_isForwards_) {
361 // if iterator is new or reset, we can immediately perform backwards
362 // iteration even when the offset is not right.
363 m_source_.setToLimit();
364 updateInternalState();
366 m_isForwards_ = false;
// Serve collation elements still held in the CE buffer first, walking the
// buffer from its current offset back towards index 0.
367 if (m_CEBufferSize_ > 0) {
368 if (m_CEBufferOffset_ > 0) {
369 return m_CEBuffer_[-- m_CEBufferOffset_];
372 m_CEBufferOffset_ = 0;
375 int result = NULLORDER;
// previousChar() signals exhaustion with UCharacterIterator.DONE.
378 int ch_int = previousChar();
379 if (ch_int == UCharacterIterator.DONE) {
// Hiragana tracking is only maintained when the collator has it enabled.
383 if (m_collator_.m_isHiragana4_) {
384 m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
// A character that can end a contraction is resolved through the
// contraction path, unless isBackwardsStart() reports true.
386 if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
387 result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
// Plain lookups in the tailoring trie (Latin-1 linear value or lead value).
391 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
394 result = m_collator_.m_trie_.getLeadValue(ch);
396 if (RuleBasedCollator.isSpecial(result)) {
397 result = previousSpecial(m_collator_, result, ch);
// The tailoring produced CE_NOT_FOUND_: retry as a contraction where that
// applies, otherwise consult the UCA trie if a UCA collator is available.
399 if (result == CE_NOT_FOUND_) {
400 if (!isBackwardsStart()
401 && m_collator_.isContractionEnd(ch)) {
402 result = CE_CONTRACTION_;
405 if(RuleBasedCollator.UCA_ != null) {
406 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
410 if (RuleBasedCollator.isSpecial(result)) {
411 if(RuleBasedCollator.UCA_ != null) {
412 result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
// Keep looping while the result is IGNORABLE and ch lies in 0xAC00..0xD7AF
// (the range that begins at HANGUL_SBASE_).
417 } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
// Still nothing found after the loop: derive an implicit CE for ch.
418 if(result == CE_NOT_FOUND_) {
419 result = previousImplicit(ch);
425 * Return the primary order of the specified collation element,
426 * i.e. the first 16 bits. This value is unsigned.
427 * @param ce the collation element
428 * @return the element's 16 bits primary order.
431 public final static int primaryOrder(int ce)
// Isolate the primary field with CE_PRIMARY_MASK_, then move it down with an
// unsigned shift so a set sign bit does not produce a negative result.
433 return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
434 >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
437 * Return the secondary order of the specified collation element,
438 * i.e. the 16th to 23rd bits, inclusive. This value is unsigned.
439 * @param ce the collation element
440 * @return the element's 8 bits secondary order
443 public final static int secondaryOrder(int ce)
// Isolate the secondary field with CE_SECONDARY_MASK_ and shift it down.
// NOTE(review): this uses a signed shift (>>) where primaryOrder uses >>>;
// the two are equivalent only if CE_SECONDARY_MASK_ clears the sign bit --
// confirm against the mask's definition in RuleBasedCollator.
445 return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
446 >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
450 * Return the tertiary order of the specified collation element, i.e. the last
451 * 8 bits. This value is unsigned.
452 * @param ce the collation element
453 * @return the element's 8 bits tertiary order
456 public final static int tertiaryOrder(int ce)
// The tertiary field needs no shift: masking with CE_TERTIARY_MASK_ alone
// yields the value.
458 return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
462 * <p> Sets the iterator to point to the collation element
463 * corresponding to the character at the specified offset. The
464 * value returned by the next call to next() will be the collation
465 * element corresponding to the characters at offset.</p>
467 * <p>If offset is in the middle of a contracting character
468 * sequence, the iterator is adjusted to the start of the
469 * contracting sequence. This means that getOffset() is not
470 * guaranteed to return the same value set by this method.</p>
472 * <p>If the decomposition mode is on, and offset is in the middle
473 * of a decomposable range of source text, the iterator may not
474 * return a correct result for the next forwards or backwards
475 * iteration. The user must ensure that the offset is not in the
476 * middle of a decomposable range.</p>
478 * @param offset the character offset into the original source string to
479 * set. Note that this is not an offset into the corresponding
480 * sequence of collation elements.
483 public void setOffset(int offset)
485 m_direction = 0; // reset to initial state
// Position the source at the requested offset and inspect the character
// found there.
487 m_source_.setIndex(offset);
488 int ch_int = m_source_.current();
489 char ch = (char)ch_int;
// Only characters the collator flags as unsafe need any adjustment.
490 if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
491 // if it is unsafe we need to check if it is part of a contraction
492 // or a surrogate character
493 if (UTF16.isTrailSurrogate(ch)) {
494 // if it is a surrogate pair we move up one character
495 char prevch = (char)m_source_.previous();
// Unpaired trail surrogate: undo the step back.
496 if (!UTF16.isLeadSurrogate(prevch)) {
497 m_source_.setIndex(offset); // go back to the same index
501 // could be part of a contraction
502 // backup to a safe point and iterate till we pass offset
503 while (m_source_.getIndex() > 0) {
504 if (!m_collator_.isUnsafe(ch)) {
507 ch = (char)m_source_.previous();
509 updateInternalState();
// Walk forward again, remembering the index reached before each step, and
// finally settle on the last remembered index.
511 while (m_source_.getIndex() <= offset) {
512 prevoffset = m_source_.getIndex();
515 m_source_.setIndex(prevoffset);
518 updateInternalState();
519 // direction code to prevent next and previous from returning a
520 // character if we are already at the ends
// From here on "offset" holds the adjusted position, not the argument value.
521 offset = m_source_.getIndex();
522 if (offset == 0/* m_source_.getBeginIndex() */) {
523 // preventing previous() from returning characters from the end of
524 // the string again if we are at the beginning
525 m_isForwards_ = false;
527 else if (offset == m_source_.getLength()) {
528 // preventing next() from returning characters from the start of
529 // the string again if we are at the end
530 m_isForwards_ = true;
535 * <p>Set a new source string for iteration, and reset the offset
536 * to the beginning of the text.</p>
538 * @param source the new source string for iteration.
541 public void setText(String source)
// Reuse the iterator-owned string iterator instead of allocating a new one,
// and make it the active source.
543 m_srcUtilIter_.setText(source);
544 m_source_ = m_srcUtilIter_;
// Clear buffers, CE buffer positions and FCD bookkeeping for the new text.
545 updateInternalState();
547 m_direction = 0; // reset to initial state
551 * <p>Set a new source string iterator for iteration, and reset the
552 * offset to the beginning of the text.
554 * <p>The source iterator's integrity will be preserved since a new copy
555 * will be created for use.</p>
556 * @param source the new source string iterator for iteration.
559 public void setText(UCharacterIterator source)
// Only the text of the supplied iterator is taken (source.getText()); it is
// loaded into the iterator-owned string iterator, which becomes the source.
561 m_srcUtilIter_.setText(source.getText());
562 m_source_ = m_srcUtilIter_;
// Clear buffers, CE buffer positions and FCD bookkeeping for the new text.
563 updateInternalState();
565 m_direction = 0; // reset to initial state
569 * <p>Set a new source string iterator for iteration, and reset the
570 * offset to the beginning of the text.
572 * @param source the new source string iterator for iteration.
575 public void setText(CharacterIterator source)
// Wrap the java.text iterator so it can be driven through the
// UCharacterIterator interface, then rewind the wrapper to its start.
577 m_source_ = new CharacterIteratorWrapper(source);
578 m_source_.setToStart();
// Clear buffers, CE buffer positions and FCD bookkeeping for the new text.
579 updateInternalState();
581 m_direction = 0; // reset to initial state
584 // public miscellaneous methods -----------------------------------------
587 * Tests whether the argument object is equal to this CollationElementIterator.
588 * Iterators are equal if the objects use the same RuleBasedCollator,
589 * the same source text and have the same current position in iteration.
590 * @param that object to test for equality with this
591 * CollationElementIterator
594 public boolean equals(Object that)
// Only another CollationElementIterator can compare equal.
599 if (that instanceof CollationElementIterator) {
600 CollationElementIterator thatceiter
601 = (CollationElementIterator)that;
// The collators are compared first, via RuleBasedCollator.equals.
602 if (!m_collator_.equals(thatceiter.m_collator_)) {
// Then the current source index and the full source text must both match.
// Note that getText() is called on both iterators for this comparison.
606 return m_source_.getIndex() == thatceiter.m_source_.getIndex()
607 && m_source_.getText().equals(
608 thatceiter.m_source_.getText());
614 * Mock implementation of hashCode(). This implementation always returns a constant
615 * value. When Java assertion is enabled, this method triggers an assertion failure.
617 * @deprecated This API is ICU internal only.
619 public int hashCode() {
// Hashing is deliberately unsupported: when assertions are enabled this
// statement fails with the message below.
620 assert false : "hashCode not designed";
624 // package private constructors ------------------------------------------
// Initialisation that depends only on the collator: records the collator and
// allocates the scratch string builders, the CE buffer
// (CE_BUFFER_INIT_SIZE_ ints) and one Backup object.
626 private CollationElementIterator(RuleBasedCollator collator) {
627 m_utilStringBuffer_ = new StringBuilder();
628 m_collator_ = collator;
629 m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
630 m_buffer_ = new StringBuilder();
631 m_utilSpecialBackUp_ = new Backup();
635 * <p>CollationElementIterator constructor. This takes a source
636 * string and a RuleBasedCollator. The iterator will walk through
637 * the source string based on the rules defined by the
638 * collator. If the source string is empty, NULLORDER will be
639 * returned on the first call to next().</p>
641 * @param source the source string.
642 * @param collator the RuleBasedCollator
645 CollationElementIterator(String source, RuleBasedCollator collator)
// One StringUCharacterIterator serves as both the active source and the
// reusable iterator that the setText overloads refill later.
648 m_source_ = m_srcUtilIter_ = new StringUCharacterIterator(source);
649 updateInternalState();
653 * <p>CollationElementIterator constructor. This takes a source
654 * character iterator and a RuleBasedCollator. The iterator will
655 * walk through the source string based on the rules defined by
656 * the collator. If the source string is empty, NULLORDER will be
657 * returned on the first call to next().</p>
659 * @param source the source string iterator.
660 * @param collator the RuleBasedCollator
663 CollationElementIterator(CharacterIterator source,
664 RuleBasedCollator collator)
// Empty reusable string iterator, kept for the setText overloads that
// load text into m_srcUtilIter_.
667 m_srcUtilIter_ = new StringUCharacterIterator();
// The java.text iterator is wrapped so it can be driven as a
// UCharacterIterator.
668 m_source_ = new CharacterIteratorWrapper(source);
669 updateInternalState();
673 * <p>CollationElementIterator constructor. This takes a source
674 * character iterator and a RuleBasedCollator. The iterator will
675 * walk through the source string based on the rules defined by
676 * the collator. If the source string is empty, NULLORDER will be
677 * returned on the first call to next().</p>
679 * @param source the source string iterator.
680 * @param collator the RuleBasedCollator
683 CollationElementIterator(UCharacterIterator source,
684 RuleBasedCollator collator)
// Only the text of the supplied iterator is taken (source.getText()); it is
// loaded into an iterator owned by this object, which becomes the source.
687 m_srcUtilIter_ = new StringUCharacterIterator();
688 m_srcUtilIter_.setText(source.getText());
689 m_source_ = m_srcUtilIter_;
690 updateInternalState();
693 // package private data members -----------------------------------------
696 * true if current codepoint was Hiragana
698 boolean m_isCodePointHiragana_;
700 * Position in the original string that starts with a non-FCD sequence
704 * This is the CE from CEs buffer that should be returned.
705 * Initial value is 0.
706 * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
707 * backwards will end with m_CEBufferOffset_ == 0.
708 * The next/previous after we reach the end/beginning of the m_CEBuffer_
709 * will cause this value to be reset to 0.
711 int m_CEBufferOffset_;
714 * This is the position to which we have stored processed CEs.
715 * Initial value is 0.
716 * The next/previous after we reach the end/beginning of the m_CEBuffer_
717 * will cause this value to be reset to 0.
720 static final int CE_NOT_FOUND_ = 0xF0000000;
721 static final int CE_EXPANSION_TAG_ = 1;
722 static final int CE_CONTRACTION_TAG_ = 2;
724 * Collate Digits As Numbers (CODAN) implementation
726 static final int CE_DIGIT_TAG_ = 13;
728 // package private methods ----------------------------------------------
731 * Sets the collator used.
732 * Internal use, all data members will be reset to the default values
733 * @param collator to set
735 void setCollator(RuleBasedCollator collator)
// Swap the collator, then discard all buffered iteration state so nothing
// computed under the previous collator is reused.
737 m_collator_ = collator;
738 updateInternalState();
742 * <p>Sets the iterator to point to the collation element corresponding to
743 * the specified character (the parameter is a CHARACTER offset in the
744 * original string, not an offset into its corresponding sequence of
745 * collation elements). The value returned by the next call to next()
746 * will be the collation element corresponding to the specified position
747 * in the text. Unlike the public method setOffset(int), this method does
748 * not try to readjust the offset to the start of a contracting sequence.
749 * getOffset() is guaranteed to return the same value as was passed to a
750 * preceding call to setOffset().</p>
751 * @param offset new character offset into the original text to set.
753 void setExactOffset(int offset)
// Moves the source index straight to offset with no contraction or
// surrogate adjustment, then clears the buffered iteration state.
755 m_source_.setIndex(offset);
756 updateInternalState();
758 m_direction = 0; // reset to initial state
762 * Checks if iterator is in the buffer zone
763 * @return true if iterator is in buffer zone, false otherwise
767 return m_bufferOffset_ > 0;
772 * <p>Sets the iterator to point to the collation element corresponding to
773 * the specified character (the parameter is a CHARACTER offset in the
774 * original string, not an offset into its corresponding sequence of
775 * collation elements). The value returned by the next call to next()
776 * will be the collation element corresponding to the specified position
777 * in the text. Unlike the public method setOffset(int), this method does
778 * not try to readjust the offset to the start of a contracting sequence.
779 * getOffset() is guaranteed to return the same value as was passed to a
780 * preceding call to setOffset().</p>
782 * @param source the new source string iterator for iteration.
783 * @param offset to the source
785 void setText(UCharacterIterator source, int offset)
// Load the text of the supplied iterator into the iterator-owned string
// iterator and make it the active source.
787 m_srcUtilIter_.setText(source.getText());
788 m_source_ = m_srcUtilIter_;
// Position directly at offset (no adjustment), then clear buffered state.
789 m_source_.setIndex(offset);
790 updateInternalState();
792 m_direction = 0; // reset to initial state
795 // private inner class --------------------------------------------------
800 private static final class Backup
802 // protected data members -------------------------------------------
805 * Backup non FCD sequence limit
807 protected int m_FCDLimit_;
809 * Backup non FCD sequence start
811 protected int m_FCDStart_;
813 * Backup if previous Codepoint is Hiragana quaternary
815 protected boolean m_isCodePointHiragana_;
817 * Backup buffer position
819 protected int m_bufferOffset_;
821 * Backup source iterator offset
823 protected int m_offset_;
825 * Backup buffer contents
827 protected StringBuffer m_buffer_;
829 // protected constructor --------------------------------------------
836 m_buffer_ = new StringBuffer();
839 // end inner class ------------------------------------------------------
842 * Direction of travel
844 private boolean m_isForwards_;
846 * Source string iterator
848 private UCharacterIterator m_source_;
850 * This is position to the m_buffer_, -1 if iterator is not in m_buffer_
// NOTE(review): getOffset() treats any value other than -1 as "in buffer",
// while another check in this file tests `m_bufferOffset_ > 0`; confirm
// whether offset 0 is meant to count as being inside the buffer.
852 private int m_bufferOffset_;
854 * Buffer for temporary storage of normalized characters, discontiguous
855 * characters and Thai characters
857 private StringBuilder m_buffer_;
859 * Position in the original string to continue forward FCD check from.
861 private int m_FCDLimit_;
863 * The collator this iterator is based on
865 private RuleBasedCollator m_collator_;
867 * true if Hiragana quaternary is on
869 //private boolean m_isHiragana4_;
873 private int m_CEBuffer_[];
875 * In reality we should not have to deal with expansion sequences longer
876 * than 16. However this value can be changed if a bigger buffer is needed.
877 * Note, if the size is changed to too small a number, BIG trouble.
878 * Reasonable small value is around 10, if there's no Arabic or other
879 * funky collations that have long expansion sequence. This is the longest
880 * expansion sequence this can handle without bombing out.
882 private static final int CE_BUFFER_INIT_SIZE_ = 512;
884 * Backup storage for special processing inner cases
886 private Backup m_utilSpecialBackUp_;
888 * Backup storage in special processing entry state
890 private Backup m_utilSpecialEntryBackUp_;
892 * Backup storage in special processing discontiguous state
894 private Backup m_utilSpecialDiscontiguousBackUp_;
898 private StringUCharacterIterator m_srcUtilIter_;
899 private StringBuilder m_utilStringBuffer_;
900 private StringBuilder m_utilSkippedBuffer_;
901 private CollationElementIterator m_utilColEIter_;
902 private static final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
903 private StringBuilder m_unnormalized_;
904 private Normalizer2Impl.ReorderingBuffer m_n2Buffer_;
906 * The first non-zero combining class character
908 private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
910 * One character before the first character with leading non-zero combining
913 private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
915 * Mask for the last byte
917 private static final int LAST_BYTE_MASK_ = 0xFF;
919 * Shift value for the second last byte
921 private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
923 // special ce values and tags -------------------------------------------
925 // private static final int CE_EXPANSION_ = 0xF1000000;
926 private static final int CE_CONTRACTION_ = 0xF2000000;
928 * Indicates the last ce has been consumed. Compare with NULLORDER.
929 * NULLORDER is returned if error occurs.
931 /* private static final int CE_NO_MORE_CES_ = 0x00010101;
932 private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
933 private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
934 private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
936 private static final int CE_NOT_FOUND_TAG_ = 0;
938 * Charset processing, not yet implemented
940 private static final int CE_CHARSET_TAG_ = 4;
944 private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
948 private static final int CE_LEAD_SURROGATE_TAG_ = 7;
952 private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
954 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
956 private static final int CE_CJK_IMPLICIT_TAG_ = 9;
957 private static final int CE_IMPLICIT_TAG_ = 10;
958 static final int CE_SPEC_PROC_TAG_ = 11;
960 * This is a 3 byte primary with starting secondaries and tertiaries.
961 * It fits in a single 32 bit CE and is used instead of expansion to save
962 * space without affecting the performance (hopefully).
964 private static final int CE_LONG_PRIMARY_TAG_ = 12;
966 // private static final int CE_CE_TAGS_COUNT = 14;
967 private static final int CE_BYTE_COMMON_ = 0x05;
969 // end special ce values and tags ---------------------------------------
971 private static final int HANGUL_SBASE_ = 0xAC00;
972 private static final int HANGUL_LBASE_ = 0x1100;
973 private static final int HANGUL_VBASE_ = 0x1161;
974 private static final int HANGUL_TBASE_ = 0x11A7;
975 private static final int HANGUL_VCOUNT_ = 21;
976 private static final int HANGUL_TCOUNT_ = 28;
978 // CJK stuff ------------------------------------------------------------
980 /* private static final int CJK_BASE_ = 0x4E00;
981 private static final int CJK_LIMIT_ = 0x9FFF+1;
982 private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
983 private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
984 private static final int CJK_A_BASE_ = 0x3400;
985 private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
986 private static final int CJK_B_BASE_ = 0x20000;
987 private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
988 private static final int NON_CJK_OFFSET_ = 0x110000;
990 private static final boolean DEBUG = ICUDebug.enabled("collator");
992 // Field tracking the current direction. This field was added
993 // just for making sure that reset()/setOffset()/setText() is called
994 // before switching the iterator direction.
995 // We used to allow changing direction without calling reset()/setOffset()
996 // setText() in ICU4J, but the API specification was updated to match the
997 // ICU4C's specification. The current implementation seems to handle
998 // direction change (or not), but it will be completely replaced with
999 // a new implementation not allowing this. Until then, we use this field
1000 // to trigger assertion and make sure our implementation is not depending on
1001 // the assumption. See ticket#9104.
1002 private byte m_direction = 0; // -1: backward, 0: initial state, 1: forward
1004 // private methods ------------------------------------------------------
1007 * Reset the iterator internally
1009 private void updateInternalState()
// Forget Hiragana tracking and empty the character buffer; -1 marks the
// iterator as not being inside m_buffer_.
1011 m_isCodePointHiragana_ = false;
1012 m_buffer_.setLength(0);
1013 m_bufferOffset_ = -1;
// Discard any collation elements still queued in the CE buffer.
1014 m_CEBufferOffset_ = 0;
1015 m_CEBufferSize_ = 0;
// The FCD start is set to the end of the source text.
1017 m_FCDStart_ = m_source_.getLength();
1018 //m_isHiragana4_ = m_collator_.m_isHiragana4_;
// Forward is the default direction after any reset.
1019 m_isForwards_ = true;
1023 * Backup the current internal state
1024 * @param backup object to store the data
1026 private void backupInternalState(Backup backup)
// Snapshot the source position, the FCD bounds, the Hiragana flag and the
// buffer position into the caller-supplied Backup object.
1028 backup.m_offset_ = m_source_.getIndex();
1029 backup.m_FCDLimit_ = m_FCDLimit_;
1030 backup.m_FCDStart_ = m_FCDStart_;
1031 backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
1032 backup.m_bufferOffset_ = m_bufferOffset_;
// The buffer contents are copied only while the iterator is inside the
// buffer (offset >= 0); otherwise the backup buffer is left empty.
1033 backup.m_buffer_.setLength(0);
1034 if (m_bufferOffset_ >= 0) {
1035 backup.m_buffer_.append(m_buffer_);
1040 * Update the iterator internally with backed-up state
1041 * @param backup object that stored the data
1043 private void updateInternalState(Backup backup)
1045 m_source_.setIndex(backup.m_offset_);
1046 m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
1047 m_bufferOffset_ = backup.m_bufferOffset_;
1048 m_FCDLimit_ = backup.m_FCDLimit_;
1049 m_FCDStart_ = backup.m_FCDStart_;
1050 m_buffer_.setLength(0);
1051 if (m_bufferOffset_ >= 0) {
1052 m_buffer_.append(backup.m_buffer_);
1057 * A fast combining class retrieval system.
1058 * @param ch UTF16 character
1059 * @return combining class of ch
1061 private int getCombiningClass(int ch)
// The normalization data is consulted in two cases: ch is at or above the
// fast limit AND the collator reports it as unsafe, OR ch is a supplementary
// code point (> 0xFFFF). Since && binds tighter than ||, the supplementary
// test is independent of the first two.
1063 if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
1064 m_collator_.isUnsafe((char)ch) || ch > 0xFFFF
// Combining class is derived from the code point's norm16 value.
1066 return m_nfcImpl_.getCC(m_nfcImpl_.getNorm16(ch));
1072 * <p>Incremental normalization, this is an essential optimization.
1073 * Assuming FCD checks have been done, normalize the non-FCD characters into
1075 * Source offsets points to the current processing character.
1078 private void normalize()
// The scratch builder and the reordering buffer (constructed over m_buffer_)
// are created on first use.
1080 if (m_unnormalized_ == null) {
1081 m_unnormalized_ = new StringBuilder();
1082 m_n2Buffer_ = new Normalizer2Impl.ReorderingBuffer(m_nfcImpl_, m_buffer_, 10);
// Clear both so they can be reused for this segment.
1084 m_unnormalized_.setLength(0);
1085 m_n2Buffer_.remove();
// Copy the code units of the segment [m_FCDStart_, m_FCDLimit_) out of the
// source; this leaves the source index at m_FCDLimit_.
1087 int size = m_FCDLimit_ - m_FCDStart_;
1088 m_source_.setIndex(m_FCDStart_);
1089 for (int i = 0; i < size; i ++) {
1090 m_unnormalized_.append((char)m_source_.next());
// Decompose the copied segment into the reordering buffer.
1092 m_nfcImpl_.decomposeShort(m_unnormalized_, 0, size, m_n2Buffer_);
1096 * <p>Incremental FCD check and normalization. Gets the next base character
1097 * position and determines if the in-between characters need normalization.
1099 * <p>When entering, the state is known to be this:
1101 * <li>We are working on source string, not the buffer.
1102 * <li>The leading combining class from the current character is 0 or the
1103 * trailing combining class of the previous char was zero.
1105 * Incoming source offsets points to the current processing character.
1106 * Return source offsets points to the current processing character.
1108 * @param ch current character (lead unit)
1109 * @param offset offset of ch +1
1110 * @return true if FCDCheck passes, false otherwise
1112 private boolean FCDCheck(int ch, int offset)
// Determines the segment [m_FCDStart_, m_FCDLimit_) that starts at ch and
// leaves the source index one code unit past m_FCDStart_.
1114 boolean result = true;
1116 // Get the trailing combining class of the current character.
1117 // If it's zero, we are OK.
// offset is one past ch, so the segment starts at offset - 1.
1118 m_FCDStart_ = offset - 1;
1119 m_source_.setIndex(offset);
// Look up the FCD value of ch; a dedicated lookup exists for low code points.
1123 fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
1124 } else if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
// A lead surrogate needs its trail unit to form the code point to look up.
1125 if (Character.isHighSurrogate((char)ch)) {
1126 int c2 = m_source_.next();
1128 fcd = 0; // end of input
1129 } else if (Character.isLowSurrogate((char)c2)) {
1130 fcd = m_nfcImpl_.getFCD16FromNormData(Character.toCodePoint((char)ch, (char)c2));
// Not a trail surrogate: step back so the unit just read is not consumed.
1132 m_source_.moveIndex(-1);
1136 fcd = m_nfcImpl_.getFCD16FromNormData(ch);
// Low byte of the FCD value: trailing combining class.
1142 int prevTrailCC = fcd & LAST_BYTE_MASK_;
1144 if (prevTrailCC == 0) {
1145 offset = m_source_.getIndex();
1147 // The current char has a non-zero trailing CC. Scan forward until
1148 // we find a char with a leading cc of zero.
1150 ch = m_source_.nextCodePoint();
1152 offset = m_source_.getIndex();
1156 fcd = m_nfcImpl_.getFCD16(ch);
// High byte of the FCD value: leading combining class.
1157 int leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1159 // this is a base character, we stop the FCD checks
// The limit excludes the base character that was just read.
1160 offset = m_source_.getIndex() - Character.charCount(ch);
// Combining classes out of order between neighbours.
// NOTE(review): presumably this is where the check is marked as failed;
// the body of this branch is not visible here -- confirm.
1164 if (leadCC < prevTrailCC) {
1168 prevTrailCC = fcd & LAST_BYTE_MASK_;
1171 m_FCDLimit_ = offset;
1172 m_source_.setIndex(m_FCDStart_ + 1);
1177 * <p>Method tries to fetch the next character that is in fcd form.</p>
1178 * <p>Normalization is done if required.</p>
1179 * <p>Offsets are returned at the next character.</p>
1180 * @return next fcd character
1182 private int nextChar()
// Returns the next code unit, taken either from the source text or from the
// normalization buffer, running the FCD check (and normalization) when the
// fast-path tests below cannot rule it out.
1186 // loop handles the next character whether it is in the buffer or not.
1187 if (m_bufferOffset_ < 0) {
1188 // we're working on the source and not normalizing. fast path.
1189 // note Thai pre-vowel reordering uses buffer too
1190 result = m_source_.next();
1193 // we are in the buffer, buffer offset will never be 0 here
1194 if (m_bufferOffset_ >= m_buffer_.length()) {
1195 // Null marked end of buffer, revert to the source string and
1196 // loop back to top to try again to get a character.
// Reading resumes in the source at the end of the normalized segment.
1197 m_source_.setIndex(m_FCDLimit_);
1198 m_bufferOffset_ = -1;
1199 m_buffer_.setLength(0);
1202 return m_buffer_.charAt(m_bufferOffset_ ++);
// Index just past the code unit that was read.
1204 int startoffset = m_source_.getIndex();
1205 if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
1206 // Fast fcd safe path. trail combining class == 0.
1207 || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1208 || m_bufferOffset_ >= 0 || m_FCDLimit_ >= startoffset) {
1209 // skip the fcd checks
1213 if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1214 // We need to peek at the next character in order to tell if we are
// current() reads the next code unit without advancing the index.
1216 int next = m_source_.current();
1217 if (next == UCharacterIterator.DONE
1218 || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1219 return result; // end of source string and if next character
1220 // starts with a base character is always fcd.
1224 // Need a more complete FCD check and possible normalization.
1225 if (!FCDCheck(result, startoffset)) {
// After a failed check the result comes from the start of m_buffer_, and
// reading continues from buffer position 1.
1227 result = m_buffer_.charAt(0);
1228 m_bufferOffset_ = 1;
1234 * <p>Incremental normalization, this is an essential optimization.
1235 * Assuming FCD checks have been done, normalize the non-FCD characters into
1237 * Source offsets points to the current processing character.</p>
1239 private void normalizeBackwards()
// For backwards iteration the buffer position starts at the END of the
// buffer; previousChar() then walks it down towards 0.
1242 m_bufferOffset_ = m_buffer_.length();
1246 * <p>Incremental backwards FCD check and normalization. Gets the previous
1247 * base character position and determines if the in-between characters
1248 * need normalization.
1250 * <p>When entering, the state is known to be this:
1252 * <li>We are working on source string, not the buffer.
1253 * <li>The trailing combining class from the current character is 0 or the
1254 * leading combining class of the next char was zero.
1256 * Input source offsets points to the previous character.
1257 * Return source offsets points to the current processing character.
1259 * @param ch current character
1260 * @param offset current character offset
1261 * @return true if FCDCheck passes, false otherwise
1263 private boolean FCDCheckBackwards(int ch, int offset)
// Mirror image of FCDCheck(): determines the segment
// [m_FCDStart_, m_FCDLimit_) that ends with ch and leaves the source index
// at m_FCDLimit_.
// offset is the index of ch, so the segment ends one code unit after it.
1266 m_FCDLimit_ = offset + 1;
1267 m_source_.setIndex(offset);
// Look up the FCD value of ch; a dedicated lookup exists for low code points.
1269 fcd = m_nfcImpl_.getFCD16FromBelow180(ch);
1270 } else if (!Character.isLowSurrogate((char)ch)) {
1271 if (m_nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
1272 fcd = m_nfcImpl_.getFCD16FromNormData(ch);
// ch is a trail surrogate: look at the unit before it for the lead.
1277 int c2 = m_source_.previous();
1279 fcd = 0; // start of input
1280 } else if (Character.isHighSurrogate((char)c2)) {
1281 ch = Character.toCodePoint((char)c2, (char)ch);
1282 fcd = m_nfcImpl_.getFCD16FromNormData(ch);
// Not a lead surrogate: step forward again so the unit is not consumed.
1285 m_source_.moveIndex(1);
1290 // Scan backward until we find a char with a leading cc of zero.
1291 boolean result = true;
// High byte of the FCD value: leading combining class.
1295 leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
// Stop on a zero leading class or when the start of the source is reached
// (previousCodePoint() returning a negative value).
1296 if (leadCC == 0 || (ch = m_source_.previousCodePoint()) < 0) {
1297 offset = m_source_.getIndex();
1300 fcd = m_nfcImpl_.getFCD16(ch);
// Low byte of the FCD value: trailing combining class of the earlier char.
1301 int prevTrailCC = fcd & LAST_BYTE_MASK_;
// NOTE(review): presumably this is where the check is marked as failed;
// the body of this branch is not visible here -- confirm.
1302 if (leadCC < prevTrailCC) {
1304 } else if (fcd == 0) {
// The start excludes the character that was just read.
1305 offset = m_source_.getIndex() + Character.charCount(ch);
1311 // storing character with 0 lead fcd or the 1st accent with a base
1312 // character before it
1313 m_FCDStart_ = offset;
1314 m_source_.setIndex(m_FCDLimit_);
1319 * <p>Method tries to fetch the previous character that is in fcd form.</p>
1320 * <p>Normalization is done if required.</p>
1321 * <p>Offsets are returned at the current character.</p>
1322 * @return previous fcd character
1324 private int previousChar()
// Returns the previous code unit, taken either from the normalization
// buffer or from the source text, running the backwards FCD check (and
// normalization) when the fast-path tests below cannot rule it out.
1326 if (m_bufferOffset_ >= 0) {
// Still inside the buffer: return the unit at the current buffer position.
1328 if (m_bufferOffset_ >= 0) {
1329 return m_buffer_.charAt(m_bufferOffset_);
1332 // At the start of buffer, route back to string.
1333 m_buffer_.setLength(0);
// The normalized segment began at the very start of the source: nothing
// precedes it, so iteration is done.
1334 if (m_FCDStart_ == 0) {
1336 m_source_.setIndex(0);
1337 return UCharacterIterator.DONE;
// Otherwise continue in the source just before the normalized segment.
1340 m_FCDLimit_ = m_FCDStart_;
1341 m_source_.setIndex(m_FCDStart_);
1342 return previousChar();
1346 int result = m_source_.previous();
// Index of the code unit that was just read.
1347 int startoffset = m_source_.getIndex();
// Fast path: no FCD check needed.
1348 if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
1349 || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1350 || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
// Peek at the code unit before result.
1353 int ch = m_source_.previous();
1354 if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1355 // if previous character is FCD
1359 // Need a more complete FCD check and possible normalization.
1360 if (!FCDCheckBackwards(result, startoffset)) {
1361 normalizeBackwards();
1363 result = m_buffer_.charAt(m_bufferOffset_);
1366 // fcd checks always reset m_source_ to the limit of the FCD
1367 m_source_.setIndex(startoffset);
1373 * Determines if it is at the start of source iteration
1374 * @return true if iterator at the start, false otherwise
1376 private final boolean isBackwardsStart()
1378 return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
1379 || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
1383 * Checks if iterator is at the end of its source string.
1384 * @return true if it is at the end, false otherwise
1386 private final boolean isEnd()
// Two cases: reading from the normalization buffer, or from the source.
1388 if (m_bufferOffset_ >= 0) {
// Characters remain in the buffer.
1389 if (m_bufferOffset_ != m_buffer_.length()) {
1393 // at end of buffer. check if fcd is at the end
1394 return m_FCDLimit_ == m_source_.getLength();
// Reading from the source: at the end when the index equals the length.
1397 return m_source_.getLength() == m_source_.getIndex();
1401 * <p>Special CE management for surrogates</p>
1402 * <p>Lead surrogate is encountered. CE to be retrieved by using the
1403 * following code unit. If the next code unit is a trail surrogate, both
1404 * units will be combined to retrieve the CE,
1405 * otherwise we treat it like an unassigned code point.</p>
1406 * @param collator collator to use
1407 * @param ce current CE
1408 * @param trail character
1409 * @return next CE for the surrogate characters
1411 private final int nextSurrogate(RuleBasedCollator collator, int ce,
// Not a trail surrogate: restore the state saved in m_utilSpecialBackUp_
// and report "not found".
1414 if (!UTF16.isTrailSurrogate(trail)) {
1415 updateInternalState(m_utilSpecialBackUp_);
1416 return CE_NOT_FOUND_;
1418 // TODO: CE contain the data from the previous CE + the mask.
1419 // It should at least be unmasked
// Look the pair up in the collator's trie using the lead CE and the trail.
1420 int result = collator.m_trie_.getTrailValue(ce, trail);
// No value for the pair: restore the saved state as well.
1421 if (result == CE_NOT_FOUND_) {
1422 updateInternalState(m_utilSpecialBackUp_);
1428 * Gets the CE expansion offset
1429 * @param collator current collator
1430 * @param ce ce to test
1431 * @return expansion offset
1433 private int getExpansionOffset(RuleBasedCollator collator, int ce)
1435 return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
1440 * Gets the contraction ce offset
1441 * @param collator current collator
1442 * @param ce current ce
1443 * @return contraction offset
1445 private int getContractionOffset(RuleBasedCollator collator, int ce)
1447 return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
1451 * Checks if CE is a special tag CE
1452 * @param ce to check
1453 * @return true if CE is a special tag CE, false otherwise
1455 private boolean isSpecialPrefixTag(int ce)
1457 return RuleBasedCollator.isSpecial(ce) &&
1458 RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
1462 * <p>Special processing getting a CE that is preceded by a certain
1464 * <p>Used for optimizing Japanese length and iteration marks. When a
1465 * special processing tag is encountered, iterate backwards to see if
1466 * there's a match.</p>
1467 * <p>Contraction tables are used, prefix data is stored backwards in the
1469 * @param collator collator to use
1470 * @param ce current ce
1471 * @param entrybackup entry backup iterator status
1472 * @return next collation element
1474 private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
// Save the current position so it can be restored after the backwards scan,
// then jump back to the state captured when special processing was entered.
1477 backupInternalState(m_utilSpecialBackUp_);
1478 updateInternalState(entrybackup);
1480 // We want to look at the character where we entered
1483 // This loop will run once per source string character, for as
1484 // long as we are matching a potential contraction sequence
1485 // First we position ourselves at the beginning of contraction
1487 int entryoffset = getContractionOffset(collator, ce);
1488 int offset = entryoffset;
// Nothing precedes the current position: use the CE stored at the entry slot.
1489 if (isBackwardsStart()) {
1490 ce = collator.m_contractionCE_[offset];
1493 char previous = (char)previousChar();
1494 while (previous > collator.m_contractionIndex_[offset]) {
1495 // contraction characters are ordered, skip smaller characters
1499 if (previous == collator.m_contractionIndex_[offset]) {
1500 // Found the source string char in the table.
1501 // Pick up the corresponding CE from the table.
1502 ce = collator.m_contractionCE_[offset];
1505 // Source string char was not in the table, prefix not found
1506 ce = collator.m_contractionCE_[entryoffset];
1509 if (!isSpecialPrefixTag(ce)) {
1510 // The source string char was in the contraction table, and
1511 // the corresponding CE is not a prefix CE. We found the
1512 // prefix, break out of loop, this CE will end up being
1513 // returned. This is the normal way out of prefix handling
1514 // when the source actually contained the prefix.
1518 if (ce != CE_NOT_FOUND_) {
1519 // we found something and we can merrily continue
1520 updateInternalState(m_utilSpecialBackUp_);
1522 else { // prefix search was a failure, we have to backup all the way to
1524 updateInternalState(entrybackup);
1530 * Checks if the ce is a contraction tag
1531 * @param ce ce to check
1532 * @return true if ce is a contraction tag, false otherwise
1534 private boolean isContractionTag(int ce)
1536 return RuleBasedCollator.isSpecial(ce) &&
1537 RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
1541 * Method to copy skipped characters into the buffer and sets the fcd
1542 * position. To ensure that the skipped characters are considered later,
1543 * we need to place it in the appropriate position in the buffer and
1544 * reassign the source index. simple case if index reside in string,
1545 * simply copy to buffer and fcdposition = pos, pos = start of buffer.
1546 * if pos in normalization buffer, we'll insert the copy infront of pos
1547 * and point pos to the start of the buffer. why am i doing these copies?
1548 * well, so that the whole chunk of codes in the getNextCE,
1549 * ucol_prv_getSpecialCE does not require any changes, which will be
1551 * @param skipped character buffer
1553 private void setDiscontiguous(StringBuilder skipped)
// Already reading from the buffer: the consumed prefix [0, m_bufferOffset_)
// is replaced by the skipped characters, so they sit in front of whatever is
// still unread.
1555 if (m_bufferOffset_ >= 0) {
1556 m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
// NOTE(review): presumably the else branch (reading from the source) --
// record where to resume in the source and load the buffer with the skipped
// text only.
1559 m_FCDLimit_ = m_source_.getIndex();
1560 m_buffer_.setLength(0);
1561 m_buffer_.append(skipped.toString());
// Reading restarts at the front of the buffer.
1564 m_bufferOffset_ = 0;
1568 * Returns the current character for forward iteration
1569 * @return current character
1571 private int currentChar()
// Reading from the source: step back one code point and read it again, which
// returns that code point and leaves the index where it was.
1573 if (m_bufferOffset_ < 0) {
1574 m_source_.previousCodePoint();
1575 return m_source_.nextCodePoint();
1578 // m_bufferOffset_ is never 0 in normal circumstances except after a
1579 // discontiguous contraction since it is always returned and moved
1580 // by 1 when we do nextChar()
// Reading from the buffer: the current character is the one just before the
// buffer position.
1581 return UTF16.charAt(m_buffer_, m_bufferOffset_ - 1);
1585 * Method to get the discontiguous collation element within the source.
1586 * Note this function will set the position to the appropriate places.
1587 * Passed in character offset points to the second combining character
1588 * after the start character.
1589 * @param collator current collator used
1590 * @param entryoffset index to the start character in the contraction table
1591 * @return discontiguous collation element offset
1593 private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
// Tries to match a contraction whose characters are separated by other
// combining marks. Unmatched characters are collected in
// m_utilSkippedBuffer_ and handed to setDiscontiguous() so that they are
// processed after the contraction.
1595 int offset = entryoffset;
// Set once a matched CE turns out to be a further contraction (see below).
1596 boolean multicontraction = false;
1597 // since it will be stuffed into this iterator and ran over again
// The skipped-character buffer is created on first use and cleared otherwise.
1598 if (m_utilSkippedBuffer_ == null) {
1599 m_utilSkippedBuffer_ = new StringBuilder();
1602 m_utilSkippedBuffer_.setLength(0);
// The current character is the first one to be skipped.
1604 int ch = currentChar();
1605 m_utilSkippedBuffer_.appendCodePoint(ch);
1607 int cc = getCombiningClass(ch);
1608 // accent after the first character
1609 if (m_utilSpecialDiscontiguousBackUp_ == null) {
1610 m_utilSpecialDiscontiguousBackUp_ = new Backup();
1612 backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1613 boolean prevWasLead = false;
1615 // We read code units for contraction table matching
1616 // but have to get combining classes for code points
1617 // to figure out where to stop with discontiguous contraction.
1618 int ch_int = nextChar();
1619 char nextch = (char)ch_int;
1620 if (UTF16.isSurrogate(nextch)) {
1622 // trail surrogate of surrogate pair, keep previous and current cc
1623 prevWasLead = false;
1626 cc = 0; // default cc for an unpaired surrogate
1627 prevWasLead = false;
// For a lead surrogate, peek at the following unit to get the combining
// class of the whole code point.
1628 if (Character.isHighSurrogate(nextch)) {
1629 int trail = nextChar();
1630 if (Character.isLowSurrogate((char)trail)) {
1631 cc = getCombiningClass(Character.toCodePoint(nextch, (char)trail));
1635 previousChar(); // restore index after having peeked at the next code unit
1641 cc = getCombiningClass(ch_int);
1642 prevWasLead = false;
// End of input (negative ch_int) or a character with combining class 0 ends
// the discontiguous search.
1644 if (ch_int < 0 || cc == 0) {
1645 // if there are no more accents to move around
1646 // we don't have to shift previousChar, since we are resetting
1648 if (multicontraction) {
1650 previousChar(); // backtrack
1652 setDiscontiguous(m_utilSkippedBuffer_);
1653 return collator.m_contractionCE_[offset];
1658 offset ++; // skip the combining class offset
// Table entries are ordered, so advance until an entry >= nextch is found or
// the table is exhausted.
1659 while ((offset < collator.m_contractionIndex_.length) &&
1660 (nextch > collator.m_contractionIndex_[offset])) {
1664 int ce = CE_NOT_FOUND_;
1665 if ( offset >= collator.m_contractionIndex_.length) {
// NOTE(review): prevCC is declared on a line not visible here; it is
// presumably the combining class of the previously examined character --
// confirm.
1668 if (nextch != collator.m_contractionIndex_[offset] || cc == prevCC) {
1669 // unmatched or blocked character
1670 if ( (m_utilSkippedBuffer_.length()!= 1) ||
1671 ((m_utilSkippedBuffer_.charAt(0)!= nextch) &&
1672 (m_bufferOffset_<0) )) { // avoid push to skipped buffer twice
1673 m_utilSkippedBuffer_.append(nextch);
1675 offset = entryoffset; // Restore the offset before checking next character.
1679 ce = collator.m_contractionCE_[offset];
1682 if (ce == CE_NOT_FOUND_) {
1685 else if (isContractionTag(ce)) {
1686 // this is a multi-contraction
1687 offset = getContractionOffset(collator, ce);
// Remember this position only if the nested contraction has a usable CE in
// its first slot.
1688 if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
1689 multicontraction = true;
1690 backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1694 setDiscontiguous(m_utilSkippedBuffer_);
// No match: restore the saved state and return the entry's own CE.
1699 updateInternalState(m_utilSpecialDiscontiguousBackUp_);
1700 // backup is one forward of the base character, we need to move back
1703 return collator.m_contractionCE_[entryoffset];
1707 * Gets the next contraction ce
1708 * @param collator collator to use
1709 * @param ce current ce
1710 * @return ce of the next contraction
1712 private int nextContraction(RuleBasedCollator collator, int ce)
// Matches the longest contraction starting at the current position, falling
// back to discontiguous matching where the table allows it.
// Save the position so a failed match can be undone.
1714 backupInternalState(m_utilSpecialBackUp_);
// CE stored in the first slot of this contraction's table entry.
1715 int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
1717 int entryoffset = getContractionOffset(collator, ce);
1718 int offset = entryoffset;
1721 ce = collator.m_contractionCE_[offset];
1722 if (ce == CE_NOT_FOUND_) {
1723 // back up the source over all the chars we scanned going
1724 // into this contraction.
1726 updateInternalState(m_utilSpecialBackUp_);
// The first index entry packs two values: the low byte and the high byte.
1731 // get the discontiguous maximum combining class
1732 int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
1733 // checks if all characters have the same combining class
1734 byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
1735 char ch = (char)nextChar();
1737 while (ch > collator.m_contractionIndex_[offset]) {
1738 // contraction characters are ordered, skip all smaller
1742 if (ch == collator.m_contractionIndex_[offset]) {
1743 // Found the source string char in the contraction table.
1744 // Pick up the corresponding CE from the table.
1745 ce = collator.m_contractionCE_[offset];
1748 // Source string char was not in contraction table.
1749 // Unless it is a discontiguous contraction, we are done
1751 // ticket 8484 - porting changes from C for 6101
1752 // We test whether the next two char are surrogate pairs.
1753 // This test is done if the iterator is not in the end.
1754 // If there is no surrogate pair, the iterator
1755 // goes back one if needed.
1756 if(UTF16.isLeadSurrogate(ch) && !isEnd()) {
1757 char surrNextChar = (char)nextChar();
1758 if (UTF16.isTrailSurrogate(surrNextChar)) {
1759 miss = UCharacterProperty.getRawSupplementary(ch, surrNextChar);
// Conditions under which the mismatching character cannot take part in a
// discontiguous contraction.
1765 if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
1766 || sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
1768 // Contraction can not be discontiguous, back up by one
1773 ce = collator.m_contractionCE_[entryoffset];
1776 // Contraction is possibly discontiguous.
1777 // find the next character if ch is not a base character
1778 int ch_int = nextChar();
1779 if (ch_int != UCharacterIterator.DONE) {
1782 char nextch = (char)ch_int;
1783 if (getCombiningClass(nextch) == 0) {
1788 // base character not part of discontiguous contraction
1789 ce = collator.m_contractionCE_[entryoffset];
1792 ce = nextDiscontiguous(collator, entryoffset);
1797 if (ce == CE_NOT_FOUND_) {
1798 // source did not match the contraction, revert back original
1799 updateInternalState(m_utilSpecialBackUp_);
1804 // source was a contraction
1805 if (!isContractionTag(ce)) {
1809 // continue looping to check for the remaining contraction.
1810 if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
1811 // there are further contractions to be performed, so we store
1812 // the so-far completed ce, so that if we fail in the next
1813 // round we just return this one.
1814 entryce = collator.m_contractionCE_[entryoffset];
1815 backupInternalState(m_utilSpecialBackUp_);
// The saved position is moved back by one, in the buffer if the backup was
// taken while reading from it.
// NOTE(review): presumably the source-offset decrement below is the else
// branch -- confirm.
1816 if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
1817 m_utilSpecialBackUp_.m_bufferOffset_ --;
1820 m_utilSpecialBackUp_.m_offset_ --;
1828 * Gets the next ce for long primaries, stuffs the rest of the collation
1829 * elements into the ce buffer
1830 * @param ce current ce
1833 private int nextLongPrimary(int ce)
// Expands one "long primary" CE into two buffered CEs.
// Second CE: the low byte of ce moved into the top byte, flagged as a
// continuation.
1835 m_CEBuffer_[1] = ((ce & 0xFF) << 24)
1836 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
// Element 0 is returned right away, so the read position starts at 1 of 2.
1837 m_CEBufferOffset_ = 1;
1838 m_CEBufferSize_ = 2;
// First CE: bits 8..23 of ce shifted into the top 16 bits, combined with
// common-weight bytes.
1839 m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
1841 return m_CEBuffer_[0];
1845 * Gets the number of expansion
1846 * @param ce current ce
1847 * @return number of expansion
1849 private int getExpansionCount(int ce)
// nextExpansion() treats a result of 0 as "no stored count": in that case it
// reads the expansion table until it finds a 0 entry.
1855 * Gets the next expansion ce and stuffs the rest of the collation elements
1856 * into the ce buffer
1857 * @param collator current collator
1858 * @param ce current ce
1859 * @return next expansion ce
1861 private int nextExpansion(RuleBasedCollator collator, int ce)
// Copies the expansion for ce from the collator's expansion table into
// m_CEBuffer_ and returns its first element.
1863 // NOTE: we can encounter both continuations and expansions in an
1865 // I have to decide where continuations are going to be dealt with
1866 int offset = getExpansionOffset(collator, ce);
1867 m_CEBufferSize_ = getExpansionCount(ce);
// Element 0 is returned right away, so the read position starts at 1.
1868 m_CEBufferOffset_ = 1;
1869 m_CEBuffer_[0] = collator.m_expansion_[offset];
// Non-zero count: copy exactly that many elements.
1870 if (m_CEBufferSize_ != 0) {
1871 // if there are less than 16 elements in expansion
1872 for (int i = 1; i < m_CEBufferSize_; i ++) {
1873 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
// Zero count: copy until a 0 entry is reached in the table.
1877 // ce are terminated
1878 m_CEBufferSize_ = 1;
1879 while (collator.m_expansion_[offset] != 0) {
1880 m_CEBuffer_[m_CEBufferSize_ ++] =
1881 collator.m_expansion_[++ offset];
1884 // in case of one element expansion, we
1885 // want to immediately return CEpos
// Nothing is left pending, so the buffer is marked empty.
1886 if (m_CEBufferSize_ == 1) {
1887 m_CEBufferSize_ = 0;
1888 m_CEBufferOffset_ = 0;
1890 return m_CEBuffer_[0];
1894 * Gets the next digit ce
1895 * @param collator current collator
1896 * @param ce current collation element
1897 * @param cp current codepoint
1898 * @return next digit ce
1900 private int nextDigit(RuleBasedCollator collator, int ce, int cp)
// With numeric collation enabled, consumes the run of digits starting at cp
// and encodes it as a sequence of CEs in m_CEBuffer_; otherwise returns the
// CE stored in the expansion table for ce.
1902 // We do a check to see if we want to collate digits as numbers;
1903 // if so we generate a custom collation key. Otherwise we pull out
1904 // the value stored in the expansion table.
// Note: the setting is read from the iterator's own m_collator_, not from
// the collator parameter.
1906 if (m_collator_.m_isNumericCollation_){
1908 int trailingZeroIndex = 0;
1909 boolean nonZeroValReached = false;
1911 // I just need a temporary place to store my generated CEs.
1912 // icu4c uses a unsigned byte array, i'll use a stringbuffer here
1913 // to avoid dealing with the sign problems and array allocation
1914 // clear and set initial string buffer length
// Slots 0 and 1 are filled in at the end (header and sign/exponent); slot 2
// is the first value slot.
1915 m_utilStringBuffer_.setLength(3);
1917 // We parse the source string until we hit a char that's NOT a
1919 // Use this u_charDigitValue. This might be slow because we have
1920 // to handle surrogates...
1921 int digVal = UCharacter.digit(cp);
1922 // if we have arrived here, we have already processed possible
1923 // supplementaries that triggered the digit tag -
1924 // all supplementaries are marked in the UCA.
1925 // We pad a zero in front of the first element anyways.
1926 // This takes care of the (probably) most common case where
1927 // people are sorting things followed by a single digit
1930 // Make sure we have enough space.
1931 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
1932 m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
1935 // Skipping over leading zeroes.
1936 if (digVal != 0 || nonZeroValReached) {
1937 if (digVal != 0 && !nonZeroValReached) {
1938 nonZeroValReached = true;
1940 // We parse the digit string into base 100 numbers
1941 // (this fits into a byte).
1942 // We only add to the buffer in twos, thus if we are
1943 // parsing an odd character, that serves as the
1944 // 'tens' digit while the if we are parsing an even
1945 // one, that is the 'ones' digit. We dumped the
1946 // parsed base 100 value (collateVal) into a buffer.
1947 // We multiply each collateVal by 2 (to give us room)
1948 // and add 5 (to avoid overlapping magic CE byte
1949 // values). The last byte we subtract 1 to ensure it is
1950 // less than all the other bytes.
// NOTE(review): the code below adds 6, while the comment above says 5; the
// final "subtract one" step only touches the last value slot.
1951 if (digIndx % 2 != 0) {
1952 collateVal += digVal;
1953 // This removes trailing zeroes.
1954 if (collateVal == 0 && trailingZeroIndex == 0) {
1955 trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
1957 else if (trailingZeroIndex != 0) {
1958 trailingZeroIndex = 0;
1960 m_utilStringBuffer_.setCharAt(
1961 ((digIndx - 1) >>> 1) + 2,
1962 (char)((collateVal << 1) + 6));
1966 // We drop the collation value into the buffer so if
1967 // we need to do a "front patch" we don't have to
1968 // check to see if we're hitting the last element.
1970 collateVal = digVal * 10;
1971 if (collateVal == 0) {
1972 if (trailingZeroIndex != 0) {
1973 trailingZeroIndex = (digIndx >>> 1) + 2;
1976 trailingZeroIndex = 0;
1979 m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
1980 (char)((collateVal << 1) + 6));
1985 // Get next character.
// The position is saved before each read so that a non-digit can be
// un-read by restoring it.
1987 backupInternalState(m_utilSpecialBackUp_);
1988 int char32 = nextChar();
1989 char ch = (char)char32;
1990 if (UTF16.isLeadSurrogate(ch)){
1992 char trail = (char)nextChar();
1993 if (UTF16.isTrailSurrogate(trail)) {
1994 char32 = UCharacterProperty.getRawSupplementary(
2003 digVal = UCharacter.digit(char32);
2005 // Resetting position to point to the next unprocessed
2006 // char. We overshot it when doing our test/set for
2008 updateInternalState(m_utilSpecialBackUp_);
// Only zeroes were seen: store the encoded value of 0 in the first value
// slot.
2017 if (nonZeroValReached == false){
2019 m_utilStringBuffer_.setCharAt(2, (char)6);
// Trailing zero pairs are cut off by ending at trailingZeroIndex.
2022 int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
2023 : (digIndx >>> 1) + 2;
2025 if (digIndx % 2 != 0){
2026 // We missed a value. Since digIndx isn't even, stuck too many
2027 // values into the buffer (this is what we get for padding the
2028 // first byte with a zero). "Front-patch" now by pushing all
2030 // Doing it this way ensures that at least 50% of the time
2031 // (statistically speaking) we'll only be doing a single pass
2032 // and optimizes for strings with single digits. I'm just
2033 // assuming that's the more common case.
2034 for (int i = 2; i < endIndex; i ++){
2035 m_utilStringBuffer_.setCharAt(i,
2036 (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
2038 + (((m_utilStringBuffer_.charAt(i + 1) - 6)
2039 >>> 1) / 10) << 1) + 6));
2044 // Subtract one off of the last byte.
2045 m_utilStringBuffer_.setCharAt(endIndex - 1,
2046 (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
2048 // We want to skip over the first two slots in the buffer.
2049 // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2050 // The second slot is for the sign/exponent byte:
2051 // 0x80 + (decimalPos/2) & 7f.
2052 m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2053 m_utilStringBuffer_.setCharAt(1,
2054 (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2056 // Now transfer the collation key to our collIterate struct.
2057 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
// First CE: slots 0 and 1 form the primary weight, with common secondary and
// tertiary weights.
2058 ce = (((m_utilStringBuffer_.charAt(0) << 8)
2060 | m_utilStringBuffer_.charAt(1))
2061 << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2063 | (RuleBasedCollator.BYTE_COMMON_
2064 << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2065 | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
2066 int i = 2; // Reset the index into the buffer.
2068 m_CEBuffer_[0] = ce;
2069 m_CEBufferSize_ = 1;
2070 m_CEBufferOffset_ = 1;
// Remaining value slots are packed into continuation CEs.
2071 while (i < endIndex)
2073 int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
2075 primWeight |= m_utilStringBuffer_.charAt(i ++);
2077 m_CEBuffer_[m_CEBufferSize_ ++]
2078 = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2079 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2084 // no numeric mode, we'll just switch to whatever we stashed and
2086 // find the offset to expansion table
2087 return collator.m_expansion_[getExpansionOffset(collator, ce)];
2091 * Gets the next implicit ce for codepoints
2092 * @param codepoint current codepoint
2093 * @return implicit ce
2095 private int nextImplicit(int codepoint)
// Generates an implicit weight for the code point and splits it across two
// buffered CEs.
2097 int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
// First CE: the primary-mask portion of the generated value.
2098 m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
// Second CE: the low 16 bits moved into the top half, with 0xC0 in the low
// byte.
// NOTE(review): 0xC0 is presumably the continuation marker -- confirm.
2100 m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
// Element 0 is returned right away, so the read position starts at 1 of 2.
2101 m_CEBufferOffset_ = 1;
2102 m_CEBufferSize_ = 2;
2103 return m_CEBuffer_[0];
2107 * Returns the next ce associated with the following surrogate characters
2108 * @param ch current character
2111 private int nextSurrogate(char ch)
2113 int ch_int = nextChar();
2114 char nextch = (char)ch_int;
2115 if (ch_int != CharacterIterator.DONE &&
2116 UTF16.isTrailSurrogate(nextch)) {
2117 int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
2118 return nextImplicit(codepoint);
2120 if (nextch != CharacterIterator.DONE) {
2121 previousChar(); // reverts back to the original position
2123 return CE_NOT_FOUND_; // treat like unassigned
2127 * Returns the next ce for a hangul character, this is an implicit
2129 * @param collator current collator
2130 * @param ch current character
2133 private int nextHangul(RuleBasedCollator collator, char ch)
// Splits a Hangul syllable into its L, V and (optional) T parts and either
// looks their CEs up directly or pushes the parts into the normalization
// buffer for regular processing.
// L starts out as the syllable's index relative to HANGUL_SBASE_ and is
// reduced by the divisions below.
2135 char L = (char)(ch - HANGUL_SBASE_);
2137 // divide into pieces
2138 // do it in this order since some compilers can do % and / in one
2140 char T = (char)(L % HANGUL_TCOUNT_);
2141 L /= HANGUL_TCOUNT_;
2142 char V = (char)(L % HANGUL_VCOUNT_);
2143 L /= HANGUL_VCOUNT_;
2150 // return the first CE, but first put the rest into the expansion
2152 m_CEBufferSize_ = 0;
2153 if (!m_collator_.m_isJamoSpecial_) { // FAST PATH
2154 m_CEBuffer_[m_CEBufferSize_ ++] =
2155 collator.m_trie_.getLeadValue(L);
2156 m_CEBuffer_[m_CEBufferSize_ ++] =
2157 collator.m_trie_.getLeadValue(V);
// A T equal to HANGUL_TBASE_ means the syllable has no trailing part.
2159 if (T != HANGUL_TBASE_) {
2160 m_CEBuffer_[m_CEBufferSize_ ++] =
2161 collator.m_trie_.getLeadValue(T);
// Element 0 is returned right away, so the read position starts at 1.
2163 m_CEBufferOffset_ = 1;
2164 return m_CEBuffer_[0];
2168 // Since Hanguls pass the FCD check, it is guaranteed that we
2169 // won't be in the normalization buffer if something like this
2171 // Move Jamos into normalization buffer
2172 m_buffer_.append(L);
2173 m_buffer_.append(V);
2174 if (T != HANGUL_TBASE_) {
2175 m_buffer_.append(T);
// Reading restarts at the front of the buffer; the FCD segment is the single
// code unit that ends at the current source index.
2177 m_bufferOffset_ = 0;
2178 m_FCDLimit_ = m_source_.getIndex();
2179 m_FCDStart_ = m_FCDLimit_ - 1;
2180 // Indicate where to continue in main input string after
2181 // exhausting the buffer
2187 * <p>Special CE management. Expansions, contractions etc...</p>
2188 * @param collator can be plain UCA
2189 * @param ce current ce
2190 * @param ch current character
2191 * @return next special ce
2193 private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
// Forward handling of a "special" CE: dispatches on the tag extracted from
// ce. Some cases return a CE directly; others assign a new value to ce.
// NOTE(review): the loop header, the declaration of codepoint, the break
// statements and some case labels are not visible in this listing.
// Take the cached Backup object if one is available; otherwise a new one
// is allocated below.
2196 Backup entrybackup = m_utilSpecialEntryBackUp_;
2197 // this is to handle recursive looping
2198 if (entrybackup != null) {
// The field is cleared while this call is using the object.
2199 m_utilSpecialEntryBackUp_ = null;
2202 entrybackup = new Backup();
// Snapshot of the iterator state at entry.
2204 backupInternalState(entrybackup);
2205 try { // forces it to assign m_utilSpecialEntryBackup_
2207 // This loop will repeat only in the case of contractions,
2209 switch(RuleBasedCollator.getTag(ce)) {
2210 case CE_NOT_FOUND_TAG_:
2211 // impossible case for icu4j
2213 case RuleBasedCollator.CE_SURROGATE_TAG_:
// NOTE(review): the condition guarding this early return (listing line
// 2214) is not visible.
2215 return CE_NOT_FOUND_;
// State is saved, the next code unit is read as the trail, and a
// three-argument nextSurrogate(collator, ce, trail) is called; that is a
// different overload from nextSurrogate(char) used further down.
2217 backupInternalState(m_utilSpecialBackUp_);
2218 char trail = (char)nextChar();
2219 ce = nextSurrogate(collator, ce, trail);
2220 // calculate the supplementary code point value,
2221 // if surrogate was not tailored we go one more round
// Right-hand side of an assignment whose target (listing line 2222) is not
// visible; presumably codepoint - confirm.
2223 UCharacterProperty.getRawSupplementary(ch, trail);
2225 case CE_SPEC_PROC_TAG_:
// The entry snapshot is handed to the prefix handler.
2226 ce = nextSpecialPrefix(collator, ce, entrybackup);
2228 case CE_CONTRACTION_TAG_:
2229 ce = nextContraction(collator, ce);
2231 case CE_LONG_PRIMARY_TAG_:
2232 return nextLongPrimary(ce);
2233 case CE_EXPANSION_TAG_:
2234 return nextExpansion(collator, ce);
// NOTE(review): the case label for this statement (listing line 2235) is
// not visible; presumably the digit tag - confirm.
2236 ce = nextDigit(collator, ce, codepoint);
2238 // various implicits optimization
2239 case CE_CJK_IMPLICIT_TAG_:
2240 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2241 return nextImplicit(codepoint);
2242 case CE_IMPLICIT_TAG_: // everything that is not defined
2243 return nextImplicit(codepoint);
2244 case CE_TRAIL_SURROGATE_TAG_:
2245 return CE_NOT_FOUND_; // DC00-DFFF broken surrogate, treat like unassigned
2246 case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2247 return nextSurrogate(ch);
2248 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2249 return nextHangul(collator, ch);
2250 case CE_CHARSET_TAG_:
2251 // not yet implemented probably after 1.8
2252 return CE_NOT_FOUND_;
2255 // synwee todo, throw exception or something here.
// Tests whether the ce produced by this round is no longer special.
// NOTE(review): what happens in either case (listing lines 2258-2262) is
// not visible here.
2257 if (!RuleBasedCollator.isSpecial(ce)) {
// The Backup object is handed back to the field for reuse by a later call.
2263 m_utilSpecialEntryBackUp_ = entrybackup;
2269 * Special processing is getting a CE that is preceded by a certain prefix.
2270 * Currently this is only needed for optimizing Japanese length and
2271 * iteration marks. When we encounter a special processing tag, we go
2272 * backwards and try to see if we have a match. Contraction tables are used
2273 * - so the whole process is not unlike contraction. prefix data is stored
2274 * backwards in the table.
2275 * @param collator current collator
2276 * @param ce current ce
2277 * @return previous ce
2279 private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
// Backward prefix matching: steps back through the text and compares each
// preceding code unit with entries of the collator's contraction tables,
// starting at the offset encoded in ce. The iterator state saved at entry
// is restored before the method ends (listing line 2359).
// NOTE(review): the loop header, several else branches and break
// statements are not visible in this listing.
2281 backupInternalState(m_utilSpecialBackUp_);
2283 // position ourselves at the beginning of contraction sequence
2284 int offset = getContractionOffset(collator, ce);
// Remembered so that the CE at the start of this table can be read when
// the preceding character is not found in it (listing line 2349).
2285 int entryoffset = offset;
// At the backwards start position: take the CE stored at the table start.
2286 if (isBackwardsStart()) {
2287 ce = collator.m_contractionCE_[offset];
2290 char prevch = (char)previousChar();
2291 while (prevch > collator.m_contractionIndex_[offset]) {
2292 // since contraction codepoints are ordered, we skip all that
// NOTE(review): the loop body (listing lines 2293-2294) is not visible;
// presumably it advances offset - confirm.
// Exact match in the table: take the CE stored at the same offset.
2296 if (prevch == collator.m_contractionIndex_[offset]) {
2297 ce = collator.m_contractionCE_[offset];
2300 // if there is a completely ignorable code point in the middle
2301 // of a prefix, we need to act as if it's not there assumption:
2302 // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
2304 // lone surrogates cannot be set to zero as it would break
// Look up prevch in the trie; a result of 0 is tested below.
2306 int isZeroCE = collator.m_trie_.getLeadValue(prevch);
2307 // it's easy for BMP code points
2308 if (isZeroCE == 0) {
2311 else if (UTF16.isTrailSurrogate(prevch)
2312 || UTF16.isLeadSurrogate(prevch)) {
2313 // for supplementary code points, we have to check the next one
2314 // situations where we are going to ignore
2315 // 1. beginning of the string: schar is a lone surrogate
2316 // 2. schar is a lone surrogate
2317 // 3. schar is a trail surrogate in a valid surrogate
2318 // sequence that is explicitly set to zero.
2319 if (!isBackwardsStart()) {
// Step back one more unit and check whether it is a lead surrogate.
2320 char lead = (char)previousChar();
2321 if (UTF16.isLeadSurrogate(lead)) {
2322 isZeroCE = collator.m_trie_.getLeadValue(lead);
// The trail value is read only when the lead's trie value carries the
// surrogate tag.
2323 if (RuleBasedCollator.getTag(isZeroCE)
2324 == RuleBasedCollator.CE_SURROGATE_TAG_) {
2325 int finalCE = collator.m_trie_.getTrailValue(
2329 // this is a real, assigned completely
2330 // ignorable code point
2336 nextChar(); // revert to original offset
2337 // lone surrogate, completely ignorable
2340 nextChar(); // revert to original offset
2343 // lone surrogate at the beginning, completely ignorable
2348 // char was not in the table. prefix not found
2349 ce = collator.m_contractionCE_[entryoffset];
2352 if (!isSpecialPrefixTag(ce)) {
2353 // char was in the contraction table, and the corresponding ce
2354 // is not a prefix ce. We found the prefix, break out of loop,
2355 // this ce will end up being returned.
// Restore the position and state captured at entry.
2359 updateInternalState(m_utilSpecialBackUp_);
2364 * Retrieves the previous contraction ce. To ensure that the backwards and
2365 * forwards iteration matches, we take the current region of most possible
2366 * match and pass it through the forward iteration. This will ensure that
2367 * the obstinate problem of overlapping contractions will not occur.
2368 * @param collator current collator
2369 * @param ce current ce
2370 * @param ch current character
2371 * @return previous contraction ce
2373 private int previousContraction(RuleBasedCollator collator, int ce, char ch)
// Backward contraction handling: collects a run of preceding characters
// into m_utilStringBuffer_, iterates over that string FORWARD with a helper
// CollationElementIterator, copies the resulting CEs into m_CEBuffer_ and
// returns the last of them.
// In the visible lines the incoming ce is not read before it is overwritten
// at listing line 2414.
2375 m_utilStringBuffer_.setLength(0);
2376 // since we might encounter normalized characters (from the thai
2377 // processing) we can't use peekCharacter() here.
2378 char prevch = (char)previousChar();
2379 boolean atStart = false;
2380 // TODO: address the comment above - maybe now we *can* use peekCharacter
2381 //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
// While ch is "unsafe" according to the collator, prepend it and keep
// walking backwards.
// NOTE(review): the statement at listing line 2384 and the body of the
// isBackwardsStart() branch are not visible here.
2382 while (collator.isUnsafe(ch)) {
2383 m_utilStringBuffer_.insert(0, ch);
2385 if (isBackwardsStart()) {
2389 prevch = (char)previousChar();
2392 // undo the previousChar() if we didn't reach the beginning
2395 // adds the initial base character to the string
2396 m_utilStringBuffer_.insert(0, ch);
2398 // a new collation element iterator is used to simplify things, since
2399 // using the current collation element iterator will mean that the
2400 // forward and backwards iteration will share and change the same
2401 // buffers. it is going to be painful.
// The collator's decomposition mode is changed for the duration of the
// helper iteration and restored at listing line 2439.
// NOTE(review): this mutates the collator object passed in; any exit from
// this method between here and the restore would leave it in
// NO_DECOMPOSITION - confirm the hidden catch bodies.
2402 int originaldecomp = collator.getDecomposition();
2403 // for faster access, since string would have been normalized above
2404 collator.setDecomposition(Collator.NO_DECOMPOSITION);
// The helper iterator is created on first use and re-targeted afterwards.
2405 if (m_utilColEIter_ == null) {
2406 m_utilColEIter_ = new CollationElementIterator(
2407 m_utilStringBuffer_.toString(),
2411 m_utilColEIter_.m_collator_ = collator;
2412 m_utilColEIter_.setText(m_utilStringBuffer_.toString());
// Drain the helper iterator into m_CEBuffer_ until it reports NULLORDER.
2414 ce = m_utilColEIter_.next();
2415 m_CEBufferSize_ = 0;
2416 while (ce != NULLORDER) {
2417 if (m_CEBufferSize_ == m_CEBuffer_.length) {
2419 // increasing cebuffer size
// Grow by 50 slots, keeping the existing contents.
2420 int tempbuffer[] = new int[m_CEBuffer_.length + 50];
2421 System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
2422 m_CEBuffer_.length);
2423 m_CEBuffer_ = tempbuffer;
2425 catch( MissingResourceException e)
// NOTE(review): a catch-all around an array copy that reports through
// printStackTrace(); the surrounding lines of both catch bodies are not
// visible in this listing.
2429 catch (Exception e) {
2431 e.printStackTrace();
2436 m_CEBuffer_[m_CEBufferSize_ ++] = ce;
2437 ce = m_utilColEIter_.next();
2439 collator.setDecomposition(originaldecomp);
// Backward iteration: position on, and return, the last buffered CE.
2440 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2441 return m_CEBuffer_[m_CEBufferOffset_];
2445 * Returns the previous long primary ces
2446 * @param ce long primary ce
2447 * @return previous long primary ces
2449 private int previousLongPrimary(int ce)
2451 m_CEBufferSize_ = 0;
2452 m_CEBuffer_[m_CEBufferSize_ ++] =
2453 ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
2454 m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
2455 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2456 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2457 return m_CEBuffer_[m_CEBufferOffset_];
2461 * Returns the previous expansion ces
2462 * @param collator current collator
2463 * @param ce current ce
2464 * @return previous expansion ce
2466 private int previousExpansion(RuleBasedCollator collator, int ce)
// Copies the expansion CEs referenced by ce from collator.m_expansion_ into
// m_CEBuffer_ and returns the last one, leaving the buffer offset on it.
2468 // find the offset to expansion table
2469 int offset = getExpansionOffset(collator, ce);
// A non-zero count taken from ce gives the number of CEs directly.
2470 m_CEBufferSize_ = getExpansionCount(ce);
2471 if (m_CEBufferSize_ != 0) {
2472 // less than 16 elements in expansion
2473 for (int i = 0; i < m_CEBufferSize_; i ++) {
2474 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
// Count of zero: the sequence is instead terminated by a 0 entry.
// NOTE(review): the statement that advances m_CEBufferSize_ inside this
// loop (listing line 2483) is not visible here.
2479 // null terminated ces
2480 while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
2481 m_CEBuffer_[m_CEBufferSize_] =
2482 collator.m_expansion_[offset + m_CEBufferSize_];
// Backward iteration: position on, and return, the last buffered CE.
2486 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2487 return m_CEBuffer_[m_CEBufferOffset_];
2491 * Getting the digit collation elements
2493 * @param ce current collation element
2494 * @param ch current code point
2495 * @return digit collation element
2497 private int previousDigit(RuleBasedCollator collator, int ce, char ch)
// Backward digit handling. With numeric collation enabled on m_collator_,
// the run of digits ending at ch is read backwards, packed two digits per
// char into m_utilStringBuffer_, and then converted into CEs in
// m_CEBuffer_. Otherwise the first entry at ce's expansion offset is
// returned.
// NOTE(review): the declarations of collateVal, char32 and digIndx, the
// loop headers and several else/break lines are not visible in this
// listing.
2499 // We do a check to see if we want to collate digits as numbers; if so we generate
2500 // a custom collation key. Otherwise we pull out the value stored in the expansion table.
2501 if (m_collator_.m_isNumericCollation_){
2502 int leadingZeroIndex = 0;
2504 boolean nonZeroValReached = false;
// Slots 0 and 1 are written as header bytes further down; packed digit
// pairs start at slot 2.
2506 // clear and set initial string buffer length
2507 m_utilStringBuffer_.setLength(3);
2509 // We parse the source string until we hit a char that's NOT a digit
2510 // Use this u_charDigitValue. This might be slow because we have to
2511 // handle surrogates...
// If ch is a trail surrogate and there is text before it, step back to
// look for its lead and combine both into a supplementary code point.
2513 if (UTF16.isTrailSurrogate(ch)) {
2514 if (!isBackwardsStart()){
2515 char lead = (char)previousChar();
2516 if (UTF16.isLeadSurrogate(lead)) {
2517 char32 = UCharacterProperty.getRawSupplementary(lead,
2525 int digVal = UCharacter.digit(char32);
2528 // Make sure we have enough space.
// Each buffer slot from index 2 onwards holds two digits.
2529 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
2530 m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
2533 // Skipping over "trailing" zeroes but we still add to digIndx.
2534 if (digVal != 0 || nonZeroValReached) {
2535 if (digVal != 0 && !nonZeroValReached) {
2536 nonZeroValReached = true;
2539 // We parse the digit string into base 100 numbers (this
2540 // fits into a byte).
2541 // We only add to the buffer in twos, thus if we are
2542 // parsing an odd character, that serves as the 'tens'
2543 // digit while the if we are parsing an even one, that is
2544 // the 'ones' digit. We dumped the parsed base 100 value
2545 // (collateVal) into a buffer. We multiply each collateVal
2546 // by 2 (to give us room) and add 5 (to avoid overlapping
2547 // magic CE byte values). The last byte we subtract 1 to
2548 // ensure it is less than all the other bytes.
2549 // Since we're doing in this reverse we want to put the
2550 // first digit encountered into the ones place and the
2551 // second digit encountered into the tens place.
// NOTE(review): the comment above says "add 5", but the code below stores
// (collateVal << 1) + 6; slot 2 is decremented by one afterwards (listing
// line 2627).
// Odd digIndx: this digit is the tens digit of the current pair.
2553 if (digIndx % 2 != 0){
2554 collateVal += digVal * 10;
2556 // This removes leading zeroes.
// Remember the slot of the first all-zero pair; the marker is cleared as
// soon as a later pair is processed while it is set.
2557 if (collateVal == 0 && leadingZeroIndex == 0) {
2558 leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2560 else if (leadingZeroIndex != 0) {
2561 leadingZeroIndex = 0;
2564 m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
2565 (char)((collateVal << 1) + 6));
// Even digIndx: this digit starts a new pair as its ones digit.
2569 collateVal = digVal;
// Step back to the preceding unit, saving the state first so that the
// step can be undone.
2574 if (!isBackwardsStart()){
2575 backupInternalState(m_utilSpecialBackUp_);
2576 char32 = previousChar();
// NOTE(review): this tests the method parameter ch, not the unit that was
// just read into char32 - confirm that this is intended.
2577 if (UTF16.isTrailSurrogate(ch)){
2578 if (!isBackwardsStart()) {
2579 char lead = (char)previousChar();
2580 if (UTF16.isLeadSurrogate(lead)) {
2582 = UCharacterProperty.getRawSupplementary(
2586 updateInternalState(m_utilSpecialBackUp_);
2591 digVal = UCharacter.digit(char32);
// Restores the state saved before the step back.
// NOTE(review): the condition guarding this restore (listing line 2592) is
// not visible; presumably "not a digit" - confirm.
2593 updateInternalState(m_utilSpecialBackUp_);
// Only zeroes were seen: slot 2 gets the encoding of the value 0.
2602 if (nonZeroValReached == false) {
2604 m_utilStringBuffer_.setCharAt(2, (char)6);
// An odd digit count leaves one unpaired digit in collateVal.
2607 if (digIndx % 2 != 0) {
2608 if (collateVal == 0 && leadingZeroIndex == 0) {
2609 // This removes the leading 0 in a odd number sequence of
2610 // numbers e.g. avery001
2611 leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2614 // this is not a leading 0, we add it in
2615 m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
2616 (char)((collateVal << 1) + 6));
// endIndex is used below as the exclusive upper bound of the used slots;
// it stops at the leading-zero marker when one is set.
2621 int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
2622 : ((digIndx >>> 1) + 2) ;
2623 digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
2624 // Subtract one off of the last byte.
2625 // Really the first byte here, but it's reversed...
2626 m_utilStringBuffer_.setCharAt(2,
2627 (char)(m_utilStringBuffer_.charAt(2) - 1));
2628 // We want to skip over the first two slots in the buffer.
2629 // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2630 // The second slot is for the sign/exponent byte:
2631 // 0x80 + (decimalPos/2) & 7f.
2632 m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2633 m_utilStringBuffer_.setCharAt(1,
2634 (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2636 // Now transfer the collation key to our collIterate struct.
2637 // The total size for our collation key is endIndx bumped up to the
2638 // next largest even value divided by two.
// First CE: slots 0 and 1 packed together and shifted by
// CE_PRIMARY_SHIFT_, with BYTE_COMMON_ shifted by CE_SECONDARY_SHIFT_ and
// BYTE_COMMON_ again in the lowest position.
2639 m_CEBufferSize_ = 0;
2640 m_CEBuffer_[m_CEBufferSize_ ++]
2641 = (((m_utilStringBuffer_.charAt(0) << 8)
2643 | m_utilStringBuffer_.charAt(1))
2644 << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2646 | (RuleBasedCollator.BYTE_COMMON_
2647 << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2649 | RuleBasedCollator.BYTE_COMMON_;
// Remaining slots are read from endIndex - 1 downwards, two per
// continuation CE.
// NOTE(review): the loop header and the guard around the second charAt are
// not visible in this listing.
2650 int i = endIndex - 1; // Reset the index into the buffer.
2652 int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
2654 primWeight |= m_utilStringBuffer_.charAt(i --);
2656 m_CEBuffer_[m_CEBufferSize_ ++]
2657 = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2658 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
// Backward iteration: position on, and return, the last buffered CE.
2660 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2661 return m_CEBuffer_[m_CEBufferOffset_];
// Numeric collation off: return the first entry at ce's expansion offset.
2664 return collator.m_expansion_[getExpansionOffset(collator, ce)];
2669 * Returns previous hangul ces
2670 * @param collator current collator
2671 * @param ch current character
2672 * @return previous hangul ce
2674 private int previousHangul(RuleBasedCollator collator, char ch)
// Backward counterpart of nextHangul: splits the syllable ch into three
// pieces (L, V, T). Fast path: one CE per piece is looked up in collator's
// trie and the LAST buffered CE is returned. Otherwise the pieces are
// appended to m_buffer_ and reading is positioned at its end.
// Index of the syllable relative to HANGUL_SBASE_.
2676 char L = (char)(ch - HANGUL_SBASE_);
2677 // we do it in this order since some compilers can do % and / in one
// T = index % TCOUNT, V = (index / TCOUNT) % VCOUNT, L = what remains
// after both divisions.
2679 char T = (char)(L % HANGUL_TCOUNT_);
2680 L /= HANGUL_TCOUNT_;
2681 char V = (char)(L % HANGUL_VCOUNT_);
2682 L /= HANGUL_VCOUNT_;
// NOTE(review): listing lines 2683-2688 are not visible here. T is compared
// with HANGUL_TBASE_ below, so the indices are presumably rebased onto
// their Jamo ranges in that gap - confirm.
2689 m_CEBufferSize_ = 0;
// The flag is read from the m_collator_ field, while the trie lookups use
// the collator parameter.
2690 if (!m_collator_.m_isJamoSpecial_) {
2691 m_CEBuffer_[m_CEBufferSize_ ++] =
2692 collator.m_trie_.getLeadValue(L);
2693 m_CEBuffer_[m_CEBufferSize_ ++] =
2694 collator.m_trie_.getLeadValue(V);
// A third CE is buffered only when T differs from HANGUL_TBASE_.
2695 if (T != HANGUL_TBASE_) {
2696 m_CEBuffer_[m_CEBufferSize_ ++] =
2697 collator.m_trie_.getLeadValue(T);
2699 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2700 return m_CEBuffer_[m_CEBufferOffset_];
2703 // Since Hanguls pass the FCD check, it is guaranteed that we won't
2704 // be in the normalization buffer if something like this happens
2705 // Move Jamos into normalization buffer
2706 m_buffer_.append(L);
2707 m_buffer_.append(V);
2708 if (T != HANGUL_TBASE_) {
2709 m_buffer_.append(T);
// Reading continues from the END of m_buffer_; the FCD window is set to the
// single code unit that starts at the current source index.
2711 m_bufferOffset_ = m_buffer_.length();
2712 m_FCDStart_ = m_source_.getIndex();
2713 m_FCDLimit_ = m_FCDStart_ + 1;
// NOTE(review): the value returned on this path is not visible in this
// listing.
2719 * Gets implicit codepoint ces
2720 * @param codepoint current codepoint
2721 * @return implicit codepoint ces
2723 private int previousImplicit(int codepoint)
// Backward counterpart of nextImplicit: stores the same two-element CE
// sequence for codepoint in m_CEBuffer_, but returns the SECOND element.
2725 int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
// Two CEs are buffered; the offset is on element 1, the one returned below.
2726 m_CEBufferSize_ = 2;
2727 m_CEBufferOffset_ = 1;
// First CE: starts from the bits of result selected by CE_PRIMARY_MASK_.
// NOTE(review): the continuation of this expression (listing line 2729) is
// not visible here - confirm how the first CE is completed.
2728 m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
// Second CE: the low 16 bits of result moved into the upper half of the
// int, with 0xC0 in the lowest byte.
2730 m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2731 return m_CEBuffer_[1];
2735 * Gets the previous surrogate ce
2736 * @param ch current character
2737 * @return previous surrogate ce
2739 private int previousSurrogate(char ch)
// Backward handling of a trail surrogate ch: if the unit before it is a
// lead surrogate, the pair is combined and passed to previousImplicit;
// otherwise CE_NOT_FOUND_ is returned.
2741 if (isBackwardsStart()) {
2742 // we are at the start of the string, wrong place to be at
2743 return CE_NOT_FOUND_;
2745 char prevch = (char)previousChar();
2746 // Handles Han and Supplementary characters here.
2747 if (UTF16.isLeadSurrogate(prevch)) {
2748 return previousImplicit(
2749 UCharacterProperty.getRawSupplementary(prevch, ch));
// prevch is not a lead surrogate.
// NOTE(review): the statement inside this if (listing line 2752) is not
// visible; presumably it undoes the previousChar() above, mirroring
// nextSurrogate(char) - confirm.
2751 if (prevch != CharacterIterator.DONE) {
2754 return CE_NOT_FOUND_; // treat like unassigned
2758 * <p>Special CE management. Expansions, contractions etc...</p>
2759 * @param collator can be plain UCA
2760 * @param ce current ce
2761 * @param ch current character
2762 * @return previous special ce
2764 private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
// Backward handling of a "special" CE: dispatches on the tag extracted from
// ce. Most cases return directly; the prefix, contraction-at-start and
// digit cases assign a new value to ce instead.
// NOTE(review): the loop header, the break statements and one case label
// are not visible in this listing.
2767 // the only ces that loops are thai, special prefix and
2769 switch (RuleBasedCollator.getTag(ce)) {
2770 case CE_NOT_FOUND_TAG_: // this tag always returns
2772 case RuleBasedCollator.CE_SURROGATE_TAG_: // unpaired lead surrogate
2773 return CE_NOT_FOUND_;
2774 case CE_SPEC_PROC_TAG_:
2775 ce = previousSpecialPrefix(collator, ce);
2777 case CE_CONTRACTION_TAG_:
2778 // may loop for first character e.g. "0x0f71" for english
2779 if (isBackwardsStart()) {
2780 // start of string or this is not the end of any contraction
// Take the CE stored at the start of this contraction table.
2781 ce = collator.m_contractionCE_[
2782 getContractionOffset(collator, ce)];
2785 return previousContraction(collator, ce, ch); // else
2786 case CE_LONG_PRIMARY_TAG_:
2787 return previousLongPrimary(ce);
2788 case CE_EXPANSION_TAG_: // always returns
2789 return previousExpansion(collator, ce);
// NOTE(review): the case label for this statement (listing line 2790) is
// not visible; presumably the digit tag - confirm.
2791 ce = previousDigit(collator, ce, ch);
2793 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2794 return previousHangul(collator, ch);
2795 case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2796 return CE_NOT_FOUND_; // broken surrogate sequence, treat like unassigned
2797 case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
2798 return previousSurrogate(ch);
// Both implicit cases pass the single code unit ch (widened to int) as the
// code point.
2799 case CE_CJK_IMPLICIT_TAG_:
2800 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2801 return previousImplicit(ch);
2802 case CE_IMPLICIT_TAG_: // everything that is not defined
2803 // UCA is filled with these. Tailorings are NOT_FOUND
2804 return previousImplicit(ch);
2805 case CE_CHARSET_TAG_: // this tag always returns
2806 return CE_NOT_FOUND_;
2807 default: // this tag always returns
// Tests whether the ce produced by this round is no longer special.
// NOTE(review): what happens in either case (listing lines 2811 onwards) is
// not visible here.
2810 if (!RuleBasedCollator.isSpecial(ce)) {
2818 // * Gets a character from the source string at a given offset.
2819 // * Handles both normal and iterative cases.
2820 // * No error checking and does not access the normalization buffer
2821 // * - caller beware!
2822 // * @param offset offset from current position which character is to be
2824 // * @return character at current position + offset
2826 // private char peekCharacter(int offset)
2828 // if (offset != 0) {
2829 // int currentoffset = m_source_.getIndex();
2830 // m_source_.setIndex(currentoffset + offset);
2831 // char result = (char)m_source_.current();
2832 // m_source_.setIndex(currentoffset);
2836 // return (char)m_source_.current();
2841 * Moves back 1 position in the source string. This is slightly less
2842 * complicated than previousChar in that it doesn't normalize while
2843 * moving back. Boundary checks are not performed.
2844 * This method is to be used with caution, with the assumption that
2845 * moving back one position will not exceed the source limits.
2846 * Use only with nextChar() and never call this API twice in a row without
2847 * nextChar() in the middle.
2849 private void goBackOne()
// Steps back one position without normalizing. Which position is moved
// depends on m_bufferOffset_.
// NOTE(review): the statement for the m_bufferOffset_ >= 0 case (listing
// line 2852) and the else line are not visible here. The setIndex call
// below presumably belongs to the else branch, mirroring goForwardOne,
// which steps the source index when m_bufferOffset_ < 0 - confirm.
2851 if (m_bufferOffset_ >= 0) {
// Moves the source index back by one; no bounds check is made here.
2855 m_source_.setIndex(m_source_.getIndex() - 1);
2860 * Moves forward 1 position in the source string. This is slightly less
2861 * complicated than nextChar in that it doesn't normalize while
2862 * moving forward. Boundary checks are not performed.
2863 * This method is to be used with caution, with the assumption that
2864 * moving forward one position will not exceed the source limits.
2865 * Use only with previousChar() and never call this API twice in a row
2866 * without previousChar() in the middle.
2868 private void goForwardOne()
2870 if (m_bufferOffset_ < 0) {
2871 // we're working on the source and not normalizing. fast path.
2872 // note Thai pre-vowel reordering uses buffer too
2873 m_source_.setIndex(m_source_.getIndex() + 1);
2876 // we are in the buffer, buffer offset will never be 0 here