2 *******************************************************************************
3 * Copyright (C) 1996-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 *******************************************************************************
10 package com.ibm.icu.text;
13 * import java.text.StringCharacterIterator;
14 * import java.text.CharacterIterator;
16 import java.text.CharacterIterator;
17 import java.util.MissingResourceException;
19 import com.ibm.icu.impl.CharacterIteratorWrapper;
20 import com.ibm.icu.impl.ICUDebug;
21 import com.ibm.icu.impl.Norm2AllModes;
22 import com.ibm.icu.impl.Normalizer2Impl;
23 import com.ibm.icu.impl.StringUCharacterIterator;
24 import com.ibm.icu.impl.UCharacterProperty;
25 import com.ibm.icu.lang.UCharacter;
28 * <p><code>CollationElementIterator</code> is an iterator created by
29 * a RuleBasedCollator to walk through a string. The return result of
30 * each iteration is a 32-bit collation element that defines the
31 * ordering priority of the next character or sequence of characters
32 * in the source string.</p>
34 * <p>For illustration, consider the following in Spanish:
37 * "ca" -> the first collation element is collation_element('c') and second
38 * collation element is collation_element('a').
40 * Since "ch" in Spanish sorts as one entity, the below example returns one
41 * collation element for the two characters 'c' and 'h'
43 * "cha" -> the first collation element is collation_element('ch') and second
44 * collation element is collation_element('a').
50 * Since the character 'æ' is a composed character of 'a' and 'e', the
51 * iterator returns two collation elements for the single character 'æ'
53 * "æb" -> the first collation element is collation_element('a'), the
54 * second collation element is collation_element('e'), and the
55 * third collation element is collation_element('b').
60 * <p>For collation ordering comparison, the collation element results
61 * can not be compared simply by using basic arithmetric operators,
62 * e.g. <, == or >, further processing has to be done. Details
63 * can be found in the ICU
64 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
65 * user guide</a>. An example of using the CollationElementIterator
66 * for collation ordering comparison is the class
67 * <a href=StringSearch.html> com.ibm.icu.text.StringSearch</a>.</p>
69 * <p>To construct a CollationElementIterator object, users
70 * call the method getCollationElementIterator() on a
71 * RuleBasedCollator that defines the desired sorting order.</p>
76 * String testString = "This is a test";
77 * RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
78 * CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
79 * int primaryOrder = iterator.IGNORABLE;
80 * while (primaryOrder != iterator.NULLORDER) {
81 * int order = iterator.next();
82 * if (order != iterator.IGNORABLE &&
83 * order != iterator.NULLORDER) {
84 * // order is valid, not ignorable and we have not passed the end
85 * // of the iteration, we do something
86 * primaryOrder = CollationElementIterator.primaryOrder(order);
87 * System.out.println("Next primary order 0x" +
88 * Integer.toHexString(primaryOrder));
95 * This class is not subclassable
98 * @see RuleBasedCollator
100 * @author Syn Wee Quek
103 public final class CollationElementIterator
107 // public data members --------------------------------------------------
110 * <p>This constant is returned by the iterator in the methods
111 * next() and previous() when the end or the beginning of the
112 * source string has been reached, and there are no more valid
113 * collation elements to return.</p>
115 * <p>See class documentation for an example of use.</p>
119 public final static int NULLORDER = 0xffffffff;
122 * <p>This constant is returned by the iterator in the methods
123 * next() and previous() when a collation element result is to be
126 * <p>See class documentation for an example of use.</p>
130 public static final int IGNORABLE = 0;
132 // public methods -------------------------------------------------------
134 // public getters -------------------------------------------------------
137 * <p>Returns the character offset in the source string
138 * corresponding to the next collation element. I.e., getOffset()
139 * returns the position in the source string corresponding to the
140 * collation element that will be returned by the next call to
141 * next(). This value could be any of:
143 * <li> The index of the <b>first</b> character corresponding to
144 * the next collation element. (This means that if
145 * <code>setOffset(offset)</code> sets the index in the middle of
146 * a contraction, <code>getOffset()</code> returns the index of
147 * the first character in the contraction, which may not be equal
148 * to the original offset that was set. Hence calling getOffset()
149 * immediately after setOffset(offset) does not guarantee that the
150 * original offset set will be returned.)
151 * <li> If normalization is on, the index of the <b>immediate</b>
152 * subsequent character, or composite character with the first
153 * character, having a combining class of 0.
154 * <li> The length of the source string, if iteration has reached
158 * @return The character offset in the source string corresponding to the
159 * collation element that will be returned by the next call to
163 public int getOffset()
165 if (m_bufferOffset_ != -1) {
171 return m_source_.getIndex();
176 * <p> Returns the maximum length of any expansion sequence that ends with
177 * the specified collation element. If there is no expansion with this
178 * collation element as the last element, returns 1.
180 * @param ce a collation element returned by previous() or next().
181 * @return the maximum length of any expansion sequence ending
182 * with the specified collation element.
185 public int getMaxExpansion(int ce)
188 int limit = m_collator_.m_expansionEndCE_.length;
189 long unsignedce = ce & 0xFFFFFFFFl;
190 while (start < limit - 1) {
191 int mid = start + ((limit - start) >> 1);
192 long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
193 if (unsignedce <= midce) {
201 if (m_collator_.m_expansionEndCE_[start] == ce) {
202 result = m_collator_.m_expansionEndCEMaxSize_[start];
204 else if (limit < m_collator_.m_expansionEndCE_.length &&
205 m_collator_.m_expansionEndCE_[limit] == ce) {
206 result = m_collator_.m_expansionEndCEMaxSize_[limit];
208 else if ((ce & 0xFFFF) == 0x00C0) {
214 // public other methods -------------------------------------------------
217 * <p> Resets the cursor to the beginning of the string. The next
218 * call to next() or previous() will return the first and last
219 * collation element in the string, respectively.</p>
221 * <p>If the RuleBasedCollator used by this iterator has had its
222 * attributes changed, calling reset() will reinitialize the
223 * iterator to use the new attributes.</p>
229 m_source_.setToStart();
230 updateInternalState();
234 * <p>Get the next collation element in the source string.</p>
236 * <p>This iterator iterates over a sequence of collation elements
237 * that were built from the string. Because there isn't
238 * necessarily a one-to-one mapping from characters to collation
239 * elements, this doesn't mean the same thing as "return the
240 * collation element [or ordering priority] of the next character
241 * in the string".</p>
243 * <p>This function returns the collation element that the
244 * iterator is currently pointing to, and then updates the
245 * internal pointer to point to the next element. Previous()
246 * updates the pointer first, and then returns the element. This
247 * means that when you change direction while iterating (i.e.,
248 * call next() and then call previous(), or call previous() and
249 * then call next()), you'll get back the same element twice.</p>
251 * @return the next collation element or NULLORDER if the end of the
252 * iteration has been reached.
257 m_isForwards_ = true;
258 if (m_CEBufferSize_ > 0) {
259 if (m_CEBufferOffset_ < m_CEBufferSize_) {
260 // if there are expansions left in the buffer, we return it
261 return m_CEBuffer_[m_CEBufferOffset_ ++];
264 m_CEBufferOffset_ = 0;
267 int result = NULLORDER;
270 int ch_int = nextChar();
271 if (ch_int == UCharacterIterator.DONE) {
275 if (m_collator_.m_isHiragana4_) {
276 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
277 * based on whether the previous codepoint was Hiragana or Katakana.
279 m_isCodePointHiragana_ = (m_isCodePointHiragana_ && (ch >= 0x3099 && ch <= 0x309C)) ||
280 ((ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d));
284 // For latin-1 characters we never need to fall back to the UCA
285 // table because all of the UCA data is replicated in the
286 // latinOneMapping array.
287 // Except: Special CEs can result in CE_NOT_FOUND_,
288 // for example if the default entry for a prefix-special is "not found",
289 // and we do need to fall back to the UCA in such a case.
290 // TODO: It would be better if tailoring specials never resulted in "not found"
291 // unless the corresponding UCA result is also "not found".
292 // That would require a change in the ICU4J collator-from-rule builder.
293 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
295 result = m_collator_.m_trie_.getLeadValue(ch);
297 if (!RuleBasedCollator.isSpecial(result)) {
300 if (result != CE_NOT_FOUND_) {
301 result = nextSpecial(m_collator_, result, ch);
303 if (result == CE_NOT_FOUND_) {
304 // couldn't find a good CE in the tailoring
305 if (RuleBasedCollator.UCA_ != null) {
306 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
307 if (RuleBasedCollator.isSpecial(result)) {
308 // UCA also gives us a special CE
309 result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
312 if(result == CE_NOT_FOUND_) {
313 // maybe there is no UCA, unlikely in Java, but ported for consistency
314 result = nextImplicit(ch);
317 } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
323 * <p>Get the previous collation element in the source string.</p>
325 * <p>This iterator iterates over a sequence of collation elements
326 * that were built from the string. Because there isn't
327 * necessarily a one-to-one mapping from characters to collation
328 * elements, this doesn't mean the same thing as "return the
329 * collation element [or ordering priority] of the previous
330 * character in the string".</p>
332 * <p>This function updates the iterator's internal pointer to
333 * point to the collation element preceding the one it's currently
334 * pointing to and then returns that element, while next() returns
335 * the current element and then updates the pointer. This means
336 * that when you change direction while iterating (i.e., call
337 * next() and then call previous(), or call previous() and then
338 * call next()), you'll get back the same element twice.</p>
340 * @return the previous collation element, or NULLORDER when the start of
341 * the iteration has been reached.
344 public int previous()
346 if (m_source_.getIndex() <= 0 && m_isForwards_) {
347 // if iterator is new or reset, we can immediate perform backwards
348 // iteration even when the offset is not right.
349 m_source_.setToLimit();
350 updateInternalState();
352 m_isForwards_ = false;
353 if (m_CEBufferSize_ > 0) {
354 if (m_CEBufferOffset_ > 0) {
355 return m_CEBuffer_[-- m_CEBufferOffset_];
358 m_CEBufferOffset_ = 0;
361 int result = NULLORDER;
364 int ch_int = previousChar();
365 if (ch_int == UCharacterIterator.DONE) {
369 if (m_collator_.m_isHiragana4_) {
370 m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
372 if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
373 result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
377 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
380 result = m_collator_.m_trie_.getLeadValue(ch);
382 if (RuleBasedCollator.isSpecial(result)) {
383 result = previousSpecial(m_collator_, result, ch);
385 if (result == CE_NOT_FOUND_) {
386 if (!isBackwardsStart()
387 && m_collator_.isContractionEnd(ch)) {
388 result = CE_CONTRACTION_;
391 if(RuleBasedCollator.UCA_ != null) {
392 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
396 if (RuleBasedCollator.isSpecial(result)) {
397 if(RuleBasedCollator.UCA_ != null) {
398 result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
403 } while (result == IGNORABLE && ch >= 0xAC00 && ch <= 0xD7AF);
404 if(result == CE_NOT_FOUND_) {
405 result = previousImplicit(ch);
411 * Return the primary order of the specified collation element,
412 * i.e. the first 16 bits. This value is unsigned.
413 * @param ce the collation element
414 * @return the element's 16 bits primary order.
417 public final static int primaryOrder(int ce)
419 return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
420 >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
423 * Return the secondary order of the specified collation element,
424 * i.e. the 16th to 23th bits, inclusive. This value is unsigned.
425 * @param ce the collation element
426 * @return the element's 8 bits secondary order
429 public final static int secondaryOrder(int ce)
431 return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
432 >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
436 * Return the tertiary order of the specified collation element, i.e. the last
437 * 8 bits. This value is unsigned.
438 * @param ce the collation element
439 * @return the element's 8 bits tertiary order
442 public final static int tertiaryOrder(int ce)
444 return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
448 * <p> Sets the iterator to point to the collation element
449 * corresponding to the character at the specified offset. The
450 * value returned by the next call to next() will be the collation
451 * element corresponding to the characters at offset.</p>
453 * <p>If offset is in the middle of a contracting character
454 * sequence, the iterator is adjusted to the start of the
455 * contracting sequence. This means that getOffset() is not
456 * guaranteed to return the same value set by this method.</p>
458 * <p>If the decomposition mode is on, and offset is in the middle
459 * of a decomposible range of source text, the iterator may not
460 * return a correct result for the next forwards or backwards
461 * iteration. The user must ensure that the offset is not in the
462 * middle of a decomposible range.</p>
464 * @param offset the character offset into the original source string to
465 * set. Note that this is not an offset into the corresponding
466 * sequence of collation elements.
469 public void setOffset(int offset)
471 m_source_.setIndex(offset);
472 int ch_int = m_source_.current();
473 char ch = (char)ch_int;
474 if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
475 // if it is unsafe we need to check if it is part of a contraction
476 // or a surrogate character
477 if (UTF16.isTrailSurrogate(ch)) {
478 // if it is a surrogate pair we move up one character
479 char prevch = (char)m_source_.previous();
480 if (!UTF16.isLeadSurrogate(prevch)) {
481 m_source_.setIndex(offset); // go back to the same index
485 // could be part of a contraction
486 // backup to a safe point and iterate till we pass offset
487 while (m_source_.getIndex() > 0) {
488 if (!m_collator_.isUnsafe(ch)) {
491 ch = (char)m_source_.previous();
493 updateInternalState();
495 while (m_source_.getIndex() <= offset) {
496 prevoffset = m_source_.getIndex();
499 m_source_.setIndex(prevoffset);
502 updateInternalState();
503 // direction code to prevent next and previous from returning a
504 // character if we are already at the ends
505 offset = m_source_.getIndex();
506 if (offset == 0/* m_source_.getBeginIndex() */) {
507 // preventing previous() from returning characters from the end of
508 // the string again if we are at the beginning
509 m_isForwards_ = false;
511 else if (offset == m_source_.getLength()) {
512 // preventing next() from returning characters from the start of
513 // the string again if we are at the end
514 m_isForwards_ = true;
519 * <p>Set a new source string for iteration, and reset the offset
520 * to the beginning of the text.</p>
522 * @param source the new source string for iteration.
525 public void setText(String source)
527 m_srcUtilIter_.setText(source);
528 m_source_ = m_srcUtilIter_;
529 updateInternalState();
533 * <p>Set a new source string iterator for iteration, and reset the
534 * offset to the beginning of the text.
536 * <p>The source iterator's integrity will be preserved since a new copy
537 * will be created for use.</p>
538 * @param source the new source string iterator for iteration.
541 public void setText(UCharacterIterator source)
543 m_srcUtilIter_.setText(source.getText());
544 m_source_ = m_srcUtilIter_;
545 updateInternalState();
549 * <p>Set a new source string iterator for iteration, and reset the
550 * offset to the beginning of the text.
552 * @param source the new source string iterator for iteration.
555 public void setText(CharacterIterator source)
557 m_source_ = new CharacterIteratorWrapper(source);
558 m_source_.setToStart();
559 updateInternalState();
562 // public miscellaneous methods -----------------------------------------
565 * Tests that argument object is equals to this CollationElementIterator.
566 * Iterators are equal if the objects uses the same RuleBasedCollator,
567 * the same source text and have the same current position in iteration.
568 * @param that object to test if it is equals to this
569 * CollationElementIterator
572 public boolean equals(Object that)
577 if (that instanceof CollationElementIterator) {
578 CollationElementIterator thatceiter
579 = (CollationElementIterator)that;
580 if (!m_collator_.equals(thatceiter.m_collator_)) {
584 return m_source_.getIndex() == thatceiter.m_source_.getIndex()
585 && m_source_.getText().equals(
586 thatceiter.m_source_.getText());
591 // package private constructors ------------------------------------------
593 private CollationElementIterator(RuleBasedCollator collator) {
594 m_utilStringBuffer_ = new StringBuilder();
595 m_collator_ = collator;
596 m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
597 m_buffer_ = new StringBuilder();
598 m_utilSpecialBackUp_ = new Backup();
599 if (collator.getDecomposition() != Collator.NO_DECOMPOSITION) {
600 m_nfcImpl_.getFCDTrie(); // ensure the FCD data is initialized
605 * <p>CollationElementIterator constructor. This takes a source
606 * string and a RuleBasedCollator. The iterator will walk through
607 * the source string based on the rules defined by the
608 * collator. If the source string is empty, NULLORDER will be
609 * returned on the first call to next().</p>
611 * @param source the source string.
612 * @param collator the RuleBasedCollator
615 CollationElementIterator(String source, RuleBasedCollator collator)
618 m_source_ = m_srcUtilIter_ = new StringUCharacterIterator(source);
619 updateInternalState();
623 * <p>CollationElementIterator constructor. This takes a source
624 * character iterator and a RuleBasedCollator. The iterator will
625 * walk through the source string based on the rules defined by
626 * the collator. If the source string is empty, NULLORDER will be
627 * returned on the first call to next().</p>
629 * @param source the source string iterator.
630 * @param collator the RuleBasedCollator
633 CollationElementIterator(CharacterIterator source,
634 RuleBasedCollator collator)
637 m_srcUtilIter_ = new StringUCharacterIterator();
638 m_source_ = new CharacterIteratorWrapper(source);
639 updateInternalState();
643 * <p>CollationElementIterator constructor. This takes a source
644 * character iterator and a RuleBasedCollator. The iterator will
645 * walk through the source string based on the rules defined by
646 * the collator. If the source string is empty, NULLORDER will be
647 * returned on the first call to next().</p>
649 * @param source the source string iterator.
650 * @param collator the RuleBasedCollator
653 CollationElementIterator(UCharacterIterator source,
654 RuleBasedCollator collator)
657 m_srcUtilIter_ = new StringUCharacterIterator();
658 m_srcUtilIter_.setText(source.getText());
659 m_source_ = m_srcUtilIter_;
660 updateInternalState();
663 // package private data members -----------------------------------------
666 * true if current codepoint was Hiragana
668 boolean m_isCodePointHiragana_;
670 * Position in the original string that starts with a non-FCD sequence
674 * This is the CE from CEs buffer that should be returned.
675 * Initial value is 0.
676 * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
677 * backwards will end with m_CEBufferOffset_ == 0.
678 * The next/previous after we reach the end/beginning of the m_CEBuffer_
679 * will cause this value to be reset to 0.
681 int m_CEBufferOffset_;
684 * This is the position to which we have stored processed CEs.
685 * Initial value is 0.
686 * The next/previous after we reach the end/beginning of the m_CEBuffer_
687 * will cause this value to be reset to 0.
690 static final int CE_NOT_FOUND_ = 0xF0000000;
691 static final int CE_EXPANSION_TAG_ = 1;
692 static final int CE_CONTRACTION_TAG_ = 2;
694 * Collate Digits As Numbers (CODAN) implementation
696 static final int CE_DIGIT_TAG_ = 13;
698 // package private methods ----------------------------------------------
701 * Sets the collator used.
702 * Internal use, all data members will be reset to the default values
703 * @param collator to set
705 void setCollator(RuleBasedCollator collator)
707 m_collator_ = collator;
708 updateInternalState();
712 * <p>Sets the iterator to point to the collation element corresponding to
713 * the specified character (the parameter is a CHARACTER offset in the
714 * original string, not an offset into its corresponding sequence of
715 * collation elements). The value returned by the next call to next()
716 * will be the collation element corresponding to the specified position
717 * in the text. Unlike the public method setOffset(int), this method does
718 * not try to readjust the offset to the start of a contracting sequence.
719 * getOffset() is guaranteed to return the same value as was passed to a
720 * preceding call to setOffset().</p>
721 * @param offset new character offset into the original text to set.
723 void setExactOffset(int offset)
725 m_source_.setIndex(offset);
726 updateInternalState();
730 * Checks if iterator is in the buffer zone
731 * @return true if iterator is in buffer zone, false otherwise
735 return m_bufferOffset_ > 0;
740 * <p>Sets the iterator to point to the collation element corresponding to
741 * the specified character (the parameter is a CHARACTER offset in the
742 * original string, not an offset into its corresponding sequence of
743 * collation elements). The value returned by the next call to next()
744 * will be the collation element corresponding to the specified position
745 * in the text. Unlike the public method setOffset(int), this method does
746 * not try to readjust the offset to the start of a contracting sequence.
747 * getOffset() is guaranteed to return the same value as was passed to a
748 * preceding call to setOffset().</p>
750 * @param source the new source string iterator for iteration.
751 * @param offset to the source
753 void setText(UCharacterIterator source, int offset)
755 m_srcUtilIter_.setText(source.getText());
756 m_source_ = m_srcUtilIter_;
757 m_source_.setIndex(offset);
758 updateInternalState();
761 // private inner class --------------------------------------------------
766 private static final class Backup
768 // protected data members -------------------------------------------
771 * Backup non FCD sequence limit
773 protected int m_FCDLimit_;
775 * Backup non FCD sequence start
777 protected int m_FCDStart_;
779 * Backup if previous Codepoint is Hiragana quatenary
781 protected boolean m_isCodePointHiragana_;
783 * Backup buffer position
785 protected int m_bufferOffset_;
787 * Backup source iterator offset
789 protected int m_offset_;
791 * Backup buffer contents
793 protected StringBuffer m_buffer_;
795 // protected constructor --------------------------------------------
802 m_buffer_ = new StringBuffer();
805 // end inner class ------------------------------------------------------
808 * Direction of travel
810 private boolean m_isForwards_;
812 * Source string iterator
814 private UCharacterIterator m_source_;
816 * This is position to the m_buffer_, -1 if iterator is not in m_buffer_
818 private int m_bufferOffset_;
820 * Buffer for temporary storage of normalized characters, discontiguous
821 * characters and Thai characters
823 private StringBuilder m_buffer_;
825 * Position in the original string to continue forward FCD check from.
827 private int m_FCDLimit_;
829 * The collator this iterator is based on
831 private RuleBasedCollator m_collator_;
833 * true if Hiragana quatenary is on
835 //private boolean m_isHiragana4_;
839 private int m_CEBuffer_[];
841 * In reality we should not have to deal with expansion sequences longer
842 * then 16. However this value can be change if a bigger buffer is needed.
843 * Note, if the size is change to too small a number, BIG trouble.
844 * Reasonable small value is around 10, if there's no Arabic or other
845 * funky collations that have long expansion sequence. This is the longest
846 * expansion sequence this can handle without bombing out.
848 private static final int CE_BUFFER_INIT_SIZE_ = 512;
850 * Backup storage for special processing inner cases
852 private Backup m_utilSpecialBackUp_;
854 * Backup storage in special processing entry state
856 private Backup m_utilSpecialEntryBackUp_;
858 * Backup storage in special processing discontiguous state
860 private Backup m_utilSpecialDiscontiguousBackUp_;
864 private StringUCharacterIterator m_srcUtilIter_;
865 private StringBuilder m_utilStringBuffer_;
866 private StringBuilder m_utilSkippedBuffer_;
867 private CollationElementIterator m_utilColEIter_;
868 private static final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
869 private StringBuilder m_unnormalized_;
870 private Normalizer2Impl.ReorderingBuffer m_n2Buffer_;
872 * The first non-zero combining class character
874 private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
876 * One character before the first character with leading non-zero combining
879 private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
881 * Mask for the last byte
883 private static final int LAST_BYTE_MASK_ = 0xFF;
885 * Shift value for the second last byte
887 private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
889 // special ce values and tags -------------------------------------------
891 // private static final int CE_EXPANSION_ = 0xF1000000;
892 private static final int CE_CONTRACTION_ = 0xF2000000;
894 * Indicates the last ce has been consumed. Compare with NULLORDER.
895 * NULLORDER is returned if error occurs.
897 /* private static final int CE_NO_MORE_CES_ = 0x00010101;
898 private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
899 private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
900 private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
902 private static final int CE_NOT_FOUND_TAG_ = 0;
904 * Charset processing, not yet implemented
906 private static final int CE_CHARSET_TAG_ = 4;
910 private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
914 private static final int CE_LEAD_SURROGATE_TAG_ = 7;
918 private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
920 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
922 private static final int CE_CJK_IMPLICIT_TAG_ = 9;
923 private static final int CE_IMPLICIT_TAG_ = 10;
924 static final int CE_SPEC_PROC_TAG_ = 11;
926 * This is a 3 byte primary with starting secondaries and tertiaries.
927 * It fits in a single 32 bit CE and is used instead of expansion to save
928 * space without affecting the performance (hopefully).
930 private static final int CE_LONG_PRIMARY_TAG_ = 12;
932 // private static final int CE_CE_TAGS_COUNT = 14;
933 private static final int CE_BYTE_COMMON_ = 0x05;
935 // end special ce values and tags ---------------------------------------
937 private static final int HANGUL_SBASE_ = 0xAC00;
938 private static final int HANGUL_LBASE_ = 0x1100;
939 private static final int HANGUL_VBASE_ = 0x1161;
940 private static final int HANGUL_TBASE_ = 0x11A7;
941 private static final int HANGUL_VCOUNT_ = 21;
942 private static final int HANGUL_TCOUNT_ = 28;
944 // CJK stuff ------------------------------------------------------------
946 /* private static final int CJK_BASE_ = 0x4E00;
947 private static final int CJK_LIMIT_ = 0x9FFF+1;
948 private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
949 private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
950 private static final int CJK_A_BASE_ = 0x3400;
951 private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
952 private static final int CJK_B_BASE_ = 0x20000;
953 private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
954 private static final int NON_CJK_OFFSET_ = 0x110000;
956 private static final boolean DEBUG = ICUDebug.enabled("collator");
958 // private methods ------------------------------------------------------
961 * Reset the iterator internally
963 private void updateInternalState()
965 m_isCodePointHiragana_ = false;
966 m_buffer_.setLength(0);
967 m_bufferOffset_ = -1;
968 m_CEBufferOffset_ = 0;
971 m_FCDStart_ = m_source_.getLength();
972 //m_isHiragana4_ = m_collator_.m_isHiragana4_;
973 m_isForwards_ = true;
977 * Backup the current internal state
978 * @param backup object to store the data
980 private void backupInternalState(Backup backup)
982 backup.m_offset_ = m_source_.getIndex();
983 backup.m_FCDLimit_ = m_FCDLimit_;
984 backup.m_FCDStart_ = m_FCDStart_;
985 backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
986 backup.m_bufferOffset_ = m_bufferOffset_;
987 backup.m_buffer_.setLength(0);
988 if (m_bufferOffset_ >= 0) {
989 backup.m_buffer_.append(m_buffer_);
994 * Update the iterator internally with backed-up state
995 * @param backup object that stored the data
997 private void updateInternalState(Backup backup)
999 m_source_.setIndex(backup.m_offset_);
1000 m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
1001 m_bufferOffset_ = backup.m_bufferOffset_;
1002 m_FCDLimit_ = backup.m_FCDLimit_;
1003 m_FCDStart_ = backup.m_FCDStart_;
1004 m_buffer_.setLength(0);
1005 if (m_bufferOffset_ >= 0) {
1006 m_buffer_.append(backup.m_buffer_);
1011 * A fast combining class retrieval system.
1012 * @param ch UTF16 character
1013 * @return combining class of ch
1015 private int getCombiningClass(int ch)
1017 if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
1018 m_collator_.isUnsafe((char)ch) || ch > 0xFFFF
1020 return m_nfcImpl_.getCC(m_nfcImpl_.getNorm16(ch));
1026 * <p>Incremental normalization, this is an essential optimization.
1027 * Assuming FCD checks has been done, normalize the non-FCD characters into
1029 * Source offsets points to the current processing character.
1032 private void normalize()
1034 if (m_unnormalized_ == null) {
1035 m_unnormalized_ = new StringBuilder();
1036 m_n2Buffer_ = new Normalizer2Impl.ReorderingBuffer(m_nfcImpl_, m_buffer_, 10);
1038 m_unnormalized_.setLength(0);
1039 m_n2Buffer_.remove();
1041 int size = m_FCDLimit_ - m_FCDStart_;
1042 m_source_.setIndex(m_FCDStart_);
1043 for (int i = 0; i < size; i ++) {
1044 m_unnormalized_.append((char)m_source_.next());
1046 m_nfcImpl_.decomposeShort(m_unnormalized_, 0, size, m_n2Buffer_);
1050 * <p>Incremental FCD check and normalization. Gets the next base character
1051 * position and determines if the in-between characters needs normalization.
1053 * <p>When entering, the state is known to be this:
1055 * <li>We are working on source string, not the buffer.
1056 * <li>The leading combining class from the current character is 0 or the
1057 * trailing combining class of the previous char was zero.
1059 * Incoming source offsets points to the current processing character.
1060 * Return source offsets points to the current processing character.
1062 * @param ch current character (lead unit)
1063 * @param offset offset of ch +1
1064 * @return true if FCDCheck passes, false otherwise
1066 private boolean FCDCheck(int ch, int offset)
1068 boolean result = true;
1070 // Get the trailing combining class of the current character.
1071 // If it's zero, we are OK.
1072 m_FCDStart_ = offset - 1;
1073 m_source_.setIndex(offset);
1075 int fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch);
1076 if (fcd != 0 && Character.isHighSurrogate((char)ch)) {
1077 int c2 = m_source_.next();
1079 fcd = 0; // end of input
1080 } else if (Character.isLowSurrogate((char)c2)) {
1081 fcd = m_nfcImpl_.getFCD16(Character.toCodePoint((char)ch, (char)c2));
1083 m_source_.moveIndex(-1);
1088 int prevTrailCC = fcd & LAST_BYTE_MASK_;
1090 if (prevTrailCC == 0) {
1091 offset = m_source_.getIndex();
1093 // The current char has a non-zero trailing CC. Scan forward until
1094 // we find a char with a leading cc of zero.
1096 ch = m_source_.nextCodePoint();
1098 offset = m_source_.getIndex();
1102 fcd = m_nfcImpl_.getFCD16(ch);
1103 int leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1105 // this is a base character, we stop the FCD checks
1106 offset = m_source_.getIndex() - Character.charCount(ch);
1110 if (leadCC < prevTrailCC) {
1114 prevTrailCC = fcd & LAST_BYTE_MASK_;
1117 m_FCDLimit_ = offset;
1118 m_source_.setIndex(m_FCDStart_ + 1);
1123 * <p>Method tries to fetch the next character that is in fcd form.</p>
1124 * <p>Normalization is done if required.</p>
1125 * <p>Offsets are returned at the next character.</p>
1126 * @return next fcd character
1128 private int nextChar()
1132 // loop handles the next character whether it is in the buffer or not.
1133 if (m_bufferOffset_ < 0) {
1134 // we're working on the source and not normalizing. fast path.
1135 // note Thai pre-vowel reordering uses buffer too
1136 result = m_source_.next();
1139 // we are in the buffer, buffer offset will never be 0 here
1140 if (m_bufferOffset_ >= m_buffer_.length()) {
1141 // Null marked end of buffer, revert to the source string and
1142 // loop back to top to try again to get a character.
1143 m_source_.setIndex(m_FCDLimit_);
1144 m_bufferOffset_ = -1;
1145 m_buffer_.setLength(0);
1148 return m_buffer_.charAt(m_bufferOffset_ ++);
1150 int startoffset = m_source_.getIndex();
1151 if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
1152 // Fast fcd safe path. trail combining class == 0.
1153 || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1154 || m_bufferOffset_ >= 0 || m_FCDLimit_ >= startoffset) {
1155 // skip the fcd checks
1159 if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1160 // We need to peek at the next character in order to tell if we are
1162 int next = m_source_.current();
1163 if (next == UCharacterIterator.DONE
1164 || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1165 return result; // end of source string and if next character
1166 // starts with a base character is always fcd.
1170 // Need a more complete FCD check and possible normalization.
1171 if (!FCDCheck(result, startoffset)) {
1173 result = m_buffer_.charAt(0);
1174 m_bufferOffset_ = 1;
1180 * <p>Incremental normalization, this is an essential optimization.
1181 * Assuming FCD checks has been done, normalize the non-FCD characters into
1183 * Source offsets points to the current processing character.</p>
1185 private void normalizeBackwards()
1188 m_bufferOffset_ = m_buffer_.length();
1192 * <p>Incremental backwards FCD check and normalization. Gets the previous
1193 * base character position and determines if the in-between characters
1194 * needs normalization.
1196 * <p>When entering, the state is known to be this:
1198 * <li>We are working on source string, not the buffer.
1199 * <li>The trailing combining class from the current character is 0 or the
1200 * leading combining class of the next char was zero.
1202 * Input source offsets points to the previous character.
1203 * Return source offsets points to the current processing character.
1205 * @param ch current character
1206 * @param offset current character offset
1207 * @return true if FCDCheck passes, false otherwise
1209 private boolean FCDCheckBackwards(int ch, int offset)
1212 m_FCDLimit_ = offset + 1;
1213 m_source_.setIndex(offset);
1214 if (!UTF16.isSurrogate((char)ch)) {
1215 fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch);
1218 if (!Normalizer2Impl.UTF16Plus.isSurrogateLead(ch)) {
1219 int c2 = m_source_.previous();
1222 } else if (Character.isHighSurrogate((char)c2)) {
1223 ch = Character.toCodePoint((char)c2, (char)ch);
1224 fcd = m_nfcImpl_.getFCD16(ch);
1227 m_source_.moveIndex(1);
1232 // Scan backward until we find a char with a leading cc of zero.
1233 boolean result = true;
1237 leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1238 if (leadCC == 0 || (ch = m_source_.previousCodePoint()) < 0) {
1239 offset = m_source_.getIndex();
1242 fcd = m_nfcImpl_.getFCD16(ch);
1243 int prevTrailCC = fcd & LAST_BYTE_MASK_;
1244 if (leadCC < prevTrailCC) {
1246 } else if (fcd == 0) {
1247 offset = m_source_.getIndex() + Character.charCount(ch);
1253 // storing character with 0 lead fcd or the 1st accent with a base
1254 // character before it
1255 m_FCDStart_ = offset;
1256 m_source_.setIndex(m_FCDLimit_);
1261 * <p>Method tries to fetch the previous character that is in fcd form.</p>
1262 * <p>Normalization is done if required.</p>
1263 * <p>Offsets are returned at the current character.</p>
1264 * @return previous fcd character
1266 private int previousChar()
1268 if (m_bufferOffset_ >= 0) {
1270 if (m_bufferOffset_ >= 0) {
1271 return m_buffer_.charAt(m_bufferOffset_);
1274 // At the start of buffer, route back to string.
1275 m_buffer_.setLength(0);
1276 if (m_FCDStart_ == 0) {
1278 m_source_.setIndex(0);
1279 return UCharacterIterator.DONE;
1282 m_FCDLimit_ = m_FCDStart_;
1283 m_source_.setIndex(m_FCDStart_);
1284 return previousChar();
1288 int result = m_source_.previous();
1289 int startoffset = m_source_.getIndex();
1290 if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
1291 || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1292 || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
1295 int ch = m_source_.previous();
1296 if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1297 // if previous character is FCD
1301 // Need a more complete FCD check and possible normalization.
1302 if (!FCDCheckBackwards(result, startoffset)) {
1303 normalizeBackwards();
1305 result = m_buffer_.charAt(m_bufferOffset_);
1308 // fcd checks always reset m_source_ to the limit of the FCD
1309 m_source_.setIndex(startoffset);
1315 * Determines if it is at the start of source iteration
1316 * @return true if iterator at the start, false otherwise
1318 private final boolean isBackwardsStart()
1320 return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
1321 || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
1325 * Checks if iterator is at the end of its source string.
1326 * @return true if it is at the end, false otherwise
1328 private final boolean isEnd()
1330 if (m_bufferOffset_ >= 0) {
1331 if (m_bufferOffset_ != m_buffer_.length()) {
1335 // at end of buffer. check if fcd is at the end
1336 return m_FCDLimit_ == m_source_.getLength();
1339 return m_source_.getLength() == m_source_.getIndex();
1343 * <p>Special CE management for surrogates</p>
1344 * <p>Lead surrogate is encountered. CE to be retrieved by using the
1345 * following code unit. If the next code unit is a trail surrogate, both
1346 * units will be combined to retrieve the CE,
1347 * otherwise we treat it like an unassigned code point.</p>
1348 * @param collator collator to use
1349 * @param ce current CE
1350 * @param trail character
1351 * @return next CE for the surrogate characters
1353 private final int nextSurrogate(RuleBasedCollator collator, int ce,
1356 if (!UTF16.isTrailSurrogate(trail)) {
1357 updateInternalState(m_utilSpecialBackUp_);
1358 return CE_NOT_FOUND_;
1360 // TODO: CE contain the data from the previous CE + the mask.
1361 // It should at least be unmasked
1362 int result = collator.m_trie_.getTrailValue(ce, trail);
1363 if (result == CE_NOT_FOUND_) {
1364 updateInternalState(m_utilSpecialBackUp_);
1370 * Gets the CE expansion offset
1371 * @param collator current collator
1372 * @param ce ce to test
1373 * @return expansion offset
1375 private int getExpansionOffset(RuleBasedCollator collator, int ce)
1377 return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
1382 * Gets the contraction ce offset
1383 * @param collator current collator
1384 * @param ce current ce
1385 * @return contraction offset
1387 private int getContractionOffset(RuleBasedCollator collator, int ce)
1389 return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
1393 * Checks if CE is a special tag CE
1394 * @param ce to check
1395 * @return true if CE is a special tag CE, false otherwise
1397 private boolean isSpecialPrefixTag(int ce)
1399 return RuleBasedCollator.isSpecial(ce) &&
1400 RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
1404 * <p>Special processing getting a CE that is preceded by a certain
1406 * <p>Used for optimizing Japanese length and iteration marks. When a
1407 * special processing tag is encountered, iterate backwards to see if
1408 * there's a match.</p>
1409 * <p>Contraction tables are used, prefix data is stored backwards in the
1411 * @param collator collator to use
1412 * @param ce current ce
1413 * @param entrybackup entry backup iterator status
1414 * @return next collation element
1416 private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
1419 backupInternalState(m_utilSpecialBackUp_);
1420 updateInternalState(entrybackup);
1422 // We want to look at the character where we entered
1425 // This loop will run once per source string character, for as
1426 // long as we are matching a potential contraction sequence
1427 // First we position ourselves at the begining of contraction
1429 int entryoffset = getContractionOffset(collator, ce);
1430 int offset = entryoffset;
1431 if (isBackwardsStart()) {
1432 ce = collator.m_contractionCE_[offset];
1435 char previous = (char)previousChar();
1436 while (previous > collator.m_contractionIndex_[offset]) {
1437 // contraction characters are ordered, skip smaller characters
1441 if (previous == collator.m_contractionIndex_[offset]) {
1442 // Found the source string char in the table.
1443 // Pick up the corresponding CE from the table.
1444 ce = collator.m_contractionCE_[offset];
1447 // Source string char was not in the table, prefix not found
1448 ce = collator.m_contractionCE_[entryoffset];
1451 if (!isSpecialPrefixTag(ce)) {
1452 // The source string char was in the contraction table, and
1453 // the corresponding CE is not a prefix CE. We found the
1454 // prefix, break out of loop, this CE will end up being
1455 // returned. This is the normal way out of prefix handling
1456 // when the source actually contained the prefix.
1460 if (ce != CE_NOT_FOUND_) {
1461 // we found something and we can merilly continue
1462 updateInternalState(m_utilSpecialBackUp_);
1464 else { // prefix search was a failure, we have to backup all the way to
1466 updateInternalState(entrybackup);
1472 * Checks if the ce is a contraction tag
1473 * @param ce ce to check
1474 * @return true if ce is a contraction tag, false otherwise
1476 private boolean isContractionTag(int ce)
1478 return RuleBasedCollator.isSpecial(ce) &&
1479 RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
1483 * Method to copy skipped characters into the buffer and sets the fcd
1484 * position. To ensure that the skipped characters are considered later,
1485 * we need to place it in the appropriate position in the buffer and
1486 * reassign the source index. simple case if index reside in string,
1487 * simply copy to buffer and fcdposition = pos, pos = start of buffer.
1488 * if pos in normalization buffer, we'll insert the copy infront of pos
1489 * and point pos to the start of the buffer. why am i doing these copies?
1490 * well, so that the whole chunk of codes in the getNextCE,
1491 * ucol_prv_getSpecialCE does not require any changes, which will be
1493 * @param skipped character buffer
1495 private void setDiscontiguous(StringBuilder skipped)
1497 if (m_bufferOffset_ >= 0) {
1498 m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
1501 m_FCDLimit_ = m_source_.getIndex();
1502 m_buffer_.setLength(0);
1503 m_buffer_.append(skipped.toString());
1506 m_bufferOffset_ = 0;
1510 * Returns the current character for forward iteration
1511 * @return current character
1513 private int currentChar()
1515 if (m_bufferOffset_ < 0) {
1516 m_source_.previousCodePoint();
1517 return m_source_.nextCodePoint();
1520 // m_bufferOffset_ is never 0 in normal circumstances except after a
1521 // discontiguous contraction since it is always returned and moved
1522 // by 1 when we do nextChar()
1523 return UTF16.charAt(m_buffer_, m_bufferOffset_ - 1);
1527 * Method to get the discontiguous collation element within the source.
1528 * Note this function will set the position to the appropriate places.
1529 * Passed in character offset points to the second combining character
1530 * after the start character.
1531 * @param collator current collator used
1532 * @param entryoffset index to the start character in the contraction table
1533 * @return discontiguous collation element offset
1535 private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
1537 int offset = entryoffset;
1538 boolean multicontraction = false;
1539 // since it will be stuffed into this iterator and ran over again
1540 if (m_utilSkippedBuffer_ == null) {
1541 m_utilSkippedBuffer_ = new StringBuilder();
1544 m_utilSkippedBuffer_.setLength(0);
1546 int ch = currentChar();
1547 m_utilSkippedBuffer_.appendCodePoint(ch);
1549 int cc = getCombiningClass(ch);
1550 // accent after the first character
1551 if (m_utilSpecialDiscontiguousBackUp_ == null) {
1552 m_utilSpecialDiscontiguousBackUp_ = new Backup();
1554 backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1555 boolean prevWasLead = false;
1557 // We read code units for contraction table matching
1558 // but have to get combining classes for code points
1559 // to figure out where to stop with discontiguous contraction.
1560 int ch_int = nextChar();
1561 char nextch = (char)ch_int;
1562 if (UTF16.isSurrogate(nextch)) {
1564 // trail surrogate of surrogate pair, keep previous and current cc
1565 prevWasLead = false;
1568 cc = 0; // default cc for an unpaired surrogate
1569 prevWasLead = false;
1570 if (Character.isHighSurrogate(nextch)) {
1571 int trail = nextChar();
1572 if (Character.isLowSurrogate((char)trail)) {
1573 cc = getCombiningClass(Character.toCodePoint(nextch, (char)trail));
1577 previousChar(); // restore index after having peeked at the next code unit
1583 cc = getCombiningClass(ch_int);
1584 prevWasLead = false;
1586 if (ch_int < 0 || cc == 0) {
1587 // if there are no more accents to move around
1588 // we don't have to shift previousChar, since we are resetting
1590 if (multicontraction) {
1592 previousChar(); // backtrack
1594 setDiscontiguous(m_utilSkippedBuffer_);
1595 return collator.m_contractionCE_[offset];
1600 offset ++; // skip the combining class offset
1601 while ((offset < collator.m_contractionIndex_.length) &&
1602 (nextch > collator.m_contractionIndex_[offset])) {
1606 int ce = CE_NOT_FOUND_;
1607 if ( offset >= collator.m_contractionIndex_.length) {
1610 if (nextch != collator.m_contractionIndex_[offset] || cc == prevCC) {
1611 // unmatched or blocked character
1612 if ( (m_utilSkippedBuffer_.length()!= 1) ||
1613 ((m_utilSkippedBuffer_.charAt(0)!= nextch) &&
1614 (m_bufferOffset_<0) )) { // avoid push to skipped buffer twice
1615 m_utilSkippedBuffer_.append(nextch);
1617 offset = entryoffset; // Restore the offset before checking next character.
1621 ce = collator.m_contractionCE_[offset];
1624 if (ce == CE_NOT_FOUND_) {
1627 else if (isContractionTag(ce)) {
1628 // this is a multi-contraction
1629 offset = getContractionOffset(collator, ce);
1630 if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
1631 multicontraction = true;
1632 backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1636 setDiscontiguous(m_utilSkippedBuffer_);
1641 updateInternalState(m_utilSpecialDiscontiguousBackUp_);
1642 // backup is one forward of the base character, we need to move back
1645 return collator.m_contractionCE_[entryoffset];
1649 * Gets the next contraction ce
1650 * @param collator collator to use
1651 * @param ce current ce
1652 * @return ce of the next contraction
1654 private int nextContraction(RuleBasedCollator collator, int ce)
1656 backupInternalState(m_utilSpecialBackUp_);
1657 int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
1659 int entryoffset = getContractionOffset(collator, ce);
1660 int offset = entryoffset;
1663 ce = collator.m_contractionCE_[offset];
1664 if (ce == CE_NOT_FOUND_) {
1665 // back up the source over all the chars we scanned going
1666 // into this contraction.
1668 updateInternalState(m_utilSpecialBackUp_);
1673 // get the discontiguos maximum combining class
1674 int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
1675 // checks if all characters have the same combining class
1676 byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
1677 char ch = (char)nextChar();
1679 while (ch > collator.m_contractionIndex_[offset]) {
1680 // contraction characters are ordered, skip all smaller
1684 if (ch == collator.m_contractionIndex_[offset]) {
1685 // Found the source string char in the contraction table.
1686 // Pick up the corresponding CE from the table.
1687 ce = collator.m_contractionCE_[offset];
1690 // Source string char was not in contraction table.
1691 // Unless it is a discontiguous contraction, we are done
1693 // ticket 8484 - porting changes from C for 6101
1694 // We test whether the next two char are surrogate pairs.
1695 // This test is done if the iterator is not in the end.
1696 // If there is no surrogate pair, the iterator
1697 // goes back one if needed.
1698 if(UTF16.isLeadSurrogate(ch) && !isEnd()) {
1699 char surrNextChar = (char)nextChar();
1700 if (UTF16.isTrailSurrogate(surrNextChar)) {
1701 miss = UCharacterProperty.getRawSupplementary(ch, surrNextChar);
1707 if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
1708 || sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
1710 // Contraction can not be discontiguous, back up by one
1715 ce = collator.m_contractionCE_[entryoffset];
1718 // Contraction is possibly discontiguous.
1719 // find the next character if ch is not a base character
1720 int ch_int = nextChar();
1721 if (ch_int != UCharacterIterator.DONE) {
1724 char nextch = (char)ch_int;
1725 if (getCombiningClass(nextch) == 0) {
1730 // base character not part of discontiguous contraction
1731 ce = collator.m_contractionCE_[entryoffset];
1734 ce = nextDiscontiguous(collator, entryoffset);
1739 if (ce == CE_NOT_FOUND_) {
1740 // source did not match the contraction, revert back original
1741 updateInternalState(m_utilSpecialBackUp_);
1746 // source was a contraction
1747 if (!isContractionTag(ce)) {
1751 // ccontinue looping to check for the remaining contraction.
1752 if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
1753 // there are further contractions to be performed, so we store
1754 // the so-far completed ce, so that if we fail in the next
1755 // round we just return this one.
1756 entryce = collator.m_contractionCE_[entryoffset];
1757 backupInternalState(m_utilSpecialBackUp_);
1758 if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
1759 m_utilSpecialBackUp_.m_bufferOffset_ --;
1762 m_utilSpecialBackUp_.m_offset_ --;
1770 * Gets the next ce for long primaries, stuffs the rest of the collation
1771 * elements into the ce buffer
1772 * @param ce current ce
1775 private int nextLongPrimary(int ce)
1777 m_CEBuffer_[1] = ((ce & 0xFF) << 24)
1778 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1779 m_CEBufferOffset_ = 1;
1780 m_CEBufferSize_ = 2;
1781 m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
1783 return m_CEBuffer_[0];
1787 * Gets the number of expansion
1788 * @param ce current ce
1789 * @return number of expansion
1791 private int getExpansionCount(int ce)
1797 * Gets the next expansion ce and stuffs the rest of the collation elements
1798 * into the ce buffer
1799 * @param collator current collator
1800 * @param ce current ce
1801 * @return next expansion ce
1803 private int nextExpansion(RuleBasedCollator collator, int ce)
1805 // NOTE: we can encounter both continuations and expansions in an
1807 // I have to decide where continuations are going to be dealt with
1808 int offset = getExpansionOffset(collator, ce);
1809 m_CEBufferSize_ = getExpansionCount(ce);
1810 m_CEBufferOffset_ = 1;
1811 m_CEBuffer_[0] = collator.m_expansion_[offset];
1812 if (m_CEBufferSize_ != 0) {
1813 // if there are less than 16 elements in expansion
1814 for (int i = 1; i < m_CEBufferSize_; i ++) {
1815 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
1819 // ce are terminated
1820 m_CEBufferSize_ = 1;
1821 while (collator.m_expansion_[offset] != 0) {
1822 m_CEBuffer_[m_CEBufferSize_ ++] =
1823 collator.m_expansion_[++ offset];
1826 // in case of one element expansion, we
1827 // want to immediately return CEpos
1828 if (m_CEBufferSize_ == 1) {
1829 m_CEBufferSize_ = 0;
1830 m_CEBufferOffset_ = 0;
1832 return m_CEBuffer_[0];
1836 * Gets the next digit ce
1837 * @param collator current collator
1838 * @param ce current collation element
1839 * @param cp current codepoint
1840 * @return next digit ce
1842 private int nextDigit(RuleBasedCollator collator, int ce, int cp)
1844 // We do a check to see if we want to collate digits as numbers;
1845 // if so we generate a custom collation key. Otherwise we pull out
1846 // the value stored in the expansion table.
1848 if (m_collator_.m_isNumericCollation_){
1850 int trailingZeroIndex = 0;
1851 boolean nonZeroValReached = false;
1853 // I just need a temporary place to store my generated CEs.
1854 // icu4c uses a unsigned byte array, i'll use a stringbuffer here
1855 // to avoid dealing with the sign problems and array allocation
1856 // clear and set initial string buffer length
1857 m_utilStringBuffer_.setLength(3);
1859 // We parse the source string until we hit a char that's NOT a
1861 // Use this u_charDigitValue. This might be slow because we have
1862 // to handle surrogates...
1863 int digVal = UCharacter.digit(cp);
1864 // if we have arrived here, we have already processed possible
1865 // supplementaries that trigered the digit tag -
1866 // all supplementaries are marked in the UCA.
1867 // We pad a zero in front of the first element anyways.
1868 // This takes care of the (probably) most common case where
1869 // people are sorting things followed by a single digit
1872 // Make sure we have enough space.
1873 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
1874 m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
1877 // Skipping over leading zeroes.
1878 if (digVal != 0 || nonZeroValReached) {
1879 if (digVal != 0 && !nonZeroValReached) {
1880 nonZeroValReached = true;
1882 // We parse the digit string into base 100 numbers
1883 // (this fits into a byte).
1884 // We only add to the buffer in twos, thus if we are
1885 // parsing an odd character, that serves as the
1886 // 'tens' digit while the if we are parsing an even
1887 // one, that is the 'ones' digit. We dumped the
1888 // parsed base 100 value (collateVal) into a buffer.
1889 // We multiply each collateVal by 2 (to give us room)
1890 // and add 5 (to avoid overlapping magic CE byte
1891 // values). The last byte we subtract 1 to ensure it is
1892 // less than all the other bytes.
1893 if (digIndx % 2 == 1) {
1894 collateVal += digVal;
1895 // This removes trailing zeroes.
1896 if (collateVal == 0 && trailingZeroIndex == 0) {
1897 trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
1899 else if (trailingZeroIndex != 0) {
1900 trailingZeroIndex = 0;
1902 m_utilStringBuffer_.setCharAt(
1903 ((digIndx - 1) >>> 1) + 2,
1904 (char)((collateVal << 1) + 6));
1908 // We drop the collation value into the buffer so if
1909 // we need to do a "front patch" we don't have to
1910 // check to see if we're hitting the last element.
1911 collateVal = digVal * 10;
1912 m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
1913 (char)((collateVal << 1) + 6));
1918 // Get next character.
1920 backupInternalState(m_utilSpecialBackUp_);
1921 int char32 = nextChar();
1922 char ch = (char)char32;
1923 if (UTF16.isLeadSurrogate(ch)){
1925 char trail = (char)nextChar();
1926 if (UTF16.isTrailSurrogate(trail)) {
1927 char32 = UCharacterProperty.getRawSupplementary(
1936 digVal = UCharacter.digit(char32);
1938 // Resetting position to point to the next unprocessed
1939 // char. We overshot it when doing our test/set for
1941 updateInternalState(m_utilSpecialBackUp_);
1950 if (nonZeroValReached == false){
1952 m_utilStringBuffer_.setCharAt(2, (char)6);
1955 int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
1956 : (digIndx >>> 1) + 2;
1957 if (digIndx % 2 != 0){
1958 // We missed a value. Since digIndx isn't even, stuck too many
1959 // values into the buffer (this is what we get for padding the
1960 // first byte with a zero). "Front-patch" now by pushing all
1962 // Doing it this way ensures that at least 50% of the time
1963 // (statistically speaking) we'll only be doing a single pass
1964 // and optimizes for strings with single digits. I'm just
1965 // assuming that's the more common case.
1966 for (int i = 2; i < endIndex; i ++){
1967 m_utilStringBuffer_.setCharAt(i,
1968 (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
1970 + (((m_utilStringBuffer_.charAt(i + 1) - 6)
1971 >>> 1) / 10) << 1) + 6));
1976 // Subtract one off of the last byte.
1977 m_utilStringBuffer_.setCharAt(endIndex - 1,
1978 (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
1980 // We want to skip over the first two slots in the buffer.
1981 // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
1982 // The second slot is for the sign/exponent byte:
1983 // 0x80 + (decimalPos/2) & 7f.
1984 m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
1985 m_utilStringBuffer_.setCharAt(1,
1986 (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
1988 // Now transfer the collation key to our collIterate struct.
1989 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
1990 ce = (((m_utilStringBuffer_.charAt(0) << 8)
1992 | m_utilStringBuffer_.charAt(1))
1993 << RuleBasedCollator.CE_PRIMARY_SHIFT_)
1995 | (RuleBasedCollator.BYTE_COMMON_
1996 << RuleBasedCollator.CE_SECONDARY_SHIFT_)
1997 | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
1998 int i = 2; // Reset the index into the buffer.
2000 m_CEBuffer_[0] = ce;
2001 m_CEBufferSize_ = 1;
2002 m_CEBufferOffset_ = 1;
2003 while (i < endIndex)
2005 int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
2007 primWeight |= m_utilStringBuffer_.charAt(i ++);
2009 m_CEBuffer_[m_CEBufferSize_ ++]
2010 = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2011 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2016 // no numeric mode, we'll just switch to whatever we stashed and
2018 // find the offset to expansion table
2019 return collator.m_expansion_[getExpansionOffset(collator, ce)];
2023 * Gets the next implicit ce for codepoints
2024 * @param codepoint current codepoint
2025 * @return implicit ce
2027 private int nextImplicit(int codepoint)
2029 int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2030 m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
2032 m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2033 m_CEBufferOffset_ = 1;
2034 m_CEBufferSize_ = 2;
2035 return m_CEBuffer_[0];
2039 * Returns the next ce associated with the following surrogate characters
2040 * @param ch current character
2043 private int nextSurrogate(char ch)
2045 int ch_int = nextChar();
2046 char nextch = (char)ch_int;
2047 if (ch_int != CharacterIterator.DONE &&
2048 UTF16.isTrailSurrogate(nextch)) {
2049 int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
2050 return nextImplicit(codepoint);
2052 if (nextch != CharacterIterator.DONE) {
2053 previousChar(); // reverts back to the original position
2055 return CE_NOT_FOUND_; // treat like unassigned
2059 * Returns the next ce for a hangul character, this is an implicit
2061 * @param collator current collator
2062 * @param ch current character
2065 private int nextHangul(RuleBasedCollator collator, char ch)
2067 char L = (char)(ch - HANGUL_SBASE_);
2069 // divide into pieces
2070 // do it in this order since some compilers can do % and / in one
2072 char T = (char)(L % HANGUL_TCOUNT_);
2073 L /= HANGUL_TCOUNT_;
2074 char V = (char)(L % HANGUL_VCOUNT_);
2075 L /= HANGUL_VCOUNT_;
2082 // return the first CE, but first put the rest into the expansion
2084 m_CEBufferSize_ = 0;
2085 if (!m_collator_.m_isJamoSpecial_) { // FAST PATH
2086 m_CEBuffer_[m_CEBufferSize_ ++] =
2087 collator.m_trie_.getLeadValue(L);
2088 m_CEBuffer_[m_CEBufferSize_ ++] =
2089 collator.m_trie_.getLeadValue(V);
2091 if (T != HANGUL_TBASE_) {
2092 m_CEBuffer_[m_CEBufferSize_ ++] =
2093 collator.m_trie_.getLeadValue(T);
2095 m_CEBufferOffset_ = 1;
2096 return m_CEBuffer_[0];
2100 // Since Hanguls pass the FCD check, it is guaranteed that we
2101 // won't be in the normalization buffer if something like this
2103 // Move Jamos into normalization buffer
2104 m_buffer_.append(L);
2105 m_buffer_.append(V);
2106 if (T != HANGUL_TBASE_) {
2107 m_buffer_.append(T);
2109 m_bufferOffset_ = 0;
2110 m_FCDLimit_ = m_source_.getIndex();
2111 m_FCDStart_ = m_FCDLimit_ - 1;
2112 // Indicate where to continue in main input string after
2113 // exhausting the buffer
2119 * <p>Special CE management. Expansions, contractions etc...</p>
2120 * @param collator can be plain UCA
2121 * @param ce current ce
2122 * @param ch current character
2123 * @return next special ce
2125 private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
2128 Backup entrybackup = m_utilSpecialEntryBackUp_;
2129 // this is to handle recursive looping
2130 if (entrybackup != null) {
2131 m_utilSpecialEntryBackUp_ = null;
2134 entrybackup = new Backup();
2136 backupInternalState(entrybackup);
2137 try { // forces it to assign m_utilSpecialEntryBackup_
2139 // This loop will repeat only in the case of contractions,
2141 switch(RuleBasedCollator.getTag(ce)) {
2142 case CE_NOT_FOUND_TAG_:
2143 // impossible case for icu4j
2145 case RuleBasedCollator.CE_SURROGATE_TAG_:
2147 return CE_NOT_FOUND_;
2149 backupInternalState(m_utilSpecialBackUp_);
2150 char trail = (char)nextChar();
2151 ce = nextSurrogate(collator, ce, trail);
2152 // calculate the supplementary code point value,
2153 // if surrogate was not tailored we go one more round
2155 UCharacterProperty.getRawSupplementary(ch, trail);
2157 case CE_SPEC_PROC_TAG_:
2158 ce = nextSpecialPrefix(collator, ce, entrybackup);
2160 case CE_CONTRACTION_TAG_:
2161 ce = nextContraction(collator, ce);
2163 case CE_LONG_PRIMARY_TAG_:
2164 return nextLongPrimary(ce);
2165 case CE_EXPANSION_TAG_:
2166 return nextExpansion(collator, ce);
2168 ce = nextDigit(collator, ce, codepoint);
2170 // various implicits optimization
2171 case CE_CJK_IMPLICIT_TAG_:
2172 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2173 return nextImplicit(codepoint);
2174 case CE_IMPLICIT_TAG_: // everything that is not defined
2175 return nextImplicit(codepoint);
2176 case CE_TRAIL_SURROGATE_TAG_:
2177 return CE_NOT_FOUND_; // DC00-DFFF broken surrogate, treat like unassigned
2178 case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2179 return nextSurrogate(ch);
2180 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2181 return nextHangul(collator, ch);
2182 case CE_CHARSET_TAG_:
2183 // not yet implemented probably after 1.8
2184 return CE_NOT_FOUND_;
2187 // synwee todo, throw exception or something here.
2189 if (!RuleBasedCollator.isSpecial(ce)) {
2195 m_utilSpecialEntryBackUp_ = entrybackup;
2201 * Special processing is getting a CE that is preceded by a certain prefix.
2202 * Currently this is only needed for optimizing Japanese length and
2203 * iteration marks. When we encouter a special processing tag, we go
2204 * backwards and try to see if we have a match. Contraction tables are used
2205 * - so the whole process is not unlike contraction. prefix data is stored
2206 * backwards in the table.
2207 * @param collator current collator
2208 * @param ce current ce
2209 * @return previous ce
2211 private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
2213 backupInternalState(m_utilSpecialBackUp_);
2215 // position ourselves at the begining of contraction sequence
2216 int offset = getContractionOffset(collator, ce);
2217 int entryoffset = offset;
2218 if (isBackwardsStart()) {
2219 ce = collator.m_contractionCE_[offset];
2222 char prevch = (char)previousChar();
2223 while (prevch > collator.m_contractionIndex_[offset]) {
2224 // since contraction codepoints are ordered, we skip all that
2228 if (prevch == collator.m_contractionIndex_[offset]) {
2229 ce = collator.m_contractionCE_[offset];
2232 // if there is a completely ignorable code point in the middle
2233 // of a prefix, we need to act as if it's not there assumption:
2234 // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
2236 // lone surrogates cannot be set to zero as it would break
2238 int isZeroCE = collator.m_trie_.getLeadValue(prevch);
2239 // it's easy for BMP code points
2240 if (isZeroCE == 0) {
2243 else if (UTF16.isTrailSurrogate(prevch)
2244 || UTF16.isLeadSurrogate(prevch)) {
2245 // for supplementary code points, we have to check the next one
2246 // situations where we are going to ignore
2247 // 1. beginning of the string: schar is a lone surrogate
2248 // 2. schar is a lone surrogate
2249 // 3. schar is a trail surrogate in a valid surrogate
2250 // sequence that is explicitly set to zero.
2251 if (!isBackwardsStart()) {
2252 char lead = (char)previousChar();
2253 if (UTF16.isLeadSurrogate(lead)) {
2254 isZeroCE = collator.m_trie_.getLeadValue(lead);
2255 if (RuleBasedCollator.getTag(isZeroCE)
2256 == RuleBasedCollator.CE_SURROGATE_TAG_) {
2257 int finalCE = collator.m_trie_.getTrailValue(
2261 // this is a real, assigned completely
2262 // ignorable code point
2268 nextChar(); // revert to original offset
2269 // lone surrogate, completely ignorable
2272 nextChar(); // revert to original offset
2275 // lone surrogate at the beggining, completely ignorable
2280 // char was not in the table. prefix not found
2281 ce = collator.m_contractionCE_[entryoffset];
2284 if (!isSpecialPrefixTag(ce)) {
2285 // char was in the contraction table, and the corresponding ce
2286 // is not a prefix ce. We found the prefix, break out of loop,
2287 // this ce will end up being returned.
2291 updateInternalState(m_utilSpecialBackUp_);
2296 * Retrieves the previous contraction ce. To ensure that the backwards and
2297 * forwards iteration matches, we take the current region of most possible
2298 * match and pass it through the forward iteration. This will ensure that
2299 * the obstinate problem of overlapping contractions will not occur.
2300 * @param collator current collator
2301 * @param ce current ce
2302 * @param ch current character
2303 * @return previous contraction ce
2305 private int previousContraction(RuleBasedCollator collator, int ce, char ch)
2307 m_utilStringBuffer_.setLength(0);
2308 // since we might encounter normalized characters (from the thai
2309 // processing) we can't use peekCharacter() here.
2310 char prevch = (char)previousChar();
2311 boolean atStart = false;
2312 // TODO: address the comment above - maybe now we *can* use peekCharacter
2313 //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
2314 while (collator.isUnsafe(ch)) {
2315 m_utilStringBuffer_.insert(0, ch);
2317 if (isBackwardsStart()) {
2321 prevch = (char)previousChar();
2324 // undo the previousChar() if we didn't reach the beginning
2327 // adds the initial base character to the string
2328 m_utilStringBuffer_.insert(0, ch);
2330 // a new collation element iterator is used to simply things, since
2331 // using the current collation element iterator will mean that the
2332 // forward and backwards iteration will share and change the same
2333 // buffers. it is going to be painful.
2334 int originaldecomp = collator.getDecomposition();
2335 // for faster access, since string would have been normalized above
2336 collator.setDecomposition(Collator.NO_DECOMPOSITION);
2337 if (m_utilColEIter_ == null) {
2338 m_utilColEIter_ = new CollationElementIterator(
2339 m_utilStringBuffer_.toString(),
2343 m_utilColEIter_.m_collator_ = collator;
2344 m_utilColEIter_.setText(m_utilStringBuffer_.toString());
2346 ce = m_utilColEIter_.next();
2347 m_CEBufferSize_ = 0;
2348 while (ce != NULLORDER) {
2349 if (m_CEBufferSize_ == m_CEBuffer_.length) {
2351 // increasing cebuffer size
2352 int tempbuffer[] = new int[m_CEBuffer_.length + 50];
2353 System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
2354 m_CEBuffer_.length);
2355 m_CEBuffer_ = tempbuffer;
2357 catch( MissingResourceException e)
2361 catch (Exception e) {
2363 e.printStackTrace();
2368 m_CEBuffer_[m_CEBufferSize_ ++] = ce;
2369 ce = m_utilColEIter_.next();
2371 collator.setDecomposition(originaldecomp);
2372 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2373 return m_CEBuffer_[m_CEBufferOffset_];
2377 * Returns the previous long primary ces
2378 * @param ce long primary ce
2379 * @return previous long primary ces
2381 private int previousLongPrimary(int ce)
2383 m_CEBufferSize_ = 0;
2384 m_CEBuffer_[m_CEBufferSize_ ++] =
2385 ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
2386 m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
2387 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2388 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2389 return m_CEBuffer_[m_CEBufferOffset_];
2393 * Returns the previous expansion ces
2394 * @param collator current collator
2395 * @param ce current ce
2396 * @return previous expansion ce
2398 private int previousExpansion(RuleBasedCollator collator, int ce)
2400 // find the offset to expansion table
2401 int offset = getExpansionOffset(collator, ce);
2402 m_CEBufferSize_ = getExpansionCount(ce);
2403 if (m_CEBufferSize_ != 0) {
2404 // less than 16 elements in expansion
2405 for (int i = 0; i < m_CEBufferSize_; i ++) {
2406 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
2411 // null terminated ces
2412 while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
2413 m_CEBuffer_[m_CEBufferSize_] =
2414 collator.m_expansion_[offset + m_CEBufferSize_];
2418 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2419 return m_CEBuffer_[m_CEBufferOffset_];
2423 * Getting the digit collation elements
2425 * @param ce current collation element
2426 * @param ch current code point
2427 * @return digit collation element
2429 private int previousDigit(RuleBasedCollator collator, int ce, char ch)
2431 // We do a check to see if we want to collate digits as numbers; if so we generate
2432 // a custom collation key. Otherwise we pull out the value stored in the expansion table.
2433 if (m_collator_.m_isNumericCollation_){
2434 int leadingZeroIndex = 0;
2436 boolean nonZeroValReached = false;
2438 // clear and set initial string buffer length
2439 m_utilStringBuffer_.setLength(3);
2441 // We parse the source string until we hit a char that's NOT a digit
2442 // Use this u_charDigitValue. This might be slow because we have to
2443 // handle surrogates...
2445 if (UTF16.isTrailSurrogate(ch)) {
2446 if (!isBackwardsStart()){
2447 char lead = (char)previousChar();
2448 if (UTF16.isLeadSurrogate(lead)) {
2449 char32 = UCharacterProperty.getRawSupplementary(lead,
2457 int digVal = UCharacter.digit(char32);
2460 // Make sure we have enough space.
2461 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
2462 m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
2465 // Skipping over "trailing" zeroes but we still add to digIndx.
2466 if (digVal != 0 || nonZeroValReached) {
2467 if (digVal != 0 && !nonZeroValReached) {
2468 nonZeroValReached = true;
2471 // We parse the digit string into base 100 numbers (this
2472 // fits into a byte).
2473 // We only add to the buffer in twos, thus if we are
2474 // parsing an odd character, that serves as the 'tens'
2475 // digit while the if we are parsing an even one, that is
2476 // the 'ones' digit. We dumped the parsed base 100 value
2477 // (collateVal) into a buffer. We multiply each collateVal
2478 // by 2 (to give us room) and add 5 (to avoid overlapping
2479 // magic CE byte values). The last byte we subtract 1 to
2480 // ensure it is less than all the other bytes.
2481 // Since we're doing in this reverse we want to put the
2482 // first digit encountered into the ones place and the
2483 // second digit encountered into the tens place.
2485 if (digIndx % 2 == 1){
2486 collateVal += digVal * 10;
2488 // This removes leading zeroes.
2489 if (collateVal == 0 && leadingZeroIndex == 0) {
2490 leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2492 else if (leadingZeroIndex != 0) {
2493 leadingZeroIndex = 0;
2496 m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
2497 (char)((collateVal << 1) + 6));
2501 collateVal = digVal;
2506 if (!isBackwardsStart()){
2507 backupInternalState(m_utilSpecialBackUp_);
2508 char32 = previousChar();
2509 if (UTF16.isTrailSurrogate(ch)){
2510 if (!isBackwardsStart()) {
2511 char lead = (char)previousChar();
2512 if (UTF16.isLeadSurrogate(lead)) {
2514 = UCharacterProperty.getRawSupplementary(
2518 updateInternalState(m_utilSpecialBackUp_);
2523 digVal = UCharacter.digit(char32);
2525 updateInternalState(m_utilSpecialBackUp_);
2534 if (nonZeroValReached == false) {
2536 m_utilStringBuffer_.setCharAt(2, (char)6);
2539 if (digIndx % 2 != 0) {
2540 if (collateVal == 0 && leadingZeroIndex == 0) {
2541 // This removes the leading 0 in a odd number sequence of
2542 // numbers e.g. avery001
2543 leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2546 // this is not a leading 0, we add it in
2547 m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
2548 (char)((collateVal << 1) + 6));
2553 int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
2554 : ((digIndx >>> 1) + 2) ;
2555 digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
2556 // Subtract one off of the last byte.
2557 // Really the first byte here, but it's reversed...
2558 m_utilStringBuffer_.setCharAt(2,
2559 (char)(m_utilStringBuffer_.charAt(2) - 1));
2560 // We want to skip over the first two slots in the buffer.
2561 // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2562 // The second slot is for the sign/exponent byte:
2563 // 0x80 + (decimalPos/2) & 7f.
2564 m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2565 m_utilStringBuffer_.setCharAt(1,
2566 (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2568 // Now transfer the collation key to our collIterate struct.
2569 // The total size for our collation key is endIndx bumped up to the
2570 // next largest even value divided by two.
2571 m_CEBufferSize_ = 0;
2572 m_CEBuffer_[m_CEBufferSize_ ++]
2573 = (((m_utilStringBuffer_.charAt(0) << 8)
2575 | m_utilStringBuffer_.charAt(1))
2576 << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2578 | (RuleBasedCollator.BYTE_COMMON_
2579 << RuleBasedCollator.CE_SECONDARY_SHIFT_)
2581 | RuleBasedCollator.BYTE_COMMON_;
2582 int i = endIndex - 1; // Reset the index into the buffer.
2584 int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
2586 primWeight |= m_utilStringBuffer_.charAt(i --);
2588 m_CEBuffer_[m_CEBufferSize_ ++]
2589 = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2590 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
2592 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2593 return m_CEBuffer_[m_CEBufferOffset_];
2596 return collator.m_expansion_[getExpansionOffset(collator, ce)];
2601 * Returns previous hangul ces
2602 * @param collator current collator
2603 * @param ch current character
2604 * @return previous hangul ce
2606 private int previousHangul(RuleBasedCollator collator, char ch)
2608 char L = (char)(ch - HANGUL_SBASE_);
2609 // we do it in this order since some compilers can do % and / in one
2611 char T = (char)(L % HANGUL_TCOUNT_);
2612 L /= HANGUL_TCOUNT_;
2613 char V = (char)(L % HANGUL_VCOUNT_);
2614 L /= HANGUL_VCOUNT_;
2621 m_CEBufferSize_ = 0;
2622 if (!m_collator_.m_isJamoSpecial_) {
2623 m_CEBuffer_[m_CEBufferSize_ ++] =
2624 collator.m_trie_.getLeadValue(L);
2625 m_CEBuffer_[m_CEBufferSize_ ++] =
2626 collator.m_trie_.getLeadValue(V);
2627 if (T != HANGUL_TBASE_) {
2628 m_CEBuffer_[m_CEBufferSize_ ++] =
2629 collator.m_trie_.getLeadValue(T);
2631 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2632 return m_CEBuffer_[m_CEBufferOffset_];
2635 // Since Hanguls pass the FCD check, it is guaranteed that we won't
2636 // be in the normalization buffer if something like this happens
2637 // Move Jamos into normalization buffer
2638 m_buffer_.append(L);
2639 m_buffer_.append(V);
2640 if (T != HANGUL_TBASE_) {
2641 m_buffer_.append(T);
2643 m_bufferOffset_ = m_buffer_.length();
2644 m_FCDStart_ = m_source_.getIndex();
2645 m_FCDLimit_ = m_FCDStart_ + 1;
2651 * Gets implicit codepoint ces
2652 * @param codepoint current codepoint
2653 * @return implicit codepoint ces
2655 private int previousImplicit(int codepoint)
2657 int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2658 m_CEBufferSize_ = 2;
2659 m_CEBufferOffset_ = 1;
2660 m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
2662 m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2663 return m_CEBuffer_[1];
2667 * Gets the previous surrogate ce
2668 * @param ch current character
2669 * @return previous surrogate ce
2671 private int previousSurrogate(char ch)
2673 if (isBackwardsStart()) {
2674 // we are at the start of the string, wrong place to be at
2675 return CE_NOT_FOUND_;
2677 char prevch = (char)previousChar();
2678 // Handles Han and Supplementary characters here.
2679 if (UTF16.isLeadSurrogate(prevch)) {
2680 return previousImplicit(
2681 UCharacterProperty.getRawSupplementary(prevch, ch));
2683 if (prevch != CharacterIterator.DONE) {
2686 return CE_NOT_FOUND_; // treat like unassigned
2690 * <p>Special CE management. Expansions, contractions etc...</p>
2691 * @param collator can be plain UCA
2692 * @param ce current ce
2693 * @param ch current character
2694 * @return previous special ce
2696 private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
2699 // the only ces that loops are thai, special prefix and
2701 switch (RuleBasedCollator.getTag(ce)) {
2702 case CE_NOT_FOUND_TAG_: // this tag always returns
2704 case RuleBasedCollator.CE_SURROGATE_TAG_: // unpaired lead surrogate
2705 return CE_NOT_FOUND_;
2706 case CE_SPEC_PROC_TAG_:
2707 ce = previousSpecialPrefix(collator, ce);
2709 case CE_CONTRACTION_TAG_:
2710 // may loop for first character e.g. "0x0f71" for english
2711 if (isBackwardsStart()) {
2712 // start of string or this is not the end of any contraction
2713 ce = collator.m_contractionCE_[
2714 getContractionOffset(collator, ce)];
2717 return previousContraction(collator, ce, ch); // else
2718 case CE_LONG_PRIMARY_TAG_:
2719 return previousLongPrimary(ce);
2720 case CE_EXPANSION_TAG_: // always returns
2721 return previousExpansion(collator, ce);
2723 ce = previousDigit(collator, ce, ch);
2725 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2726 return previousHangul(collator, ch);
2727 case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2728 return CE_NOT_FOUND_; // broken surrogate sequence, treat like unassigned
2729 case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
2730 return previousSurrogate(ch);
2731 case CE_CJK_IMPLICIT_TAG_:
2732 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2733 return previousImplicit(ch);
2734 case CE_IMPLICIT_TAG_: // everything that is not defined
2735 // UCA is filled with these. Tailorings are NOT_FOUND
2736 return previousImplicit(ch);
2737 case CE_CHARSET_TAG_: // this tag always returns
2738 return CE_NOT_FOUND_;
2739 default: // this tag always returns
2742 if (!RuleBasedCollator.isSpecial(ce)) {
2750 // * Gets a character from the source string at a given offset.
2751 // * Handles both normal and iterative cases.
2752 // * No error checking and does not access the normalization buffer
2753 // * - caller beware!
2754 // * @param offset offset from current position which character is to be
2756 // * @return character at current position + offset
2758 // private char peekCharacter(int offset)
2760 // if (offset != 0) {
2761 // int currentoffset = m_source_.getIndex();
2762 // m_source_.setIndex(currentoffset + offset);
2763 // char result = (char)m_source_.current();
2764 // m_source_.setIndex(currentoffset);
2768 // return (char)m_source_.current();
2773 * Moves back 1 position in the source string. This is slightly less
2774 * complicated than previousChar in that it doesn't normalize while
2775 * moving back. Boundary checks are not performed.
2776 * This method is to be used with caution, with the assumption that
2777 * moving back one position will not exceed the source limits.
2778 * Use only with nextChar() and never call this API twice in a row without
2779 * nextChar() in the middle.
2781 private void goBackOne()
2783 if (m_bufferOffset_ >= 0) {
2787 m_source_.setIndex(m_source_.getIndex() - 1);
2792 * Moves forward 1 position in the source string. This is slightly less
2793 * complicated than nextChar in that it doesn't normalize while
2794 * moving back. Boundary checks are not performed.
2795 * This method is to be used with caution, with the assumption that
2796 * moving back one position will not exceed the source limits.
2797 * Use only with previousChar() and never call this API twice in a row
2798 * without previousChar() in the middle.
2800 private void goForwardOne()
2802 if (m_bufferOffset_ < 0) {
2803 // we're working on the source and not normalizing. fast path.
2804 // note Thai pre-vowel reordering uses buffer too
2805 m_source_.setIndex(m_source_.getIndex() + 1);
2808 // we are in the buffer, buffer offset will never be 0 here