2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 *******************************************************************************
\r
10 package com.ibm.icu.text;
\r
13 * import java.text.StringCharacterIterator;
\r
14 * import java.text.CharacterIterator;
\r
16 import com.ibm.icu.impl.NormalizerImpl;
\r
17 import com.ibm.icu.impl.UCharacterProperty;
\r
18 import com.ibm.icu.impl.StringUCharacterIterator;
\r
19 import com.ibm.icu.impl.CharacterIteratorWrapper;
\r
20 import com.ibm.icu.impl.ICUDebug;
\r
21 import com.ibm.icu.lang.UCharacter;
\r
22 import java.text.CharacterIterator;
\r
23 import java.util.MissingResourceException;
\r
26 * <p><code>CollationElementIterator</code> is an iterator created by
\r
27 * a RuleBasedCollator to walk through a string. The return result of
\r
28 * each iteration is a 32-bit collation element that defines the
\r
29 * ordering priority of the next character or sequence of characters
\r
30 * in the source string.</p>
\r
32 * <p>For illustration, consider the following in Spanish:
\r
35 * "ca" -> the first collation element is collation_element('c') and second
\r
36 * collation element is collation_element('a').
\r
38 * Since "ch" in Spanish sorts as one entity, the below example returns one
\r
39 * collation element for the two characters 'c' and 'h'
\r
41 * "cha" -> the first collation element is collation_element('ch') and second
\r
42 * collation element is collation_element('a').
\r
48 * Since the character 'æ' is a composed character of 'a' and 'e', the
\r
49 * iterator returns two collation elements for the single character 'æ'
\r
51 * "æb" -> the first collation element is collation_element('a'), the
\r
52 * second collation element is collation_element('e'), and the
\r
53 * third collation element is collation_element('b').
\r
58 * <p>For collation ordering comparison, the collation element results
\r
59 * cannot be compared simply by using basic arithmetic operators,
\r
60 * e.g. <, == or >, further processing has to be done. Details
\r
61 * can be found in the ICU
\r
62 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
\r
63 * user guide</a>. An example of using the CollationElementIterator
\r
64 * for collation ordering comparison is the class
\r
65 * <a href="StringSearch.html">com.ibm.icu.text.StringSearch</a>.</p>
\r
67 * <p>To construct a CollationElementIterator object, users
\r
68 * call the method getCollationElementIterator() on a
\r
69 * RuleBasedCollator that defines the desired sorting order.</p>
\r
74 * String testString = "This is a test";
\r
75 * RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
\r
76 * CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
\r
77 * int primaryOrder = iterator.IGNORABLE;
\r
78 * while (primaryOrder != iterator.NULLORDER) {
\r
79 * int order = iterator.next();
\r
80 * if (order != iterator.IGNORABLE &&
\r
81 * order != iterator.NULLORDER) {
\r
82 * // order is valid, not ignorable and we have not passed the end
\r
83 * // of the iteration, we do something
\r
84 * primaryOrder = CollationElementIterator.primaryOrder(order);
\r
85 * System.out.println("Next primary order 0x" +
\r
86 * Integer.toHexString(primaryOrder));
\r
93 * This class is not subclassable
\r
96 * @see RuleBasedCollator
\r
98 * @author Syn Wee Quek
\r
101 public final class CollationElementIterator
\r
105 // public data members --------------------------------------------------
\r
108 * <p>This constant is returned by the iterator in the methods
\r
109 * next() and previous() when the end or the beginning of the
\r
110 * source string has been reached, and there are no more valid
\r
111 * collation elements to return.</p>
\r
113 * <p>See class documentation for an example of use.</p>
\r
116 * @see #previous */
\r
117 public final static int NULLORDER = 0xffffffff;
\r
120 * <p>This constant is returned by the iterator in the methods
\r
121 * next() and previous() when a collation element result is to be
\r
124 * <p>See class documentation for an example of use.</p>
\r
127 * @see #previous */
\r
128 public static final int IGNORABLE = 0;
\r
130 // public methods -------------------------------------------------------
\r
132 // public getters -------------------------------------------------------
\r
135 * <p>Returns the character offset in the source string
\r
136 * corresponding to the next collation element. I.e., getOffset()
\r
137 * returns the position in the source string corresponding to the
\r
138 * collation element that will be returned by the next call to
\r
139 * next(). This value could be any of:
\r
141 * <li> The index of the <b>first</b> character corresponding to
\r
142 * the next collation element. (This means that if
\r
143 * <code>setOffset(offset)</code> sets the index in the middle of
\r
144 * a contraction, <code>getOffset()</code> returns the index of
\r
145 * the first character in the contraction, which may not be equal
\r
146 * to the original offset that was set. Hence calling getOffset()
\r
147 * immediately after setOffset(offset) does not guarantee that the
\r
148 * original offset set will be returned.)
\r
149 * <li> If normalization is on, the index of the <b>immediate</b>
\r
150 * subsequent character, or composite character with the first
\r
151 * character, having a combining class of 0.
\r
152 * <li> The length of the source string, if iteration has reached
\r
156 * @return The character offset in the source string corresponding to the
\r
157 * collation element that will be returned by the next call to
\r
161 public int getOffset()
\r
163 if (m_bufferOffset_ != -1) {
\r
164 if (m_isForwards_) {
\r
165 return m_FCDLimit_;
\r
167 return m_FCDStart_;
\r
169 return m_source_.getIndex();
\r
174 * <p> Returns the maximum length of any expansion sequence that ends with
\r
175 * the specified collation element. If there is no expansion with this
\r
176 * collation element as the last element, returns 1.
\r
178 * @param ce a collation element returned by previous() or next().
\r
179 * @return the maximum length of any expansion sequence ending
\r
180 * with the specified collation element.
\r
183 public int getMaxExpansion(int ce)
\r
186 int limit = m_collator_.m_expansionEndCE_.length;
\r
187 long unsignedce = ce & 0xFFFFFFFFl;
\r
188 while (start < limit - 1) {
\r
189 int mid = start + ((limit - start) >> 1);
\r
190 long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
\r
191 if (unsignedce <= midce) {
\r
199 if (m_collator_.m_expansionEndCE_[start] == ce) {
\r
200 result = m_collator_.m_expansionEndCEMaxSize_[start];
\r
202 else if (limit < m_collator_.m_expansionEndCE_.length &&
\r
203 m_collator_.m_expansionEndCE_[limit] == ce) {
\r
204 result = m_collator_.m_expansionEndCEMaxSize_[limit];
\r
206 else if ((ce & 0xFFFF) == 0x00C0) {
\r
212 // public other methods -------------------------------------------------
\r
215 * <p> Resets the cursor to the beginning of the string. The next
\r
216 * call to next() or previous() will return the first and last
\r
217 * collation element in the string, respectively.</p>
\r
219 * <p>If the RuleBasedCollator used by this iterator has had its
\r
220 * attributes changed, calling reset() will reinitialize the
\r
221 * iterator to use the new attributes.</p>
\r
225 public void reset()
\r
227 m_source_.setToStart();
\r
228 updateInternalState();
\r
232 * <p>Get the next collation element in the source string.</p>
\r
234 * <p>This iterator iterates over a sequence of collation elements
\r
235 * that were built from the string. Because there isn't
\r
236 * necessarily a one-to-one mapping from characters to collation
\r
237 * elements, this doesn't mean the same thing as "return the
\r
238 * collation element [or ordering priority] of the next character
\r
239 * in the string".</p>
\r
241 * <p>This function returns the collation element that the
\r
242 * iterator is currently pointing to, and then updates the
\r
243 * internal pointer to point to the next element. Previous()
\r
244 * updates the pointer first, and then returns the element. This
\r
245 * means that when you change direction while iterating (i.e.,
\r
246 * call next() and then call previous(), or call previous() and
\r
247 * then call next()), you'll get back the same element twice.</p>
\r
249 * @return the next collation element or NULLORDER if the end of the
\r
250 * iteration has been reached.
\r
255 m_isForwards_ = true;
\r
256 if (m_CEBufferSize_ > 0) {
\r
257 if (m_CEBufferOffset_ < m_CEBufferSize_) {
\r
258 // if there are expansions left in the buffer, we return it
\r
259 return m_CEBuffer_[m_CEBufferOffset_ ++];
\r
261 m_CEBufferSize_ = 0;
\r
262 m_CEBufferOffset_ = 0;
\r
265 int ch_int = nextChar();
\r
267 if (ch_int == UCharacterIterator.DONE) {
\r
270 char ch = (char)ch_int;
\r
271 if (m_collator_.m_isHiragana4_) {
\r
272 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
\r
273 * based on whether the previous codepoint was Hiragana or Katakana.
\r
275 m_isCodePointHiragana_ = (m_isCodePointHiragana_ && (ch >= 0x3099 && ch <= 0x309C)) ||
\r
276 ((ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d));
\r
279 int result = NULLORDER;
\r
281 // For latin-1 characters we never need to fall back to the UCA
\r
282 // table because all of the UCA data is replicated in the
\r
283 // latinOneMapping array
\r
284 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
\r
285 if (RuleBasedCollator.isSpecial(result)) {
\r
286 result = nextSpecial(m_collator_, result, ch);
\r
290 result = m_collator_.m_trie_.getLeadValue(ch);
\r
291 //System.out.println(Integer.toHexString(result));
\r
292 if (RuleBasedCollator.isSpecial(result)) {
\r
293 // surrogate leads are handled as special ces
\r
294 result = nextSpecial(m_collator_, result, ch);
\r
296 if (result == CE_NOT_FOUND_ && RuleBasedCollator.UCA_ != null) {
\r
297 // couldn't find a good CE in the tailoring
\r
298 // if we got here, the codepoint MUST be over 0xFF - so we look
\r
299 // directly in the UCA
\r
300 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
\r
301 if (RuleBasedCollator.isSpecial(result)) {
\r
302 // UCA also gives us a special CE
\r
303 result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
\r
307 if(result == CE_NOT_FOUND_) {
\r
308 // maybe there is no UCA, unlikely in Java, but ported for consistency
\r
309 result = nextImplicit(ch);
\r
315 * <p>Get the previous collation element in the source string.</p>
\r
317 * <p>This iterator iterates over a sequence of collation elements
\r
318 * that were built from the string. Because there isn't
\r
319 * necessarily a one-to-one mapping from characters to collation
\r
320 * elements, this doesn't mean the same thing as "return the
\r
321 * collation element [or ordering priority] of the previous
\r
322 * character in the string".</p>
\r
324 * <p>This function updates the iterator's internal pointer to
\r
325 * point to the collation element preceding the one it's currently
\r
326 * pointing to and then returns that element, while next() returns
\r
327 * the current element and then updates the pointer. This means
\r
328 * that when you change direction while iterating (i.e., call
\r
329 * next() and then call previous(), or call previous() and then
\r
330 * call next()), you'll get back the same element twice.</p>
\r
332 * @return the previous collation element, or NULLORDER when the start of
\r
333 * the iteration has been reached.
\r
336 public int previous()
\r
338 if (m_source_.getIndex() <= 0 && m_isForwards_) {
\r
339 // if iterator is new or reset, we can immediately perform backwards
\r
340 // iteration even when the offset is not right.
\r
341 m_source_.setToLimit();
\r
342 updateInternalState();
\r
344 m_isForwards_ = false;
\r
345 int result = NULLORDER;
\r
346 if (m_CEBufferSize_ > 0) {
\r
347 if (m_CEBufferOffset_ > 0) {
\r
348 return m_CEBuffer_[-- m_CEBufferOffset_];
\r
350 m_CEBufferSize_ = 0;
\r
351 m_CEBufferOffset_ = 0;
\r
353 int ch_int = previousChar();
\r
354 if (ch_int == UCharacterIterator.DONE) {
\r
357 char ch = (char)ch_int;
\r
358 if (m_collator_.m_isHiragana4_) {
\r
359 m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
\r
361 if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
\r
362 result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
\r
366 result = m_collator_.m_trie_.getLatin1LinearValue(ch);
\r
369 result = m_collator_.m_trie_.getLeadValue(ch);
\r
371 if (RuleBasedCollator.isSpecial(result)) {
\r
372 result = previousSpecial(m_collator_, result, ch);
\r
374 if (result == CE_NOT_FOUND_) {
\r
375 if (!isBackwardsStart()
\r
376 && m_collator_.isContractionEnd(ch)) {
\r
377 result = CE_CONTRACTION_;
\r
380 if(RuleBasedCollator.UCA_ != null) {
\r
381 result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
\r
385 if (RuleBasedCollator.isSpecial(result)) {
\r
386 if(RuleBasedCollator.UCA_ != null) {
\r
387 result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
\r
392 if(result == CE_NOT_FOUND_) {
\r
393 result = previousImplicit(ch);
\r
399 * Return the primary order of the specified collation element,
\r
400 * i.e. the first 16 bits. This value is unsigned.
\r
401 * @param ce the collation element
\r
402 * @return the element's 16 bits primary order.
\r
405 public final static int primaryOrder(int ce)
\r
407 return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
\r
408 >>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
\r
411 * Return the secondary order of the specified collation element,
\r
412 * i.e. the 16th to 23rd bits, inclusive. This value is unsigned.
\r
413 * @param ce the collation element
\r
414 * @return the element's 8 bits secondary order
\r
417 public final static int secondaryOrder(int ce)
\r
419 return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
\r
420 >> RuleBasedCollator.CE_SECONDARY_SHIFT_;
\r
424 * Return the tertiary order of the specified collation element, i.e. the last
\r
425 * 8 bits. This value is unsigned.
\r
426 * @param ce the collation element
\r
427 * @return the element's 8 bits tertiary order
\r
430 public final static int tertiaryOrder(int ce)
\r
432 return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
\r
436 * <p> Sets the iterator to point to the collation element
\r
437 * corresponding to the character at the specified offset. The
\r
438 * value returned by the next call to next() will be the collation
\r
439 * element corresponding to the characters at offset.</p>
\r
441 * <p>If offset is in the middle of a contracting character
\r
442 * sequence, the iterator is adjusted to the start of the
\r
443 * contracting sequence. This means that getOffset() is not
\r
444 * guaranteed to return the same value set by this method.</p>
\r
446 * <p>If the decomposition mode is on, and offset is in the middle
\r
447 * of a decomposable range of source text, the iterator may not
\r
448 * return a correct result for the next forwards or backwards
\r
449 * iteration. The user must ensure that the offset is not in the
\r
450 * middle of a decomposable range.</p>
\r
452 * @param offset the character offset into the original source string to
\r
453 * set. Note that this is not an offset into the corresponding
\r
454 * sequence of collation elements.
\r
457 public void setOffset(int offset)
\r
459 m_source_.setIndex(offset);
\r
460 int ch_int = m_source_.current();
\r
461 char ch = (char)ch_int;
\r
462 if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
\r
463 // if it is unsafe we need to check if it is part of a contraction
\r
464 // or a surrogate character
\r
465 if (UTF16.isTrailSurrogate(ch)) {
\r
466 // if it is a surrogate pair we move up one character
\r
467 char prevch = (char)m_source_.previous();
\r
468 if (!UTF16.isLeadSurrogate(prevch)) {
\r
469 m_source_.setIndex(offset); // go back to the same index
\r
473 // could be part of a contraction
\r
474 // backup to a safe point and iterate till we pass offset
\r
475 while (m_source_.getIndex() > 0) {
\r
476 if (!m_collator_.isUnsafe(ch)) {
\r
479 ch = (char)m_source_.previous();
\r
481 updateInternalState();
\r
482 int prevoffset = 0;
\r
483 while (m_source_.getIndex() <= offset) {
\r
484 prevoffset = m_source_.getIndex();
\r
487 m_source_.setIndex(prevoffset);
\r
490 updateInternalState();
\r
491 // direction code to prevent next and previous from returning a
\r
492 // character if we are already at the ends
\r
493 offset = m_source_.getIndex();
\r
494 if (offset == 0/* m_source_.getBeginIndex() */) {
\r
495 // preventing previous() from returning characters from the end of
\r
496 // the string again if we are at the beginning
\r
497 m_isForwards_ = false;
\r
499 else if (offset == m_source_.getLength()) {
\r
500 // preventing next() from returning characters from the start of
\r
501 // the string again if we are at the end
\r
502 m_isForwards_ = true;
\r
507 * <p>Set a new source string for iteration, and reset the offset
\r
508 * to the beginning of the text.</p>
\r
510 * @param source the new source string for iteration.
\r
513 public void setText(String source)
\r
515 m_srcUtilIter_.setText(source);
\r
516 m_source_ = m_srcUtilIter_;
\r
517 updateInternalState();
\r
521 * <p>Set a new source string iterator for iteration, and reset the
\r
522 * offset to the beginning of the text.
\r
524 * <p>The source iterator's integrity will be preserved since a new copy
\r
525 * will be created for use.</p>
\r
526 * @param source the new source string iterator for iteration.
\r
529 public void setText(UCharacterIterator source)
\r
531 m_srcUtilIter_.setText(source.getText());
\r
532 m_source_ = m_srcUtilIter_;
\r
533 updateInternalState();
\r
537 * <p>Set a new source string iterator for iteration, and reset the
\r
538 * offset to the beginning of the text.
\r
540 * @param source the new source string iterator for iteration.
\r
543 public void setText(CharacterIterator source)
\r
545 m_source_ = new CharacterIteratorWrapper(source);
\r
546 m_source_.setToStart();
\r
547 updateInternalState();
\r
550 // public miscellaneous methods -----------------------------------------
\r
553 * Tests whether the argument object is equal to this CollationElementIterator.
\r
554 * Iterators are equal if the objects use the same RuleBasedCollator,
\r
555 * the same source text and have the same current position in iteration.
\r
556 * @param that object to test if it is equal to this
\r
557 * CollationElementIterator
\r
560 public boolean equals(Object that)
\r
562 if (that == this) {
\r
565 if (that instanceof CollationElementIterator) {
\r
566 CollationElementIterator thatceiter
\r
567 = (CollationElementIterator)that;
\r
568 if (!m_collator_.equals(thatceiter.m_collator_)) {
\r
571 // checks the text
\r
572 return m_source_.getIndex() == thatceiter.m_source_.getIndex()
\r
573 && m_source_.getText().equals(
\r
574 thatceiter.m_source_.getText());
\r
579 // package private constructors ------------------------------------------
\r
582 * <p>CollationElementIterator constructor. This takes a source
\r
583 * string and a RuleBasedCollator. The iterator will walk through
\r
584 * the source string based on the rules defined by the
\r
585 * collator. If the source string is empty, NULLORDER will be
\r
586 * returned on the first call to next().</p>
\r
588 * @param source the source string.
\r
589 * @param collator the RuleBasedCollator
\r
/**
 * Creates an iterator over a source string using the given collator.
 *
 * @param source the source string.
 * @param collator the RuleBasedCollator
 */
CollationElementIterator(String source, RuleBasedCollator collator)
{
    // Collator and text source.
    m_collator_ = collator;
    m_srcUtilIter_ = new StringUCharacterIterator(source);
    m_source_ = m_srcUtilIter_;

    // Scratch storage used during iteration.
    m_buffer_ = new StringBuffer();
    m_utilStringBuffer_ = new StringBuffer();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();

    // Must run last: it touches m_buffer_ and m_source_.
    updateInternalState();
}
\r
605 * <p>CollationElementIterator constructor. This takes a source
\r
606 * character iterator and a RuleBasedCollator. The iterator will
\r
607 * walk through the source string based on the rules defined by
\r
608 * the collator. If the source string is empty, NULLORDER will be
\r
609 * returned on the first call to next().</p>
\r
611 * @param source the source string iterator.
\r
612 * @param collator the RuleBasedCollator
\r
/**
 * Creates an iterator over the text of a java.text.CharacterIterator using
 * the given collator. The character iterator is wrapped, not copied.
 *
 * @param source the source string iterator.
 * @param collator the RuleBasedCollator
 */
CollationElementIterator(CharacterIterator source,
                         RuleBasedCollator collator)
{
    // Collator and text source. The utility string iterator starts empty
    // and is only filled by the String / UCharacterIterator setText paths.
    m_collator_ = collator;
    m_srcUtilIter_ = new StringUCharacterIterator();
    m_source_ = new CharacterIteratorWrapper(source);

    // Scratch storage used during iteration.
    m_buffer_ = new StringBuffer();
    m_utilStringBuffer_ = new StringBuffer();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();

    // Must run last: it touches m_buffer_ and m_source_.
    updateInternalState();
}
\r
629 * <p>CollationElementIterator constructor. This takes a source
\r
630 * character iterator and a RuleBasedCollator. The iterator will
\r
631 * walk through the source string based on the rules defined by
\r
632 * the collator. If the source string is empty, NULLORDER will be
\r
633 * returned on the first call to next().</p>
\r
635 * @param source the source string iterator.
\r
636 * @param collator the RuleBasedCollator
\r
/**
 * Creates an iterator over the text of a UCharacterIterator using the
 * given collator. Only the text is read from the argument; the argument
 * iterator itself is not retained.
 *
 * @param source the source string iterator.
 * @param collator the RuleBasedCollator
 */
CollationElementIterator(UCharacterIterator source,
                         RuleBasedCollator collator)
{
    // Collator and text source: copy the text into the owned iterator.
    m_collator_ = collator;
    m_srcUtilIter_ = new StringUCharacterIterator();
    m_srcUtilIter_.setText(source.getText());
    m_source_ = m_srcUtilIter_;

    // Scratch storage used during iteration.
    m_buffer_ = new StringBuffer();
    m_utilStringBuffer_ = new StringBuffer();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();

    // Must run last: it touches m_buffer_ and m_source_.
    updateInternalState();
}
\r
653 // package private data members -----------------------------------------
\r
656 * true if current codepoint was Hiragana
\r
658 boolean m_isCodePointHiragana_;
\r
660 * Position in the original string that starts with a non-FCD sequence
\r
664 * This is the CE from CEs buffer that should be returned.
\r
665 * Initial value is 0.
\r
666 * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
\r
667 * backwards will end with m_CEBufferOffset_ == 0.
\r
668 * The next/previous after we reach the end/beginning of the m_CEBuffer_
\r
669 * will cause this value to be reset to 0.
\r
671 int m_CEBufferOffset_;
\r
674 * This is the position to which we have stored processed CEs.
\r
675 * Initial value is 0.
\r
676 * The next/previous after we reach the end/beginning of the m_CEBuffer_
\r
677 * will cause this value to be reset to 0.
\r
679 int m_CEBufferSize_;
\r
680 static final int CE_NOT_FOUND_ = 0xF0000000;
\r
681 static final int CE_EXPANSION_TAG_ = 1;
\r
682 static final int CE_CONTRACTION_TAG_ = 2;
\r
684 * Collate Digits As Numbers (CODAN) implementation
\r
686 static final int CE_DIGIT_TAG_ = 13;
\r
688 // package private methods ----------------------------------------------
\r
691 * Sets the collator used.
\r
692 * Internal use, all data members will be reset to the default values
\r
693 * @param collator to set
\r
695 void setCollator(RuleBasedCollator collator)
\r
697 m_collator_ = collator;
\r
698 updateInternalState();
\r
702 * <p>Sets the iterator to point to the collation element corresponding to
\r
703 * the specified character (the parameter is a CHARACTER offset in the
\r
704 * original string, not an offset into its corresponding sequence of
\r
705 * collation elements). The value returned by the next call to next()
\r
706 * will be the collation element corresponding to the specified position
\r
707 * in the text. Unlike the public method setOffset(int), this method does
\r
708 * not try to readjust the offset to the start of a contracting sequence.
\r
709 * getOffset() is guaranteed to return the same value as was passed to a
\r
710 * preceding call to setOffset().</p>
\r
711 * @param offset new character offset into the original text to set.
\r
713 void setExactOffset(int offset)
\r
715 m_source_.setIndex(offset);
\r
716 updateInternalState();
\r
720 * Checks if iterator is in the buffer zone
\r
721 * @return true if iterator is in buffer zone, false otherwise
\r
723 boolean isInBuffer()
\r
725 return m_bufferOffset_ > 0;
\r
730 * <p>Sets the iterator to point to the collation element corresponding to
\r
731 * the specified character (the parameter is a CHARACTER offset in the
\r
732 * original string, not an offset into its corresponding sequence of
\r
733 * collation elements). The value returned by the next call to next()
\r
734 * will be the collation element corresponding to the specified position
\r
735 * in the text. Unlike the public method setOffset(int), this method does
\r
736 * not try to readjust the offset to the start of a contracting sequence.
\r
737 * getOffset() is guaranteed to return the same value as was passed to a
\r
738 * preceding call to setOffset().</p>
\r
740 * @param source the new source string iterator for iteration.
\r
741 * @param offset to the source
\r
743 void setText(UCharacterIterator source, int offset)
\r
745 m_srcUtilIter_.setText(source.getText());
\r
746 m_source_ = m_srcUtilIter_;
\r
747 m_source_.setIndex(offset);
\r
748 updateInternalState();
\r
751 // private inner class --------------------------------------------------
\r
754 * Backup data class
\r
756 private static final class Backup
\r
758 // protected data members -------------------------------------------
\r
761 * Backup non FCD sequence limit
\r
763 protected int m_FCDLimit_;
\r
765 * Backup non FCD sequence start
\r
767 protected int m_FCDStart_;
\r
769 * Backup if previous Codepoint is Hiragana quaternary
\r
771 protected boolean m_isCodePointHiragana_;
\r
773 * Backup buffer position
\r
775 protected int m_bufferOffset_;
\r
777 * Backup source iterator offset
\r
779 protected int m_offset_;
\r
781 * Backup buffer contents
\r
783 protected StringBuffer m_buffer_;
\r
785 // protected constructor --------------------------------------------
\r
788 * Empty constructor
\r
792 m_buffer_ = new StringBuffer();
\r
795 // end inner class ------------------------------------------------------
\r
798 * Direction of travel
\r
800 private boolean m_isForwards_;
\r
802 * Source string iterator
\r
804 private UCharacterIterator m_source_;
\r
806 * This is position to the m_buffer_, -1 if iterator is not in m_buffer_
\r
808 private int m_bufferOffset_;
\r
810 * Buffer for temporary storage of normalized characters, discontiguous
\r
811 * characters and Thai characters
\r
813 private StringBuffer m_buffer_;
\r
815 * Position in the original string to continue forward FCD check from.
\r
817 private int m_FCDLimit_;
\r
819 * The collator this iterator is based on
\r
821 private RuleBasedCollator m_collator_;
\r
823 * true if Hiragana quaternary is on
\r
825 //private boolean m_isHiragana4_;
\r
829 private int m_CEBuffer_[];
\r
831 * In reality we should not have to deal with expansion sequences longer
\r
832 * than 16. However, this value can be changed if a bigger buffer is needed.
\r
833 * Note: if the size is changed to too small a number, BIG trouble.
\r
834 * Reasonable small value is around 10, if there's no Arabic or other
\r
835 * funky collations that have long expansion sequence. This is the longest
\r
836 * expansion sequence this can handle without bombing out.
\r
838 private static final int CE_BUFFER_INIT_SIZE_ = 512;
\r
840 * Backup storage for special processing inner cases
\r
842 private Backup m_utilSpecialBackUp_;
\r
844 * Backup storage in special processing entry state
\r
846 private Backup m_utilSpecialEntryBackUp_;
\r
848 * Backup storage in special processing discontiguous state
\r
850 private Backup m_utilSpecialDiscontiguousBackUp_;
\r
854 private StringUCharacterIterator m_srcUtilIter_;
\r
855 private StringBuffer m_utilStringBuffer_;
\r
856 private StringBuffer m_utilSkippedBuffer_;
\r
857 private CollationElementIterator m_utilColEIter_;
\r
859 * One character before the first non-zero combining class character
\r
861 private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
\r
863 * One character before the first character with leading non-zero combining
\r
866 private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
\r
868 * Mask for the last byte
\r
870 private static final int LAST_BYTE_MASK_ = 0xFF;
\r
872 * Shift value for the second last byte
\r
874 private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
\r
876 // special ce values and tags -------------------------------------------
\r
878 // private static final int CE_EXPANSION_ = 0xF1000000;
\r
879 private static final int CE_CONTRACTION_ = 0xF2000000;
\r
881 * Indicates the last ce has been consumed. Compare with NULLORDER.
\r
882 * NULLORDER is returned if error occurs.
\r
884 /* private static final int CE_NO_MORE_CES_ = 0x00010101;
\r
885 private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
\r
886 private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
\r
887 private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
\r
889 private static final int CE_NOT_FOUND_TAG_ = 0;
\r
891 * Charset processing, not yet implemented
\r
893 private static final int CE_CHARSET_TAG_ = 4;
\r
897 private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
\r
901 private static final int CE_LEAD_SURROGATE_TAG_ = 7;
\r
905 private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
\r
907 * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
\r
909 private static final int CE_CJK_IMPLICIT_TAG_ = 9;
\r
910 private static final int CE_IMPLICIT_TAG_ = 10;
\r
911 static final int CE_SPEC_PROC_TAG_ = 11;
\r
913 * This is a 3 byte primary with starting secondaries and tertiaries.
\r
914 * It fits in a single 32 bit CE and is used instead of expansion to save
\r
915 * space without affecting the performance (hopefully).
\r
917 private static final int CE_LONG_PRIMARY_TAG_ = 12;
\r
919 // private static final int CE_CE_TAGS_COUNT = 14;
\r
920 private static final int CE_BYTE_COMMON_ = 0x05;
\r
922 // end special ce values and tags ---------------------------------------
\r
924 private static final int HANGUL_SBASE_ = 0xAC00;
\r
925 private static final int HANGUL_LBASE_ = 0x1100;
\r
926 private static final int HANGUL_VBASE_ = 0x1161;
\r
927 private static final int HANGUL_TBASE_ = 0x11A7;
\r
928 private static final int HANGUL_VCOUNT_ = 21;
\r
929 private static final int HANGUL_TCOUNT_ = 28;
\r
931 // CJK stuff ------------------------------------------------------------
\r
933 /* private static final int CJK_BASE_ = 0x4E00;
\r
934 private static final int CJK_LIMIT_ = 0x9FFF+1;
\r
935 private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
\r
936 private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
\r
937 private static final int CJK_A_BASE_ = 0x3400;
\r
938 private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
\r
939 private static final int CJK_B_BASE_ = 0x20000;
\r
940 private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
\r
941 private static final int NON_CJK_OFFSET_ = 0x110000;
\r
943 private static final boolean DEBUG = ICUDebug.enabled("collator");
\r
945 // private methods ------------------------------------------------------
\r
948 * Reset the iterator internally
\r
950 private void updateInternalState()
\r
952 m_isCodePointHiragana_ = false;
\r
953 m_buffer_.setLength(0);
\r
954 m_bufferOffset_ = -1;
\r
955 m_CEBufferOffset_ = 0;
\r
956 m_CEBufferSize_ = 0;
\r
958 m_FCDStart_ = m_source_.getLength();
\r
959 //m_isHiragana4_ = m_collator_.m_isHiragana4_;
\r
960 m_isForwards_ = true;
\r
964 * Backup the current internal state
\r
965 * @param backup object to store the data
\r
// Copies the current source index, FCD bounds, Hiragana flag and buffer state
// into the given Backup object so they can be restored later.
967 private void backupInternalState(Backup backup)
\r
969 backup.m_offset_ = m_source_.getIndex();
\r
970 backup.m_FCDLimit_ = m_FCDLimit_;
\r
971 backup.m_FCDStart_ = m_FCDStart_;
\r
972 backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
\r
973 backup.m_bufferOffset_ = m_bufferOffset_;
\r
// The backup buffer is always emptied; its contents are only copied when the
// iterator is currently reading from the normalization buffer.
974 backup.m_buffer_.setLength(0);
\r
975 if (m_bufferOffset_ >= 0) {
\r
976 // jdk 1.3.1 does not have append(StringBuffer) yet
\r
977 if(ICUDebug.isJDK14OrHigher){
\r
978 backup.m_buffer_.append(m_buffer_);
\r
// NOTE(review): embedded line 979 is absent; presumably the else branch of the
// JDK check starts here - confirm against the original file.
980 backup.m_buffer_.append(m_buffer_.toString());
\r
986 * Update the iterator internally with backed-up state
\r
987 * @param backup object that stored the data
\r
// Restores the iterator state previously saved by backupInternalState(Backup):
// source index, Hiragana flag, buffer offset, FCD bounds and buffer contents.
989 private void updateInternalState(Backup backup)
\r
991 m_source_.setIndex(backup.m_offset_);
\r
992 m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
\r
993 m_bufferOffset_ = backup.m_bufferOffset_;
\r
994 m_FCDLimit_ = backup.m_FCDLimit_;
\r
995 m_FCDStart_ = backup.m_FCDStart_;
\r
// The live buffer is cleared first and refilled only if the restored state was
// reading from the normalization buffer (non-negative offset).
996 m_buffer_.setLength(0);
\r
997 if (m_bufferOffset_ >= 0) {
\r
998 // jdk 1.3.1 does not have append(StringBuffer) yet
\r
999 m_buffer_.append(backup.m_buffer_.toString());
\r
1004 * A fast combining class retrieval system.
\r
1005 * @param ch UTF16 character
\r
1006 * @return combining class of ch
\r
// Looks up the combining class through NormalizerImpl only when it may be
// non-zero: ch is at or above LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ and the
// collator reports it as unsafe, or ch is above 0xFFFF. Note that && binds
// tighter than ||, so the supplementary test stands on its own.
// NOTE(review): the fall-through return is not visible in this listing.
1008 private int getCombiningClass(int ch)
\r
1010 if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
\r
1011 m_collator_.isUnsafe((char)ch) || ch > 0xFFFF) {
\r
1012 return NormalizerImpl.getCombiningClass(ch);
\r
1018 * <p>Incremental normalization, this is an essential optimization.
\r
1019 * Assuming FCD checks have been done, normalize the non-FCD characters into
\r
1021 * Source offsets points to the current processing character.
\r
// Copies the source characters between m_FCDStart_ and m_FCDLimit_ into
// m_buffer_, replaces them with their decomposition and points the buffer
// offset at the start of the buffer.
1024 private void normalize()
\r
1026 int size = m_FCDLimit_ - m_FCDStart_;
\r
1027 m_buffer_.setLength(0);
\r
1028 m_source_.setIndex(m_FCDStart_);
\r
// Read 'size' code units from the source, advancing the source index.
1029 for (int i = 0; i < size; i ++) {
\r
1030 m_buffer_.append((char)m_source_.next());
\r
// Second argument false is passed to Normalizer.decompose
// (presumably canonical rather than compatibility decomposition - confirm).
1032 String decomp = Normalizer.decompose(m_buffer_.toString(), false);
\r
1033 m_buffer_.setLength(0);
\r
1034 m_buffer_.append(decomp);
\r
1035 m_bufferOffset_ = 0;
\r
1039 * <p>Incremental FCD check and normalization. Gets the next base character
\r
1040 * position and determines if the in-between characters need normalization.
\r
1042 * <p>When entering, the state is known to be this:
\r
1044 * <li>We are working on source string, not the buffer.
\r
1045 * <li>The leading combining class from the current character is 0 or the
\r
1046 * trailing combining class of the previous char was zero.
\r
1048 * Incoming source offsets points to the current processing character.
\r
1049 * Return source offsets points to the current processing character.
\r
1051 * @param ch current character
\r
1052 * @param offset current character offset
\r
1053 * @return true if FCDCheck passes, false otherwise
\r
// Forward FCD check starting at 'offset'. Records m_FCDStart_, scans forward
// comparing each character's leading combining class with the previous
// trailing one, records m_FCDLimit_ where the scan stopped and finally puts
// the source index back at m_FCDStart_.
// NOTE(review): several short lines (loop header, breaks, the assignment of
// 'result' and the return) are absent from this listing.
1055 private boolean FCDCheck(char ch, int offset)
\r
1057 boolean result = true;
\r
1059 // Get the trailing combining class of the current character.
\r
1060 // If it's zero, we are OK.
\r
1061 m_FCDStart_ = offset;
\r
1062 m_source_.setIndex(offset);
\r
// FCD data is looked up by code unit; for a lead surrogate with non-zero data
// the following trail surrogate is needed to get the pair's value.
1064 char fcd = NormalizerImpl.getFCD16(ch);
\r
1065 if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
\r
1067 ch = (char)m_source_.current();
\r
1068 // UCharacterIterator.DONE has 0 fcd
\r
1069 if (UTF16.isTrailSurrogate(ch)) {
\r
1070 fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
\r
// The low byte of the FCD value is used as the trailing combining class.
1076 int prevTrailCC = fcd & LAST_BYTE_MASK_;
\r
1078 if (prevTrailCC != 0) {
\r
1079 // The current char has a non-zero trailing CC. Scan forward until
\r
1080 // we find a char with a leading cc of zero.
\r
1083 int ch_int = m_source_.current();
\r
1084 if (ch_int == UCharacterIterator.DONE) {
\r
1087 ch = (char)ch_int;
\r
1089 fcd = NormalizerImpl.getFCD16(ch);
\r
1090 if (fcd != 0 && UTF16.isLeadSurrogate(ch)) {
\r
1092 ch = (char)m_source_.current();
\r
1093 if (UTF16.isTrailSurrogate(ch)) {
\r
1094 fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch);
\r
// The bits above SECOND_LAST_BYTE_SHIFT_ are used as the leading combining class.
1099 int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
\r
1100 if (leadCC == 0) {
\r
1101 // this is a base character, we stop the FCD checks
\r
// A leading class lower than the previous trailing class is the out-of-order
// case this check looks for.
1105 if (leadCC < prevTrailCC) {
\r
1109 prevTrailCC = fcd & LAST_BYTE_MASK_;
\r
// Remember where the scan ended and rewind the source to the segment start.
1112 m_FCDLimit_ = m_source_.getIndex();
\r
1113 m_source_.setIndex(m_FCDStart_);
\r
1119 * <p>Method tries to fetch the next character that is in fcd form.</p>
\r
1120 * <p>Normalization is done if required.</p>
\r
1121 * <p>Offsets are returned at the next character.</p>
\r
1122 * @return next fcd character
\r
// Returns the next character, reading either from the source text or from the
// normalization buffer, and runs the FCD check (normalizing into the buffer
// when the check fails) if no fast path applies.
// NOTE(review): some short lines (declaration of 'result', source advances,
// returns) are absent from this listing.
1124 private int nextChar()
\r
1128 // loop handles the next character whether it is in the buffer or not.
\r
1129 if (m_bufferOffset_ < 0) {
\r
1130 // we're working on the source and not normalizing. fast path.
\r
1131 // note Thai pre-vowel reordering uses buffer too
\r
1132 result = m_source_.current();
\r
1135 // we are in the buffer, buffer offset will never be 0 here
\r
1136 if (m_bufferOffset_ >= m_buffer_.length()) {
\r
1137 // Null marked end of buffer, revert to the source string and
\r
1138 // loop back to top to try again to get a character.
\r
// Buffer exhausted: continue in the source at m_FCDLimit_ and retry.
1139 m_source_.setIndex(m_FCDLimit_);
\r
1140 m_bufferOffset_ = -1;
\r
1141 m_buffer_.setLength(0);
\r
1142 return nextChar();
\r
1144 return m_buffer_.charAt(m_bufferOffset_ ++);
\r
1146 int startoffset = m_source_.getIndex();
\r
// FCD checks are skipped for low characters, when decomposition is off, when
// reading from the buffer, or when this position lies before m_FCDLimit_.
1147 if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
\r
1148 // Fast fcd safe path. trail combining class == 0.
\r
1149 || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
\r
1150 || m_bufferOffset_ >= 0 || m_FCDLimit_ > startoffset) {
\r
1151 // skip the fcd checks
\r
1156 if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
\r
1157 // We need to peek at the next character in order to tell if we are
\r
1160 int next = m_source_.current();
\r
1161 if (next == UCharacterIterator.DONE
\r
1162 || next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
\r
1163 return result; // end of source string and if next character
\r
1164 // starts with a base character is always fcd.
\r
1168 // Need a more complete FCD check and possible normalization.
\r
1169 if (!FCDCheck((char)result, startoffset)) {
\r
// After a failed check the result comes from the buffer; offset 1 marks its
// first character as consumed.
1171 result = m_buffer_.charAt(0);
\r
1172 m_bufferOffset_ = 1;
\r
1178 * <p>Incremental normalization, this is an essential optimization.
\r
1179 * Assuming FCD checks have been done, normalize the non-FCD characters into
\r
1181 * Source offsets points to the current processing character.</p>
\r
// Backward counterpart of normalize(): leaves the buffer offset at the end of
// m_buffer_ so that backward reads start from its last character.
// NOTE(review): embedded lines 1184-1185 are absent from this listing; a
// statement preceding the assignment below may be missing.
1183 private void normalizeBackwards()
\r
1186 m_bufferOffset_ = m_buffer_.length();
\r
1190 * <p>Incremental backwards FCD check and normalization. Gets the previous
\r
1191 * base character position and determines if the in-between characters
\r
1192 * need normalization.
\r
1194 * <p>When entering, the state is known to be this:
\r
1196 * <li>We are working on source string, not the buffer.
\r
1197 * <li>The trailing combining class from the current character is 0 or the
\r
1198 * leading combining class of the next char was zero.
\r
1200 * Input source offsets points to the previous character.
\r
1201 * Return source offsets points to the current processing character.
\r
1203 * @param ch current character
\r
1204 * @param offset current character offset
\r
1205 * @return true if FCDCheck passes, false otherwise
\r
// Backward FCD check starting at 'offset'. Records m_FCDLimit_ just past the
// current character, scans backwards while the leading combining class is
// non-zero, records m_FCDStart_ and finally moves the source index to
// m_FCDLimit_.
// NOTE(review): several short lines (declaration of 'fcd', else keywords,
// breaks, the assignment of 'result' and the return) are absent here.
1207 private boolean FCDCheckBackwards(char ch, int offset)
\r
1209 boolean result = true;
\r
1211 m_FCDLimit_ = offset + 1;
\r
1212 m_source_.setIndex(offset);
\r
1213 if (!UTF16.isSurrogate(ch)) {
\r
1214 fcd = NormalizerImpl.getFCD16(ch);
\r
1216 else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) {
\r
1217 // note trail surrogate characters gets 0 fcd
\r
// Step back to the preceding code unit to see whether it forms a pair.
1218 char trailch = ch;
\r
1219 ch = (char)m_source_.previous();
\r
1220 if (UTF16.isLeadSurrogate(ch)) {
\r
1221 fcd = NormalizerImpl.getFCD16(ch);
\r
1223 fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd,
\r
1228 fcd = 0; // unpaired surrogate
\r
1232 int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
\r
1233 // The current char has a non-zero leading combining class.
\r
1234 // Scan backward until we find a char with a trailing cc of zero.
\r
1236 while (leadCC != 0) {
\r
1237 offset = m_source_.getIndex();
\r
1238 if (offset == 0) {
\r
1241 ch = (char)m_source_.previous();
\r
1242 if (!UTF16.isSurrogate(ch)) {
\r
1243 fcd = NormalizerImpl.getFCD16(ch);
\r
1245 else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) {
\r
1247 ch = (char)m_source_.previous();
\r
1248 if (UTF16.isLeadSurrogate(ch)) {
\r
1249 fcd = NormalizerImpl.getFCD16(ch);
\r
1252 fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, trail);
\r
1256 fcd = 0; // unpaired surrogate
\r
// Compare this character's trailing class with the leading class of the
// character that follows it in the text.
1258 int prevTrailCC = fcd & LAST_BYTE_MASK_;
\r
1259 if (leadCC < prevTrailCC) {
\r
1262 leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_;
\r
1265 // storing character with 0 lead fcd or the 1st accent with a base
\r
1266 // character before it
\r
// NOTE(review): the condition choosing between the two m_FCDStart_
// assignments below is not visible in this listing.
1268 m_FCDStart_ = offset;
\r
1271 m_FCDStart_ = m_source_.getIndex();
\r
1273 m_source_.setIndex(m_FCDLimit_);
\r
1278 * <p>Method tries to fetch the previous character that is in fcd form.</p>
\r
1279 * <p>Normalization is done if required.</p>
\r
1280 * <p>Offsets are returned at the current character.</p>
\r
1281 * @return previous fcd character
\r
// Returns the previous character, reading either from the normalization
// buffer or from the source text, and runs the backward FCD check
// (normalizing into the buffer when it fails) if no fast path applies.
// NOTE(review): some short lines (returns, closing of branches) are absent
// from this listing.
1283 private int previousChar()
\r
1285 if (m_bufferOffset_ >= 0) {
\r
1286 m_bufferOffset_ --;
\r
1287 if (m_bufferOffset_ >= 0) {
\r
1288 return m_buffer_.charAt(m_bufferOffset_);
\r
1291 // At the start of buffer, route back to string.
\r
1292 m_buffer_.setLength(0);
\r
// If the buffered segment began at source index 0 there is nothing before it.
1293 if (m_FCDStart_ == 0) {
\r
1295 m_source_.setIndex(0);
\r
1296 return UCharacterIterator.DONE;
\r
// Otherwise continue in the source just before the buffered segment.
1299 m_FCDLimit_ = m_FCDStart_;
\r
1300 m_source_.setIndex(m_FCDStart_);
\r
1301 return previousChar();
\r
1305 int result = m_source_.previous();
\r
1306 int startoffset = m_source_.getIndex();
\r
// FCD checks are skipped for low characters, when decomposition is off, when
// this position is at or after m_FCDStart_, or at the start of the source.
1307 if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
\r
1308 || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
\r
1309 || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
\r
// Peek one more character back.
1312 int ch = m_source_.previous();
\r
1313 if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
\r
1314 // if previous character is FCD
\r
1318 // Need a more complete FCD check and possible normalization.
\r
1319 if (!FCDCheckBackwards((char)result, startoffset)) {
\r
// After normalizing, the result is the last character of the buffer.
1320 normalizeBackwards();
\r
1321 m_bufferOffset_ --;
\r
1322 result = m_buffer_.charAt(m_bufferOffset_);
\r
1325 // fcd checks alway reset m_source_ to the limit of the FCD
\r
1326 m_source_.setIndex(startoffset);
\r
1332 * Determines if it is at the start of source iteration
\r
1333 * @return true if iterator at the start, false otherwise
\r
// True when backward iteration has nothing left to read: either the iterator
// is reading the source and sits at index 0, or it is at offset 0 of the
// buffer and the buffered segment starts at (or before) source index 0.
1335 private final boolean isBackwardsStart()
\r
1337 return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
\r
1338 || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
\r
1342 * Checks if iterator is at the end of its source string.
\r
1343 * @return true if it is at the end, false otherwise
\r
// End-of-input test for forward iteration. When reading from the buffer the
// end is reached only if the buffer is exhausted and m_FCDLimit_ equals the
// source length; otherwise the source index is compared with the length.
// NOTE(review): the statement inside the inner 'if' (embedded line 1349) is
// absent from this listing.
1345 private final boolean isEnd()
\r
1347 if (m_bufferOffset_ >= 0) {
\r
1348 if (m_bufferOffset_ != m_buffer_.length()) {
\r
1352 // at end of buffer. check if fcd is at the end
\r
1353 return m_FCDLimit_ == m_source_.getLength();
\r
1356 return m_source_.getLength() == m_source_.getIndex();
\r
1360 * <p>Special CE management for surrogates</p>
\r
1361 * <p>Lead surrogate is encountered. CE to be retrieved by using the
\r
1362 * following code unit. If next character is a trail surrogate, both
\r
1363 * characters will be combined to retrieve the CE, otherwise completely
\r
1364 * ignorable (UCA specification) is returned.</p>
\r
1365 * @param collator collator to use
\r
1366 * @param ce current CE
\r
1367 * @param trail character
\r
1368 * @return next CE for the surrogate characters
\r
// Resolves the CE for a lead surrogate using the following code unit. The
// iterator state is rolled back to m_utilSpecialBackUp_ both when that unit
// is not a trail surrogate and when the trie has no value for the pair.
// NOTE(review): the rest of the parameter list (embedded line 1371) and the
// return statements are absent from this listing.
1370 private final int nextSurrogate(RuleBasedCollator collator, int ce,
\r
1373 if (!UTF16.isTrailSurrogate(trail)) {
\r
1374 updateInternalState(m_utilSpecialBackUp_);
\r
1377 // TODO: CE contain the data from the previous CE + the mask.
\r
1378 // It should at least be unmasked
\r
1379 int result = collator.m_trie_.getTrailValue(ce, trail);
\r
1380 if (result == CE_NOT_FOUND_) {
\r
1381 updateInternalState(m_utilSpecialBackUp_);
\r
1387 * Gets the CE expansion offset
\r
1388 * @param collator current collator
\r
1389 * @param ce ce to test
\r
1390 * @return expansion offset
\r
// Extracts bits 4..23 of the CE (mask 0xFFFFF0, shifted right by 4) and
// subtracts the collator's expansion base offset.
1392 private int getExpansionOffset(RuleBasedCollator collator, int ce)
\r
1394 return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
\r
1399 * Gets the contraction ce offset
\r
1400 * @param collator current collator
\r
1401 * @param ce current ce
\r
1402 * @return contraction offset
\r
// Takes the low 24 bits of the CE and subtracts the collator's contraction
// base offset.
1404 private int getContractionOffset(RuleBasedCollator collator, int ce)
\r
1406 return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
\r
1410 * Checks if CE is a special tag CE
\r
1411 * @param ce to check
\r
1412 * @return true if CE is a special tag CE, false otherwise
\r
// True only if the CE is a special CE and its tag equals CE_SPEC_PROC_TAG_.
1414 private boolean isSpecialPrefixTag(int ce)
\r
1416 return RuleBasedCollator.isSpecial(ce) &&
\r
1417 RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
\r
1421 * <p>Special processing getting a CE that is preceded by a certain
\r
1423 * <p>Used for optimizing Japanese length and iteration marks. When a
\r
1424 * special processing tag is encountered, iterate backwards to see if
\r
1425 * there's a match.</p>
\r
1426 * <p>Contraction tables are used, prefix data is stored backwards in the
\r
1428 * @param collator collator to use
\r
1429 * @param ce current ce
\r
1430 * @param entrybackup entry backup iterator status
\r
1431 * @return next collation element
\r
// Prefix matching: saves the current position, rewinds to the entry position
// and walks backwards through the text, matching each previous character
// against the ordered contraction table until a non-prefix CE is reached.
// NOTE(review): the loop header, breaks, else keywords, the table-advance
// statement and the return are absent from this listing.
1433 private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
\r
1434 Backup entrybackup)
\r
// Keep the current position so it can be restored after a successful match.
1436 backupInternalState(m_utilSpecialBackUp_);
\r
1437 updateInternalState(entrybackup);
\r
1439 // We want to look at the character where we entered
\r
1442 // This loop will run once per source string character, for as
\r
1443 // long as we are matching a potential contraction sequence
\r
1444 // First we position ourselves at the begining of contraction
\r
1446 int entryoffset = getContractionOffset(collator, ce);
\r
1447 int offset = entryoffset;
\r
// Nothing precedes the current character: use the CE at the entry offset.
1448 if (isBackwardsStart()) {
\r
1449 ce = collator.m_contractionCE_[offset];
\r
1452 char previous = (char)previousChar();
\r
1453 while (previous > collator.m_contractionIndex_[offset]) {
\r
1454 // contraction characters are ordered, skip smaller characters
\r
1458 if (previous == collator.m_contractionIndex_[offset]) {
\r
1459 // Found the source string char in the table.
\r
1460 // Pick up the corresponding CE from the table.
\r
1461 ce = collator.m_contractionCE_[offset];
\r
1464 // Source string char was not in the table, prefix not found
\r
1465 ce = collator.m_contractionCE_[entryoffset];
\r
1468 if (!isSpecialPrefixTag(ce)) {
\r
1469 // The source string char was in the contraction table, and
\r
1470 // the corresponding CE is not a prefix CE. We found the
\r
1471 // prefix, break out of loop, this CE will end up being
\r
1472 // returned. This is the normal way out of prefix handling
\r
1473 // when the source actually contained the prefix.
\r
// On success go back to where the caller was; on failure go back to the entry
// position.
1477 if (ce != CE_NOT_FOUND_) {
\r
1478 // we found something and we can merilly continue
\r
1479 updateInternalState(m_utilSpecialBackUp_);
\r
1481 else { // prefix search was a failure, we have to backup all the way to
\r
1483 updateInternalState(entrybackup);
\r
1489 * Checks if the ce is a contraction tag
\r
1490 * @param ce ce to check
\r
1491 * @return true if ce is a contraction tag, false otherwise
\r
// True only if the CE is a special CE and its tag equals CE_CONTRACTION_TAG_.
1493 private boolean isContractionTag(int ce)
\r
1495 return RuleBasedCollator.isSpecial(ce) &&
\r
1496 RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
\r
1500 * Method to copy skipped characters into the buffer and sets the fcd
\r
1501 * position. To ensure that the skipped characters are considered later,
\r
1502 * we need to place it in the appropriate position in the buffer and
\r
1503 * reassign the source index. simple case if index reside in string,
\r
1504 * simply copy to buffer and fcdposition = pos, pos = start of buffer.
\r
1505 * if pos in normalization buffer, we'll insert the copy in front of pos
\r
1506 * and point pos to the start of the buffer. why am i doing these copies?
\r
1507 * well, so that the whole chunk of codes in the getNextCE,
\r
1508 * ucol_prv_getSpecialCE does not require any changes, which will be
\r
1510 * @param skipped character buffer
\r
// Puts the skipped characters at the front of the normalization buffer so
// that they are read next, and resets the buffer offset to 0.
// NOTE(review): the else keyword separating the two branches is absent from
// this listing.
1512 private void setDiscontiguous(StringBuffer skipped)
\r
// Already in the buffer: the consumed prefix [0, m_bufferOffset_) is replaced
// by the skipped characters.
1514 if (m_bufferOffset_ >= 0) {
\r
1515 m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
\r
// Reading from the source: remember the source index as the FCD limit and
// make the buffer hold exactly the skipped characters.
1518 m_FCDLimit_ = m_source_.getIndex();
\r
1519 m_buffer_.setLength(0);
\r
1520 m_buffer_.append(skipped.toString());
\r
1523 m_bufferOffset_ = 0;
\r
1527 * Returns the current character for forward iteration
\r
1528 * @return current character
\r
// Returns the character most recently consumed by forward iteration without
// changing the position.
1530 private int currentChar()
\r
1532 if (m_bufferOffset_ < 0) {
\r
// Step back one and read forward again; the source index ends up unchanged.
1533 m_source_.previous();
\r
1534 return m_source_.next();
\r
1537 // m_bufferOffset_ is never 0 in normal circumstances except after a
\r
1538 // discontiguous contraction since it is always returned and moved
\r
1539 // by 1 when we do nextChar()
\r
1540 return m_buffer_.charAt(m_bufferOffset_ - 1);
\r
1544 * Method to get the discontiguous collation element within the source.
\r
1545 * Note this function will set the position to the appropriate places.
\r
1546 * Passed in character offset points to the second combining character
\r
1547 * after the start character.
\r
1548 * @param collator current collator used
\r
1549 * @param entryoffset index to the start character in the contraction table
\r
1550 * @return discontiguous collation element offset
\r
// Tries to complete a contraction whose remaining characters are separated
// from the start character by other combining marks. Unmatched marks are
// collected in m_utilSkippedBuffer_ and pushed back through
// setDiscontiguous(); if nothing matches, the state saved in
// m_utilSpecialDiscontiguousBackUp_ is restored and the CE at entryoffset is
// returned.
// NOTE(review): the loop header, else keywords, breaks/continues, the
// declaration of 'nextch' and some returns are absent from this listing.
1552 private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
\r
1554 int offset = entryoffset;
\r
1555 boolean multicontraction = false;
\r
1556 // since it will be stuffed into this iterator and ran over again
\r
// The skipped-character buffer is created on first use and emptied otherwise.
1557 if (m_utilSkippedBuffer_ == null) {
\r
1558 m_utilSkippedBuffer_ = new StringBuffer();
\r
1561 m_utilSkippedBuffer_.setLength(0);
\r
1563 char ch = (char)currentChar();
\r
1564 m_utilSkippedBuffer_.append((char)currentChar());
\r
1565 // accent after the first character
\r
1566 if (m_utilSpecialDiscontiguousBackUp_ == null) {
\r
1567 m_utilSpecialDiscontiguousBackUp_ = new Backup();
\r
1569 backupInternalState(m_utilSpecialDiscontiguousBackUp_);
\r
1573 int ch_int = nextChar();
\r
1574 nextch = (char)ch_int;
\r
// Stop at end of input or at a character with combining class 0.
1575 if (ch_int == UCharacterIterator.DONE
\r
1576 || getCombiningClass(nextch) == 0) {
\r
1577 // if there are no more accents to move around
\r
1578 // we don't have to shift previousChar, since we are resetting
\r
1579 // the offset later
\r
1580 if (multicontraction) {
\r
1581 if (ch_int != UCharacterIterator.DONE) {
\r
1582 previousChar(); // backtrack
\r
1584 setDiscontiguous(m_utilSkippedBuffer_);
\r
1585 return collator.m_contractionCE_[offset];
\r
1590 offset ++; // skip the combining class offset
\r
// Advance through the ordered contraction index, bounded by the table length.
1591 while ((offset < collator.m_contractionIndex_.length) &&
\r
1592 (nextch > collator.m_contractionIndex_[offset])) {
\r
1596 int ce = CE_NOT_FOUND_;
\r
1597 if ( offset >= collator.m_contractionIndex_.length) {
\r
1600 if ( nextch != collator.m_contractionIndex_[offset]
\r
1601 || getCombiningClass(nextch) == getCombiningClass(ch)) {
\r
1602 // unmatched or blocked character
\r
1603 if ( (m_utilSkippedBuffer_.length()!= 1) ||
\r
1604 ((m_utilSkippedBuffer_.charAt(0)!= nextch) &&
\r
1605 (m_bufferOffset_<0) )) { // avoid push to skipped buffer twice
\r
1606 m_utilSkippedBuffer_.append(nextch);
\r
1608 offset = entryoffset; // Restore the offset before checking next character.
\r
1612 ce = collator.m_contractionCE_[offset];
\r
1615 if (ce == CE_NOT_FOUND_) {
\r
1618 else if (isContractionTag(ce)) {
\r
1619 // this is a multi-contraction
\r
1620 offset = getContractionOffset(collator, ce);
\r
// A usable CE exists at the new table start: remember this position as the
// fallback for the longer match.
1621 if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
\r
1622 multicontraction = true;
\r
1623 backupInternalState(m_utilSpecialDiscontiguousBackUp_);
\r
1627 setDiscontiguous(m_utilSkippedBuffer_);
\r
// No match: restore the saved position and fall back to the entry CE.
1632 updateInternalState(m_utilSpecialDiscontiguousBackUp_);
\r
1633 // backup is one forward of the base character, we need to move back
\r
1636 return collator.m_contractionCE_[entryoffset];
\r
1640 * Gets the next contraction ce
\r
1641 * @param collator collator to use
\r
1642 * @param ce current ce
\r
1643 * @return ce of the next contraction
\r
// Matches the longest contraction starting at the current position. Each
// round reads one character, looks it up in the ordered contraction table
// and either follows a nested contraction CE, tries a discontiguous match, or
// restores the position saved in m_utilSpecialBackUp_ when nothing matches.
// NOTE(review): the loop header, else keywords, breaks, several
// previousChar()-style backtracks, declarations of 'miss'/'sCC' and the
// return are absent from this listing.
1645 private int nextContraction(RuleBasedCollator collator, int ce)
\r
1647 backupInternalState(m_utilSpecialBackUp_);
\r
1648 int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
\r
1650 int entryoffset = getContractionOffset(collator, ce);
\r
1651 int offset = entryoffset;
\r
1654 ce = collator.m_contractionCE_[offset];
\r
1655 if (ce == CE_NOT_FOUND_) {
\r
1656 // back up the source over all the chars we scanned going
\r
1657 // into this contraction.
\r
1659 updateInternalState(m_utilSpecialBackUp_);
\r
1664 // get the discontiguos maximum combining class
\r
// The first index entry of a table packs two values: low byte and high byte.
1665 int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
\r
1666 // checks if all characters have the same combining class
\r
1667 byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
\r
1668 char ch = (char)nextChar();
\r
1670 while (ch > collator.m_contractionIndex_[offset]) {
\r
1671 // contraction characters are ordered, skip all smaller
\r
1675 if (ch == collator.m_contractionIndex_[offset]) {
\r
1676 // Found the source string char in the contraction table.
\r
1677 // Pick up the corresponding CE from the table.
\r
1678 ce = collator.m_contractionCE_[offset];
\r
1681 // Source string char was not in contraction table.
\r
1682 // Unless it is a discontiguous contraction, we are done
\r
1684 if(UTF16.isLeadSurrogate(ch)) { // in order to do the proper detection, we
\r
1685 // need to see if we're dealing with a supplementary
\r
1686 miss = UCharacterProperty.getRawSupplementary(ch, (char) nextChar());
\r
// NOTE(review): the last operand of this condition (embedded line 1691) is
// absent from this listing.
1689 if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
\r
1690 || sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
\r
1692 // Contraction can not be discontiguous, back up by one
\r
1694 if(miss > 0xFFFF) {
\r
1697 ce = collator.m_contractionCE_[entryoffset];
\r
1700 // Contraction is possibly discontiguous.
\r
1701 // find the next character if ch is not a base character
\r
1702 int ch_int = nextChar();
\r
1703 if (ch_int != UCharacterIterator.DONE) {
\r
1706 char nextch = (char)ch_int;
\r
1707 if (getCombiningClass(nextch) == 0) {
\r
1709 if(miss > 0xFFFF) {
\r
1712 // base character not part of discontiguous contraction
\r
1713 ce = collator.m_contractionCE_[entryoffset];
\r
1716 ce = nextDiscontiguous(collator, entryoffset);
\r
1721 if (ce == CE_NOT_FOUND_) {
\r
1722 // source did not match the contraction, revert back original
\r
1723 updateInternalState(m_utilSpecialBackUp_);
\r
1728 // source was a contraction
\r
1729 if (!isContractionTag(ce)) {
\r
1733 // ccontinue looping to check for the remaining contraction.
\r
1734 if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
\r
1735 // there are further contractions to be performed, so we store
\r
1736 // the so-far completed ce, so that if we fail in the next
\r
1737 // round we just return this one.
\r
1738 entryce = collator.m_contractionCE_[entryoffset];
\r
1739 backupInternalState(m_utilSpecialBackUp_);
\r
// The saved position is moved back by one unit, in the buffer or in the
// source depending on where the iterator was reading.
1740 if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
\r
1741 m_utilSpecialBackUp_.m_bufferOffset_ --;
\r
1744 m_utilSpecialBackUp_.m_offset_ --;
\r
1752 * Gets the next ce for long primaries, stuffs the rest of the collation
\r
1753 * elements into the ce buffer
\r
1754 * @param ce current ce
\r
// Expands a long-primary CE into two buffered CEs and returns the first. The
// second (a continuation built from the low byte of 'ce') stays queued: the
// CE buffer offset is 1 and its size is 2.
// NOTE(review): the final operand of the m_CEBuffer_[0] expression (embedded
// line 1764) is absent from this listing.
1757 private int nextLongPrimary(int ce)
\r
1759 m_CEBuffer_[1] = ((ce & 0xFF) << 24)
\r
1760 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
1761 m_CEBufferOffset_ = 1;
\r
1762 m_CEBufferSize_ = 2;
\r
1763 m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
\r
1765 return m_CEBuffer_[0];
\r
1769 * Gets the number of expansion
\r
1770 * @param ce current ce
\r
1771 * @return number of expansion
\r
// NOTE(review): the body of this method is absent from this listing. Its
// caller nextExpansion() treats a result of 0 as "the expansion list is
// zero-terminated" and any other result as the number of CEs to copy.
1773 private int getExpansionCount(int ce)
\r
1779 * Gets the next expansion ce and stuffs the rest of the collation elements
\r
1780 * into the ce buffer
\r
1781 * @param collator current collator
\r
1782 * @param ce current ce
\r
1783 * @return next expansion ce
\r
// Loads the CEs of an expansion into m_CEBuffer_ and returns the first one.
// With a non-zero count that many entries are copied; with a zero count the
// entries are copied until a 0 terminator is found in the expansion table.
// NOTE(review): the else keyword between the two copy strategies is absent
// from this listing.
1785 private int nextExpansion(RuleBasedCollator collator, int ce)
\r
1787 // NOTE: we can encounter both continuations and expansions in an
\r
1789 // I have to decide where continuations are going to be dealt with
\r
1790 int offset = getExpansionOffset(collator, ce);
\r
1791 m_CEBufferSize_ = getExpansionCount(ce);
\r
1792 m_CEBufferOffset_ = 1;
\r
1793 m_CEBuffer_[0] = collator.m_expansion_[offset];
\r
1794 if (m_CEBufferSize_ != 0) {
\r
1795 // if there are less than 16 elements in expansion
\r
1796 for (int i = 1; i < m_CEBufferSize_; i ++) {
\r
1797 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
\r
1801 // ce are terminated
\r
1802 m_CEBufferSize_ = 1;
\r
1803 while (collator.m_expansion_[offset] != 0) {
\r
1804 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
1805 collator.m_expansion_[++ offset];
\r
1808 // in case of one element expansion, we
\r
1809 // want to immediately return CEpos
\r
// A single-element expansion leaves nothing queued in the CE buffer.
1810 if (m_CEBufferSize_ == 1) {
\r
1811 m_CEBufferSize_ = 0;
\r
1812 m_CEBufferOffset_ = 0;
\r
1814 return m_CEBuffer_[0];
\r
1818 * Gets the next digit ce
\r
1819 * @param collator current collator
\r
1820 * @param ce current collation element
\r
1821 * @param cp current codepoint
\r
1822 * @return next digit ce
\r
// Produces the CE for a digit. When the collator has numeric collation
// enabled, the whole run of digits is read, packed two decimal digits per
// char into m_utilStringBuffer_ and emitted as a primary CE followed by
// continuation CEs in m_CEBuffer_. Otherwise the CE stored in the expansion
// table for 'ce' is returned.
// NOTE(review): the loop header, the declaration and increments of
// 'digIndx', breaks, else keywords and some returns are absent from this
// listing.
1824 private int nextDigit(RuleBasedCollator collator, int ce, int cp)
\r
1826 // We do a check to see if we want to collate digits as numbers;
\r
1827 // if so we generate a custom collation key. Otherwise we pull out
\r
1828 // the value stored in the expansion table.
\r
1830 if (m_collator_.m_isNumericCollation_){
\r
1831 int collateVal = 0;
\r
1832 int trailingZeroIndex = 0;
\r
1833 boolean nonZeroValReached = false;
\r
1835 // I just need a temporary place to store my generated CEs.
\r
1836 // icu4c uses a unsigned byte array, i'll use a stringbuffer here
\r
1837 // to avoid dealing with the sign problems and array allocation
\r
1838 // clear and set initial string buffer length
\r
// Slots 0 and 1 are filled in later with the header and exponent values;
// digit pairs start at slot 2.
1839 m_utilStringBuffer_.setLength(3);
\r
1841 // We parse the source string until we hit a char that's NOT a
\r
1843 // Use this u_charDigitValue. This might be slow because we have
\r
1844 // to handle surrogates...
\r
1845 int digVal = UCharacter.digit(cp);
\r
1846 // if we have arrived here, we have already processed possible
\r
1847 // supplementaries that trigered the digit tag -
\r
1848 // all supplementaries are marked in the UCA.
\r
1849 // We pad a zero in front of the first element anyways.
\r
1850 // This takes care of the (probably) most common case where
\r
1851 // people are sorting things followed by a single digit
\r
1854 // Make sure we have enough space.
\r
1855 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
\r
1856 m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
\r
1859 // Skipping over leading zeroes.
\r
1860 if (digVal != 0 || nonZeroValReached) {
\r
1861 if (digVal != 0 && !nonZeroValReached) {
\r
1862 nonZeroValReached = true;
\r
1864 // We parse the digit string into base 100 numbers
\r
1865 // (this fits into a byte).
\r
1866 // We only add to the buffer in twos, thus if we are
\r
1867 // parsing an odd character, that serves as the
\r
1868 // 'tens' digit while the if we are parsing an even
\r
1869 // one, that is the 'ones' digit. We dumped the
\r
1870 // parsed base 100 value (collateVal) into a buffer.
\r
1871 // We multiply each collateVal by 2 (to give us room)
\r
1872 // and add 5 (to avoid overlapping magic CE byte
\r
1873 // values). The last byte we subtract 1 to ensure it is
\r
1874 // less than all the other bytes.
\r
// NOTE(review): the comment above says "add 5" while the code below adds 6;
// the later "subtract one off of the last byte" step may account for the
// difference - confirm.
1875 if (digIndx % 2 == 1) {
\r
1876 collateVal += digVal;
\r
1877 // This removes trailing zeroes.
\r
1878 if (collateVal == 0 && trailingZeroIndex == 0) {
\r
1879 trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
\r
1881 else if (trailingZeroIndex != 0) {
\r
1882 trailingZeroIndex = 0;
\r
1884 m_utilStringBuffer_.setCharAt(
\r
1885 ((digIndx - 1) >>> 1) + 2,
\r
1886 (char)((collateVal << 1) + 6));
\r
1890 // We drop the collation value into the buffer so if
\r
1891 // we need to do a "front patch" we don't have to
\r
1892 // check to see if we're hitting the last element.
\r
1893 collateVal = digVal * 10;
\r
1894 m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
\r
1895 (char)((collateVal << 1) + 6));
\r
1900 // Get next character.
\r
// The position is saved before each read so that a non-digit can be pushed
// back by restoring it.
1902 backupInternalState(m_utilSpecialBackUp_);
\r
1903 int char32 = nextChar();
\r
1904 char ch = (char)char32;
\r
1905 if (UTF16.isLeadSurrogate(ch)){
\r
1907 char trail = (char)nextChar();
\r
1908 if (UTF16.isTrailSurrogate(trail)) {
\r
1909 char32 = UCharacterProperty.getRawSupplementary(
\r
1918 digVal = UCharacter.digit(char32);
\r
1919 if (digVal == -1) {
\r
1920 // Resetting position to point to the next unprocessed
\r
1921 // char. We overshot it when doing our test/set for
\r
1923 updateInternalState(m_utilSpecialBackUp_);
\r
// Only zeroes were seen: store the encoded value for zero in slot 2.
1932 if (nonZeroValReached == false){
\r
1934 m_utilStringBuffer_.setCharAt(2, (char)6);
\r
1937 int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
\r
1938 : (digIndx >>> 1) + 2;
\r
1939 if (digIndx % 2 != 0){
\r
1940 // We missed a value. Since digIndx isn't even, stuck too many
\r
1941 // values into the buffer (this is what we get for padding the
\r
1942 // first byte with a zero). "Front-patch" now by pushing all
\r
1943 // nybbles forward.
\r
1944 // Doing it this way ensures that at least 50% of the time
\r
1945 // (statistically speaking) we'll only be doing a single pass
\r
1946 // and optimizes for strings with single digits. I'm just
\r
1947 // assuming that's the more common case.
\r
1948 for (int i = 2; i < endIndex; i ++){
\r
1949 m_utilStringBuffer_.setCharAt(i,
\r
1950 (char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
\r
1952 + (((m_utilStringBuffer_.charAt(i + 1) - 6)
\r
1953 >>> 1) / 10) << 1) + 6));
\r
1958 // Subtract one off of the last byte.
\r
1959 m_utilStringBuffer_.setCharAt(endIndex - 1,
\r
1960 (char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
\r
1962 // We want to skip over the first two slots in the buffer.
\r
1963 // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
\r
1964 // The second slot is for the sign/exponent byte:
\r
1965 // 0x80 + (decimalPos/2) & 7f.
\r
1966 m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
\r
1967 m_utilStringBuffer_.setCharAt(1,
\r
1968 (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
\r
1970 // Now transfer the collation key to our collIterate struct.
\r
1971 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
\r
1972 ce = (((m_utilStringBuffer_.charAt(0) << 8)
\r
1973 // Primary weight
\r
1974 | m_utilStringBuffer_.charAt(1))
\r
1975 << RuleBasedCollator.CE_PRIMARY_SHIFT_)
\r
1976 // Secondary weight
\r
1977 | (RuleBasedCollator.BYTE_COMMON_
\r
1978 << RuleBasedCollator.CE_SECONDARY_SHIFT_)
\r
1979 | RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
\r
1980 int i = 2; // Reset the index into the buffer.
\r
1982 m_CEBuffer_[0] = ce;
\r
1983 m_CEBufferSize_ = 1;
\r
1984 m_CEBufferOffset_ = 1;
\r
// Remaining slots are packed two per continuation CE as primary weights.
1985 while (i < endIndex)
\r
1987 int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
\r
1988 if (i < endIndex) {
\r
1989 primWeight |= m_utilStringBuffer_.charAt(i ++);
\r
1991 m_CEBuffer_[m_CEBufferSize_ ++]
\r
1992 = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
\r
1993 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
1998 // no numeric mode, we'll just switch to whatever we stashed and
\r
2000 // find the offset to expansion table
\r
2001 return collator.m_expansion_[getExpansionOffset(collator, ce)];
\r
2005 * Gets the next implicit ce for codepoints
\r
2006 * @param codepoint current codepoint
\r
2007 * @return implicit ce
\r
// Builds the implicit CEs for a code point: the value from
// impCEGen_.getImplicitFromCodePoint is split into m_CEBuffer_[0] (its
// primary-masked part) and m_CEBuffer_[1] (its low 16 bits moved to the top
// half, ORed with 0xC0). The first is returned and the second stays queued.
// NOTE(review): the statement taken for illegal code points and the second
// operand of the m_CEBuffer_[0] expression are absent from this listing.
2009 private int nextImplicit(int codepoint)
\r
2011 if (!UCharacter.isLegal(codepoint)) {
\r
2012 // synwee to check with vladimir on the range of isNonChar()
\r
2013 // illegal code value, use completely ignoreable!
\r
2016 int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
\r
2017 m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
\r
2019 m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
\r
2020 m_CEBufferOffset_ = 1;
\r
2021 m_CEBufferSize_ = 2;
\r
2022 return m_CEBuffer_[0];
\r
2026 * Returns the next ce associated with the following surrogate characters
\r
2027 * @param ch current character
\r
// Forward iteration for a lead surrogate `ch`: reads the following character
// and, when it is a trail surrogate, returns the implicit CE of the combined
// supplementary code point. Otherwise the visible fall-through returns IGNORABLE.
2030 private int nextSurrogate(char ch)
\r
// Keep the int result so the end-of-text sentinel can be tested before the
// value is narrowed to a char.
2032 int ch_int = nextChar();
\r
2033 char nextch = (char)ch_int;
\r
2034 if (ch_int != CharacterIterator.DONE &&
\r
2035 UTF16.isTrailSurrogate(nextch)) {
\r
// Valid pair: combine lead and trail, then delegate to the implicit-CE path.
2036 int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
\r
2037 return nextImplicit(codepoint);
\r
// Not a valid pair: un-read the character unless the read hit end of text.
2039 if (nextch != CharacterIterator.DONE) {
\r
2040 previousChar(); // reverts back to the original position
\r
2042 return IGNORABLE; // completely ignorable
\r
2046 * Returns the next ce for a hangul character, this is an implicit
\r
2048 * @param collator current collator
\r
2049 * @param ch current character
\r
2050 * @return hangul ce
\r
// Forward iteration for a Hangul syllable: splits `ch` arithmetically into
// L/V/T components and either buffers their CEs directly (fast path) or
// appends the components to m_buffer_ so they are processed as characters.
// NOTE(review): brace-only/short lines are missing from this listing, so the
// exact branch boundaries and the final return are not visible.
2052 private int nextHangul(RuleBasedCollator collator, char ch)
\r
// Zero-based index of the syllable relative to HANGUL_SBASE_.
2054 char L = (char)(ch - HANGUL_SBASE_);
\r
2056 // divide into pieces
\r
2057 // do it in this order since some compilers can do % and / in one
\r
// T = index % TCOUNT, then V = (index / TCOUNT) % VCOUNT, remainder is L.
2059 char T = (char)(L % HANGUL_TCOUNT_);
\r
2060 L /= HANGUL_TCOUNT_;
\r
2061 char V = (char)(L % HANGUL_VCOUNT_);
\r
2062 L /= HANGUL_VCOUNT_;
\r
// Turn the three indices into characters by adding the respective bases.
2065 L += HANGUL_LBASE_;
\r
2066 V += HANGUL_VBASE_;
\r
2067 T += HANGUL_TBASE_;
\r
2069 // return the first CE, but first put the rest into the expansion
\r
2071 m_CEBufferSize_ = 0;
\r
2072 if (!collator.m_isJamoSpecial_) { // FAST PATH
\r
// Look up each component's CE in the collator trie and queue it.
2073 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2074 collator.m_trie_.getLeadValue(L);
\r
2075 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2076 collator.m_trie_.getLeadValue(V);
\r
// T equal to HANGUL_TBASE_ means the T index was zero; that component is
// skipped (presumably "no trailing consonant" - confirm).
2078 if (T != HANGUL_TBASE_) {
\r
2079 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2080 collator.m_trie_.getLeadValue(T);
\r
// Element 0 is returned now; the offset is left at 1 for the remaining ones.
2082 m_CEBufferOffset_ = 1;
\r
2083 return m_CEBuffer_[0];
\r
2086 // Jamo is Special
\r
2087 // Since Hanguls pass the FCD check, it is guaranteed that we
\r
2088 // won't be in the normalization buffer if something like this
\r
2090 // Move Jamos into normalization buffer
\r
2091 m_buffer_.append((char)L);
\r
2092 m_buffer_.append((char)V);
\r
2093 if (T != HANGUL_TBASE_) {
\r
2094 m_buffer_.append((char)T);
\r
// Record the one-character source span [index - 1, index) in the FCD markers.
2096 m_FCDLimit_ = m_source_.getIndex();
\r
2097 m_FCDStart_ = m_FCDLimit_ - 1;
\r
2098 // Indicate where to continue in main input string after
\r
2099 // exhausting the buffer
\r
2105 * <p>Special CE management. Expansions, contractions etc...</p>
\r
2106 * @param collator can be plain UCA
\r
2107 * @param ce current ce
\r
2108 * @param ch current character
\r
2109 * @return next special ce
\r
// Forward-iteration dispatcher for "special" CEs: switches on the CE's tag and
// either returns the result of a tag-specific helper or replaces `ce` and
// (per the comments below) goes round again until `ce` is no longer special.
// NOTE(review): the loop header, the break/return lines after several cases and
// the finally clause are missing from this listing; only visible statements
// are described.
2111 private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
\r
// Widened copy of ch, passed to helpers that take an int code point.
2113 int codepoint = ch;
\r
// Take the cached Backup object if one is available; the field is cleared
// while it is in use here.
2114 Backup entrybackup = m_utilSpecialEntryBackUp_;
\r
2115 // this is to handle recursive looping
\r
2116 if (entrybackup != null) {
\r
2117 m_utilSpecialEntryBackUp_ = null;
\r
// No cached object available (presumably the else branch): allocate a fresh one.
2120 entrybackup = new Backup();
\r
// Snapshot the iterator state as it is on entry.
2122 backupInternalState(entrybackup);
\r
2123 try { // forces it to assign m_utilSpecialEntryBackUp_
\r
2125 // This loop will repeat only in the case of contractions,
\r
2127 switch(RuleBasedCollator.getTag(ce)) {
\r
2128 case CE_NOT_FOUND_TAG_:
\r
2129 // impossible case for icu4j
\r
2131 case RuleBasedCollator.CE_SURROGATE_TAG_:
\r
// Save state, read the next character as the trail unit and resolve the
// pair through the three-argument nextSurrogate overload.
2135 backupInternalState(m_utilSpecialBackUp_);
\r
2136 char trail = (char)nextChar();
\r
2137 ce = nextSurrogate(collator, ce, trail);
\r
2138 // calculate the supplementary code point value,
\r
2139 // if surrogate was not tailored we go one more round
\r
2141 UCharacterProperty.getRawSupplementary(ch, trail);
\r
2143 case CE_SPEC_PROC_TAG_:
\r
2144 ce = nextSpecialPrefix(collator, ce, entrybackup);
\r
2146 case CE_CONTRACTION_TAG_:
\r
2147 ce = nextContraction(collator, ce);
\r
2149 case CE_LONG_PRIMARY_TAG_:
\r
2150 return nextLongPrimary(ce);
\r
2151 case CE_EXPANSION_TAG_:
\r
2152 return nextExpansion(collator, ce);
\r
2153 case CE_DIGIT_TAG_:
\r
2154 ce = nextDigit(collator, ce, codepoint);
\r
2156 // various implicits optimization
\r
2157 case CE_CJK_IMPLICIT_TAG_:
\r
2158 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
\r
2159 return nextImplicit(codepoint);
\r
2160 case CE_IMPLICIT_TAG_: // everything that is not defined
\r
2161 return nextImplicit(codepoint);
\r
2162 case CE_TRAIL_SURROGATE_TAG_:
\r
2163 return IGNORABLE; // DC00-DFFF broken surrogate
\r
2164 case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
\r
// One-argument overload: reads the trail unit itself.
2165 return nextSurrogate(ch);
\r
2166 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
\r
2167 return nextHangul(collator, ch);
\r
2168 case CE_CHARSET_TAG_:
\r
2169 // not yet implemented probably after 1.8
\r
2170 return CE_NOT_FOUND_;
\r
2173 // synwee todo, throw exception or something here.
\r
// Test applied after the switch; the action taken for a non-special ce is
// not visible in this listing.
2175 if (!RuleBasedCollator.isSpecial(ce)) {
\r
// Hand the Backup object back to the cache field for reuse.
2181 m_utilSpecialEntryBackUp_ = entrybackup;
\r
2187 * Special processing is getting a CE that is preceded by a certain prefix.
\r
2188 * Currently this is only needed for optimizing Japanese length and
\r
2189 * iteration marks. When we encounter a special processing tag, we go
\r
2190 * backwards and try to see if we have a match. Contraction tables are used
\r
2191 * - so the whole process is not unlike contraction. prefix data is stored
\r
2192 * backwards in the table.
\r
2193 * @param collator current collator
\r
2194 * @param ce current ce
\r
2195 * @return previous ce
\r
// Backward-iteration prefix matching: walks backwards through the text and
// follows the contraction tables (m_contractionIndex_ / m_contractionCE_) to
// find the CE for the longest matching prefix, then restores the iterator
// state saved on entry.
// NOTE(review): loop headers, break statements and several branch bodies are
// missing from this listing; comments describe only the visible statements.
2197 private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
\r
// Snapshot the state; it is restored by updateInternalState() at the end.
2199 backupInternalState(m_utilSpecialBackUp_);
\r
2201 // position ourselves at the beginning of contraction sequence
\r
2202 int offset = getContractionOffset(collator, ce);
\r
// Remember the table entry point; it is used again below when the
// preceding character is not found in the table.
2203 int entryoffset = offset;
\r
// Nothing precedes the current position: take the entry's own CE.
2204 if (isBackwardsStart()) {
\r
2205 ce = collator.m_contractionCE_[offset];
\r
2208 char prevch = (char)previousChar();
\r
2209 while (prevch > collator.m_contractionIndex_[offset]) {
\r
2210 // since contraction codepoints are ordered, we skip all that
\r
// Exact hit in the table: continue with the CE stored for that character.
2214 if (prevch == collator.m_contractionIndex_[offset]) {
\r
2215 ce = collator.m_contractionCE_[offset];
\r
2218 // if there is a completely ignorable code point in the middle
\r
2219 // of a prefix, we need to act as if it's not there assumption:
\r
2220 // 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
\r
2222 // lone surrogates cannot be set to zero as it would break
\r
2223 // other processing
\r
// Despite the name, isZeroCE holds the trie value for prevch; 0 is the
// value tested for below.
2224 int isZeroCE = collator.m_trie_.getLeadValue(prevch);
\r
2225 // it's easy for BMP code points
\r
2226 if (isZeroCE == 0) {
\r
2229 else if (UTF16.isTrailSurrogate(prevch)
\r
2230 || UTF16.isLeadSurrogate(prevch)) {
\r
2231 // for supplementary code points, we have to check the next one
\r
2232 // situations where we are going to ignore
\r
2233 // 1. beginning of the string: schar is a lone surrogate
\r
2234 // 2. schar is a lone surrogate
\r
2235 // 3. schar is a trail surrogate in a valid surrogate
\r
2236 // sequence that is explicitly set to zero.
\r
2237 if (!isBackwardsStart()) {
\r
// Step back once more to look for the lead half of the pair.
2238 char lead = (char)previousChar();
\r
2239 if (UTF16.isLeadSurrogate(lead)) {
\r
2240 isZeroCE = collator.m_trie_.getLeadValue(lead);
\r
2241 if (RuleBasedCollator.getTag(isZeroCE)
\r
2242 == RuleBasedCollator.CE_SURROGATE_TAG_) {
\r
2243 int finalCE = collator.m_trie_.getTrailValue(
\r
2246 if (finalCE == 0) {
\r
2247 // this is a real, assigned completely
\r
2248 // ignorable code point
\r
2254 nextChar(); // revert to original offset
\r
2255 // lone surrogate, completely ignorable
\r
2258 nextChar(); // revert to original offset
\r
2261 // lone surrogate at the beginning, completely ignorable
\r
2266 // char was not in the table. prefix not found
\r
2267 ce = collator.m_contractionCE_[entryoffset];
\r
2270 if (!isSpecialPrefixTag(ce)) {
\r
2271 // char was in the contraction table, and the corresponding ce
\r
2272 // is not a prefix ce. We found the prefix, break out of loop,
\r
2273 // this ce will end up being returned.
\r
// Put the iterator back where it was on entry; only the CE is kept.
2277 updateInternalState(m_utilSpecialBackUp_);
\r
2282 * Retrieves the previous contraction ce. To ensure that the backwards and
\r
2283 * forwards iteration matches, we take the current region of most possible
\r
2284 * match and pass it through the forward iteration. This will ensure that
\r
2285 * the obstinate problem of overlapping contractions will not occur.
\r
2286 * @param collator current collator
\r
2287 * @param ce current ce
\r
2288 * @param ch current character
\r
2289 * @return previous contraction ce
\r
// Backward iteration over a contraction: collects the run of "unsafe"
// characters preceding the current position into m_utilStringBuffer_, runs a
// helper CollationElementIterator forwards over that text, copies every CE it
// yields into m_CEBuffer_ and returns the last one.
// NOTE(review): several short lines (loop bookkeeping, try/else, catch bodies)
// are missing from this listing.
// NOTE(review): the collator's decomposition mode is changed below and only
// restored on the normal path shown at listing line 2357; no finally is
// visible, so an exception or early return in between would presumably leave
// the collator in NO_DECOMPOSITION - confirm against the full source.
2291 private int previousContraction(RuleBasedCollator collator, int ce, char ch)
\r
// Start with an empty scratch buffer.
2293 m_utilStringBuffer_.setLength(0);
\r
2294 // since we might encounter normalized characters (from the thai
\r
2295 // processing) we can't use peekCharacter() here.
\r
2296 char prevch = (char)previousChar();
\r
2297 boolean atStart = false;
\r
2298 // TODO: address the comment above - maybe now we *can* use peekCharacter
\r
2299 //while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
\r
// Prepend characters while the current one is unsafe, so the buffer ends up
// in source order.
2300 while (collator.isUnsafe(ch)) {
\r
2301 m_utilStringBuffer_.insert(0, ch);
\r
2303 if (isBackwardsStart()) {
\r
2307 prevch = (char)previousChar();
\r
2310 // undo the previousChar() if we didn't reach the beginning
\r
2313 // adds the initial base character to the string
\r
2314 m_utilStringBuffer_.insert(0, ch);
\r
2316 // a new collation element iterator is used to simplify things, since
\r
2317 // using the current collation element iterator will mean that the
\r
2318 // forward and backwards iteration will share and change the same
\r
2319 // buffers. it is going to be painful.
\r
2320 int originaldecomp = collator.getDecomposition();
\r
2321 // for faster access, since string would have been normalized above
\r
2322 collator.setDecomposition(Collator.NO_DECOMPOSITION);
\r
// Create the helper iterator on first use; the later lines re-target an
// existing one (presumably the else branch).
2323 if (m_utilColEIter_ == null) {
\r
2324 m_utilColEIter_ = new CollationElementIterator(
\r
2325 m_utilStringBuffer_.toString(),
\r
2329 m_utilColEIter_.m_collator_ = collator;
\r
2330 m_utilColEIter_.setText(m_utilStringBuffer_.toString());
\r
// Drain the helper iterator into m_CEBuffer_ until it reports NULLORDER.
2332 ce = m_utilColEIter_.next();
\r
2333 m_CEBufferSize_ = 0;
\r
2334 while (ce != NULLORDER) {
\r
// Buffer full: grow it by 50 slots, keeping the existing contents.
2335 if (m_CEBufferSize_ == m_CEBuffer_.length) {
\r
2337 // increasing cebuffer size
\r
2338 int tempbuffer[] = new int[m_CEBuffer_.length + 50];
\r
2339 System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
\r
2340 m_CEBuffer_.length);
\r
2341 m_CEBuffer_ = tempbuffer;
\r
2343 catch( MissingResourceException e)
\r
2347 catch (Exception e) {
\r
2349 e.printStackTrace();
\r
2354 m_CEBuffer_[m_CEBufferSize_ ++] = ce;
\r
2355 ce = m_utilColEIter_.next();
\r
// Restore the caller's decomposition mode.
2357 collator.setDecomposition(originaldecomp);
\r
// Going backwards, so the last buffered CE is the one returned first.
2358 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
\r
2359 return m_CEBuffer_[m_CEBufferOffset_];
\r
2363 * Returns the previous long primary ces
\r
2364 * @param ce long primary ce
\r
2365 * @return previous long primary ces
\r
// Backward iteration for a long-primary CE: expands `ce` into two buffered
// CEs and returns the second (last) one, leaving the offset on it.
2367 private int previousLongPrimary(int ce)
\r
2369 m_CEBufferSize_ = 0;
\r
// First CE: bits 8-23 of `ce` moved up to bits 16-31, with CE_BYTE_COMMON_
// placed in each of the two low bytes.
2370 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2371 ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
\r
// Second CE: the low byte of `ce` moved to the top byte, flagged with the
// continuation marker.
2372 m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
\r
2373 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
// Backward order: start reading from the last buffered element.
2374 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
\r
2375 return m_CEBuffer_[m_CEBufferOffset_];
\r
2379 * Returns the previous expansion ces
\r
2380 * @param collator current collator
\r
2381 * @param ce current ce
\r
2382 * @return previous expansion ce
\r
// Backward iteration for an expansion CE: copies the expansion's CEs from
// collator.m_expansion_ into m_CEBuffer_ and returns the last one.
// NOTE(review): the brace/else lines between the two copy loops are missing
// from this listing; the second loop is presumably the else branch.
2384 private int previousExpansion(RuleBasedCollator collator, int ce)
\r
2386 // find the offset to expansion table
\r
2387 int offset = getExpansionOffset(collator, ce);
\r
// A non-zero count is an explicit length; zero selects the terminated form.
2388 m_CEBufferSize_ = getExpansionCount(ce);
\r
2389 if (m_CEBufferSize_ != 0) {
\r
2390 // less than 16 elements in expansion
\r
2391 for (int i = 0; i < m_CEBufferSize_; i ++) {
\r
2392 m_CEBuffer_[i] = collator.m_expansion_[offset + i];
\r
2397 // null terminated ces
\r
// Copy until a 0 entry is met; m_CEBufferSize_ doubles as the running index.
2398 while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
\r
2399 m_CEBuffer_[m_CEBufferSize_] =
\r
2400 collator.m_expansion_[offset + m_CEBufferSize_];
\r
2401 m_CEBufferSize_ ++;
\r
// Backward order: start reading from the last buffered element.
2404 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
\r
2405 return m_CEBuffer_[m_CEBufferOffset_];
\r
2409 * Getting the digit collation elements
\r
2411 * @param ce current collation element
\r
2412 * @param ch current code point
\r
2413 * @return digit collation element
\r
// Backward-iteration handling of a digit-tagged CE. With numeric collation
// enabled, scans digits towards the start of the text, packs them two per
// slot into m_utilStringBuffer_ (slots 0 and 1 are reserved for a header),
// then converts that buffer into one primary CE plus continuation CEs in
// m_CEBuffer_ and returns the last of them. Otherwise returns the
// expansion-table entry for `ce`.
// NOTE(review): many short lines are missing from this listing, including the
// declarations of char32 and digIndx, the scan-loop header and most
// else/break lines; comments describe only the visible statements.
2415 private int previousDigit(RuleBasedCollator collator, int ce, char ch)
\r
2417 // We do a check to see if we want to collate digits as numbers; if so we generate
\r
2418 // a custom collation key. Otherwise we pull out the value stored in the expansion table.
\r
// NOTE(review): tests the m_collator_ field, while the non-numeric path at the
// end uses the `collator` parameter - confirm these are always the same object.
2419 if (m_collator_.m_isNumericCollation_){
\r
2420 int leadingZeroIndex = 0;
\r
2421 int collateVal = 0;
\r
2422 boolean nonZeroValReached = false;
\r
2424 // clear and set initial string buffer length
\r
2425 m_utilStringBuffer_.setLength(3);
\r
2427 // We parse the source string until we hit a char that's NOT a digit
\r
2428 // Use this u_charDigitValue. This might be slow because we have to
\r
2429 // handle surrogates...
\r
// If the starting unit is a trail surrogate, try to pair it with the unit
// before it to form a supplementary code point.
2431 if (UTF16.isTrailSurrogate(ch)) {
\r
2432 if (!isBackwardsStart()){
\r
2433 char lead = (char)previousChar();
\r
2434 if (UTF16.isLeadSurrogate(lead)) {
\r
2435 char32 = UCharacterProperty.getRawSupplementary(lead,
\r
2443 int digVal = UCharacter.digit(char32);
\r
2446 // Make sure we have enough space.
\r
// Each slot from index 2 on holds two digits, hence the (length - 2) * 2 bound.
2447 if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
\r
2448 m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
\r
2451 // Skipping over "trailing" zeroes but we still add to digIndx.
\r
2452 if (digVal != 0 || nonZeroValReached) {
\r
2453 if (digVal != 0 && !nonZeroValReached) {
\r
2454 nonZeroValReached = true;
\r
2457 // We parse the digit string into base 100 numbers (this
\r
2458 // fits into a byte).
\r
2459 // We only add to the buffer in twos, thus if we are
\r
2460 // parsing an odd character, that serves as the 'tens'
\r
2461 // digit while the if we are parsing an even one, that is
\r
2462 // the 'ones' digit. We dumped the parsed base 100 value
\r
2463 // (collateVal) into a buffer. We multiply each collateVal
\r
2464 // by 2 (to give us room) and add 5 (to avoid overlapping
\r
2465 // magic CE byte values). The last byte we subtract 1 to
\r
2466 // ensure it is less than all the other bytes.
\r
2467 // Since we're doing in this reverse we want to put the
\r
2468 // first digit encountered into the ones place and the
\r
2469 // second digit encountered into the tens place.
\r
// NOTE(review): the code below adds 6, not the 5 mentioned above - confirm
// which value is intended.
2471 if (digIndx % 2 == 1){
\r
// Odd index: this digit is the tens place of the pair being built.
2472 collateVal += digVal * 10;
\r
2474 // This removes leading zeroes.
\r
2475 if (collateVal == 0 && leadingZeroIndex == 0) {
\r
2476 leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
\r
2478 else if (leadingZeroIndex != 0) {
\r
2479 leadingZeroIndex = 0;
\r
// Store the completed pair in its slot (slot index = pair index + 2).
2482 m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
\r
2483 (char)((collateVal << 1) + 6));
\r
// Even index (presumably the else branch): start a new pair with this
// digit in the ones place.
2487 collateVal = digVal;
\r
// Fetch the preceding character, saving state first so the read can be undone.
2492 if (!isBackwardsStart()){
\r
2493 backupInternalState(m_utilSpecialBackUp_);
\r
2494 char32 = previousChar();
\r
// NOTE(review): this tests `ch` (the method argument), not the character
// just read into char32 - looks unintended; confirm against upstream.
2496 if (UTF16.isTrailSurrogate(ch)){
\r
2497 if (!isBackwardsStart()) {
\r
2498 char lead = (char)previousChar();
\r
2499 if (UTF16.isLeadSurrogate(lead)) {
\r
2501 = UCharacterProperty.getRawSupplementary(
\r
2505 updateInternalState(m_utilSpecialBackUp_);
\r
// A digit value of -1 marks a non-digit: undo the read.
2510 digVal = UCharacter.digit(char32);
\r
2511 if (digVal == -1) {
\r
2512 updateInternalState(m_utilSpecialBackUp_);
\r
// Only zeros were seen: slot 2 gets the encoding of zero (0 * 2 + 6).
2521 if (nonZeroValReached == false) {
\r
2523 m_utilStringBuffer_.setCharAt(2, (char)6);
\r
// Odd digit count: one pending single digit still has to be handled.
2526 if (digIndx % 2 != 0) {
\r
2527 if (collateVal == 0 && leadingZeroIndex == 0) {
\r
2528 // This removes the leading 0 in a odd number sequence of
\r
2529 // numbers e.g. avery001
\r
2530 leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
\r
2533 // this is not a leading 0, we add it in
\r
2534 m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
\r
2535 (char)((collateVal << 1) + 6));
\r
// endIndex is one past the last used slot, with leading zeros cut off.
2540 int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
\r
2541 : ((digIndx >>> 1) + 2) ;
\r
2542 digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
\r
2543 // Subtract one off of the last byte.
\r
2544 // Really the first byte here, but it's reversed...
\r
2545 m_utilStringBuffer_.setCharAt(2,
\r
2546 (char)(m_utilStringBuffer_.charAt(2) - 1));
\r
2547 // We want to skip over the first two slots in the buffer.
\r
2548 // The first slot is reserved for the header byte CODAN_PLACEHOLDER.
\r
2549 // The second slot is for the sign/exponent byte:
\r
2550 // 0x80 + (decimalPos/2) & 7f.
\r
2551 m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
\r
2552 m_utilStringBuffer_.setCharAt(1,
\r
2553 (char)(0x80 + ((digIndx >>> 1) & 0x7F)));
\r
2555 // Now transfer the collation key to our collIterate struct.
\r
2556 // The total size for our collation key is endIndx bumped up to the
\r
2557 // next largest even value divided by two.
\r
2558 m_CEBufferSize_ = 0;
\r
// First CE: header and exponent bytes as the primary weight, with
// BYTE_COMMON_ in the secondary and tertiary positions.
2559 m_CEBuffer_[m_CEBufferSize_ ++]
\r
2560 = (((m_utilStringBuffer_.charAt(0) << 8)
\r
2561 // Primary weight
\r
2562 | m_utilStringBuffer_.charAt(1))
\r
2563 << RuleBasedCollator.CE_PRIMARY_SHIFT_)
\r
2564 // Secondary weight
\r
2565 | (RuleBasedCollator.BYTE_COMMON_
\r
2566 << RuleBasedCollator.CE_SECONDARY_SHIFT_)
\r
2567 // Tertiary weight.
\r
2568 | RuleBasedCollator.BYTE_COMMON_;
\r
// Walk the digit slots from the highest index down, since they were filled
// in reverse; each continuation CE takes up to two slots.
2569 int i = endIndex - 1; // Reset the index into the buffer.
\r
2571 int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
\r
2573 primWeight |= m_utilStringBuffer_.charAt(i --);
\r
2575 m_CEBuffer_[m_CEBufferSize_ ++]
\r
2576 = (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
\r
2577 | RuleBasedCollator.CE_CONTINUATION_MARKER_;
\r
// Backward order: start reading from the last buffered element.
2579 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
\r
2580 return m_CEBuffer_[m_CEBufferOffset_];
\r
// Numeric collation off: return the CE stored in the expansion table.
2583 return collator.m_expansion_[getExpansionOffset(collator, ce)];
\r
2588 * Returns previous hangul ces
\r
2589 * @param collator current collator
\r
2590 * @param ch current character
\r
2591 * @return previous hangul ce
\r
// Backward iteration for a Hangul syllable: splits `ch` arithmetically into
// L/V/T components and either buffers their CEs and returns the last one, or
// appends the components to m_buffer_ so they are processed as characters.
// NOTE(review): brace/else lines and the final return are missing from this
// listing.
2593 private int previousHangul(RuleBasedCollator collator, char ch)
\r
// Zero-based index of the syllable relative to HANGUL_SBASE_.
2595 char L = (char)(ch - HANGUL_SBASE_);
\r
2596 // we do it in this order since some compilers can do % and / in one
\r
// T = index % TCOUNT, then V = (index / TCOUNT) % VCOUNT, remainder is L.
2598 char T = (char)(L % HANGUL_TCOUNT_);
\r
2599 L /= HANGUL_TCOUNT_;
\r
2600 char V = (char)(L % HANGUL_VCOUNT_);
\r
2601 L /= HANGUL_VCOUNT_;
\r
// Turn the three indices into characters by adding the respective bases.
2604 L += HANGUL_LBASE_;
\r
2605 V += HANGUL_VBASE_;
\r
2606 T += HANGUL_TBASE_;
\r
2608 m_CEBufferSize_ = 0;
\r
2609 if (!collator.m_isJamoSpecial_) {
\r
// Look up each component's CE in the collator trie and queue it.
2610 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2611 collator.m_trie_.getLeadValue(L);
\r
2612 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2613 collator.m_trie_.getLeadValue(V);
\r
// T equal to HANGUL_TBASE_ means the T index was zero; that component is skipped.
2614 if (T != HANGUL_TBASE_) {
\r
2615 m_CEBuffer_[m_CEBufferSize_ ++] =
\r
2616 collator.m_trie_.getLeadValue(T);
\r
// Backward order: start reading from the last buffered element.
2618 m_CEBufferOffset_ = m_CEBufferSize_ - 1;
\r
2619 return m_CEBuffer_[m_CEBufferOffset_];
\r
2622 // Since Hanguls pass the FCD check, it is guaranteed that we won't
\r
2623 // be in the normalization buffer if something like this happens
\r
2624 // Move Jamos into normalization buffer
\r
2625 m_buffer_.append(L);
\r
2626 m_buffer_.append(V);
\r
2627 if (T != HANGUL_TBASE_) {
\r
2628 m_buffer_.append(T);
\r
// Record the one-character source span [index, index + 1) in the FCD markers.
2631 m_FCDStart_ = m_source_.getIndex();
\r
2632 m_FCDLimit_ = m_FCDStart_ + 1;
\r
2638 * Gets implicit codepoint ces
\r
2639 * @param codepoint current codepoint
\r
2640 * @return implicit codepoint ces
\r
// Backward iteration: stores a two-element implicit CE sequence for
// `codepoint` in m_CEBuffer_ and returns its second (last) element.
// NOTE(review): the tail of the slot-0 expression (listing line 2651) is
// missing from this listing.
2642 private int previousImplicit(int codepoint)
\r
2644 if (!UCharacter.isLegal(codepoint)) {
\r
2645 return IGNORABLE; // illegal code value, completely ignorable!
\r
// Implicit weight for this code point, taken from the shared generator.
2647 int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
\r
// Two elements are buffered; the offset points at slot 1, which is the
// element returned below.
2648 m_CEBufferSize_ = 2;
\r
2649 m_CEBufferOffset_ = 1;
\r
// Slot 0 keeps the bits of `result` selected by CE_PRIMARY_MASK_.
2650 m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
\r
// Slot 1 carries the low 16 bits of `result` moved into the upper half,
// OR-ed with 0xC0 (presumably the continuation marker - confirm).
2652 m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
\r
2653 return m_CEBuffer_[1];
\r
2657 * Gets the previous surrogate ce
\r
2658 * @param ch current character
\r
2659 * @return previous surrogate ce
\r
// Backward iteration for a trail surrogate `ch`: reads the preceding
// character and, when it is a lead surrogate, returns the implicit CE of the
// combined supplementary code point. The visible fall-through returns IGNORABLE.
// NOTE(review): the statements inside the first and last if-branches are
// missing from this listing.
2661 private int previousSurrogate(char ch)
\r
2663 if (isBackwardsStart()) {
\r
2664 // we are at the start of the string, wrong place to be at
\r
2667 char prevch = (char)previousChar();
\r
2668 // Handles Han and Supplementary characters here.
\r
// Valid pair: combine lead and trail, then delegate to the implicit-CE path.
2669 if (UTF16.isLeadSurrogate(prevch)) {
\r
2670 return previousImplicit(
\r
2671 UCharacterProperty.getRawSupplementary(prevch, ch));
\r
// Not a valid pair. The action taken when the read did not hit the
// end-of-text sentinel is not visible here (presumably it undoes the read).
2673 if (prevch != CharacterIterator.DONE) {
\r
2676 return IGNORABLE; // completely ignorable
\r
2680 * <p>Special CE management. Expansions, contractions etc...</p>
\r
2681 * @param collator can be plain UCA
\r
2682 * @param ce current ce
\r
2683 * @param ch current character
\r
2684 * @return previous special ce
\r
// Backward-iteration dispatcher for "special" CEs: switches on the CE's tag
// and either returns the result of a tag-specific helper or replaces `ce` and
// (per the comments below) goes round again until `ce` is no longer special.
// NOTE(review): the loop header and the return/break lines after several
// cases are missing from this listing.
2686 private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
\r
2689 // the only ces that loops are thai, special prefix and
\r
2691 switch (RuleBasedCollator.getTag(ce)) {
\r
2692 case CE_NOT_FOUND_TAG_: // this tag always returns
\r
2694 case RuleBasedCollator.CE_SURROGATE_TAG_:
\r
2695 // essentially a disengaged lead surrogate. a broken
\r
2696 // sequence was encountered and this is an error
\r
2698 case CE_SPEC_PROC_TAG_:
\r
2699 ce = previousSpecialPrefix(collator, ce);
\r
2701 case CE_CONTRACTION_TAG_:
\r
2702 // may loop for first character e.g. "0x0f71" for english
\r
2703 if (isBackwardsStart()) {
\r
2704 // start of string or this is not the end of any contraction
\r
// Take the CE stored at the start of this contraction's table entry.
2705 ce = collator.m_contractionCE_[
\r
2706 getContractionOffset(collator, ce)];
\r
2709 return previousContraction(collator, ce, ch); // else
\r
2710 case CE_LONG_PRIMARY_TAG_:
\r
2711 return previousLongPrimary(ce);
\r
2712 case CE_EXPANSION_TAG_: // always returns
\r
2713 return previousExpansion(collator, ce);
\r
2714 case CE_DIGIT_TAG_:
\r
2715 ce = previousDigit(collator, ce, ch);
\r
2717 case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
\r
2718 return previousHangul(collator, ch);
\r
2719 case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
\r
2720 return IGNORABLE; // broken surrogate sequence
\r
2721 case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
\r
2722 return previousSurrogate(ch);
\r
2723 case CE_CJK_IMPLICIT_TAG_:
\r
2724 // 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
\r
2725 return previousImplicit(ch);
\r
2726 case CE_IMPLICIT_TAG_: // everything that is not defined
\r
2727 // UCA is filled with these. Tailorings are NOT_FOUND
\r
2728 return previousImplicit(ch);
\r
2729 case CE_CHARSET_TAG_: // this tag always returns
\r
2730 return CE_NOT_FOUND_;
\r
2731 default: // this tag always returns
\r
// Test applied after the switch; the action taken for a non-special ce is
// not visible in this listing.
2734 if (!RuleBasedCollator.isSpecial(ce)) {
\r
2742 * GET IMPLICIT PRIMARY WEIGHTS
\r
2743 * @param cp codepoint
\r
2744 * @param value is left justified primary key
\r
2746 // private static final int getImplicitPrimary(int cp)
\r
2748 // cp = swapCJK(cp);
\r
2750 // //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
\r
2751 // // we now have a range of numbers from 0 to 21FFFF.
\r
2752 // // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
\r
2753 // // we must leave a gap of 01 between all values of the last byte, so
\r
2754 // // the last byte has 126 values (3 byte case)
\r
2755 // // we shift so that HAN all has the same first primary, for
\r
2756 // // compression.
\r
2757 // // for the 4 byte case, we make the gap as large as we can fit.
\r
2758 // // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
\r
2759 // // Four byte forms (most supplementaries) are EF xx xx xx (with a gap
\r
2760 // // of LAST2_MULTIPLIER == 14)
\r
2762 // int last0 = cp - RuleBasedCollator.IMPLICIT_4BYTE_BOUNDARY_;
\r
2763 // if (last0 < 0) {
\r
2764 // int last1 = cp / RuleBasedCollator.LAST_COUNT_;
\r
2765 // last0 = cp % RuleBasedCollator.LAST_COUNT_;
\r
2767 // int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
\r
2768 // last1 %= RuleBasedCollator.OTHER_COUNT_;
\r
2769 // return RuleBasedCollator.IMPLICIT_BASE_3BYTE_ + (last2 << 24)
\r
2770 // + (last1 << 16)
\r
2771 // + ((last0 * RuleBasedCollator.LAST_MULTIPLIER_) << 8);
\r
2774 // int last1 = last0 / RuleBasedCollator.LAST_COUNT2_;
\r
2775 // last0 %= RuleBasedCollator.LAST_COUNT2_;
\r
2777 // int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
\r
2778 // last1 %= RuleBasedCollator.OTHER_COUNT_;
\r
2780 // int last3 = last2 / RuleBasedCollator.OTHER_COUNT_;
\r
2781 // last2 %= RuleBasedCollator.OTHER_COUNT_;
\r
2782 // return RuleBasedCollator.IMPLICIT_BASE_4BYTE_ + (last3 << 24)
\r
2783 // + (last2 << 16) + (last1 << 8)
\r
2784 // + (last0 * RuleBasedCollator.LAST2_MULTIPLIER_);
\r
2789 // * Swapping CJK characters for implicit ces
\r
2790 // * @param cp codepoint CJK
\r
2791 // * @return swapped result
\r
2793 // private static final int swapCJK(int cp)
\r
2795 // if (cp >= CJK_BASE_) {
\r
2796 // if (cp < CJK_LIMIT_) {
\r
2797 // return cp - CJK_BASE_;
\r
2799 // if (cp < CJK_COMPAT_USED_BASE_) {
\r
2800 // return cp + NON_CJK_OFFSET_;
\r
2802 // if (cp < CJK_COMPAT_USED_LIMIT_) {
\r
2803 // return cp - CJK_COMPAT_USED_BASE_ + (CJK_LIMIT_ - CJK_BASE_);
\r
2805 // if (cp < CJK_B_BASE_) {
\r
2806 // return cp + NON_CJK_OFFSET_;
\r
2808 // if (cp < CJK_B_LIMIT_) {
\r
2809 // return cp; // non-BMP-CJK
\r
2811 // return cp + NON_CJK_OFFSET_; // non-CJK
\r
2813 // if (cp < CJK_A_BASE_) {
\r
2814 // return cp + NON_CJK_OFFSET_;
\r
2816 // if (cp < CJK_A_LIMIT_) {
\r
2817 // return cp - CJK_A_BASE_ + (CJK_LIMIT_ - CJK_BASE_)
\r
2818 // + (CJK_COMPAT_USED_LIMIT_ - CJK_COMPAT_USED_BASE_);
\r
2820 // return cp + NON_CJK_OFFSET_; // non-CJK
\r
2824 // * Gets a character from the source string at a given offset.
\r
2825 // * Handles both normal and iterative cases.
\r
2826 // * No error checking and does not access the normalization buffer
\r
2827 // * - caller beware!
\r
2828 // * @param offset offset from current position which character is to be
\r
2830 // * @return character at current position + offset
\r
2832 // private char peekCharacter(int offset)
\r
2834 // if (offset != 0) {
\r
2835 // int currentoffset = m_source_.getIndex();
\r
2836 // m_source_.setIndex(currentoffset + offset);
\r
2837 // char result = (char)m_source_.current();
\r
2838 // m_source_.setIndex(currentoffset);
\r
2842 // return (char)m_source_.current();
\r
2847 * Moves back 1 position in the source string. This is slightly less
\r
2848 * complicated than previousChar in that it doesn't normalize while
\r
2849 * moving back. Boundary checks are not performed.
\r
2850 * This method is to be used with caution, with the assumption that
\r
2851 * moving back one position will not exceed the source limits.
\r
2852 * Use only with nextChar() and never call this API twice in a row without
\r
2853 * nextChar() in the middle.
\r
// Steps the read position back by one without normalizing or bounds checks.
// NOTE(review): the brace/else lines are missing from this listing; the
// m_source_ update below is presumably the else branch of the buffer test.
2855 private void goBackOne()
\r
// A non-negative buffer offset means we are reading from the buffer.
2857 if (m_bufferOffset_ >= 0) {
\r
2858 m_bufferOffset_ --;
\r
// Otherwise move the index of the source iterator itself.
2861 m_source_.setIndex(m_source_.getIndex() - 1);
\r
2866 * Moves forward 1 position in the source string. This is slightly less
\r
2867 * complicated than nextChar in that it doesn't normalize while
\r
2868 * moving forward. Boundary checks are not performed.
\r
2869 * This method is to be used with caution, with the assumption that
\r
2870 * moving forward one position will not exceed the source limits.
\r
2871 * Use only with previousChar() and never call this API twice in a row
\r
2872 * without previousChar() in the middle.
\r
2874 private void goForwardOne()
\r
2876 if (m_bufferOffset_ < 0) {
\r
2877 // we're working on the source and not normalizing. fast path.
\r
2878 // note Thai pre-vowel reordering uses buffer too
\r
2879 m_source_.setIndex(m_source_.getIndex() + 1);
\r
2882 // we are in the buffer, buffer offset will never be 0 here
\r
2883 m_bufferOffset_ ++;
\r