2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
8 package com.ibm.icu.impl;
10 import java.util.NoSuchElementException;
12 import com.ibm.icu.lang.UCharacter;
13 import com.ibm.icu.text.UTF16;
14 import com.ibm.icu.util.RangeValueIterator;
17 * <p>Class enabling iteration of the values in a Trie.</p>
18 * <p>Result of each iteration contains the interval of codepoints that have
19 * the same value type and the value type itself.</p>
20 * <p>The comparison of each codepoint value is done via extract(), which the
21 * default implementation is to return the value as it is.</p>
22 * <p>Method extract() can be overwritten to perform manipulations on
23 * codepoint values in order to perform specialized comparison.</p>
24 * <p>TrieIterator is designed to be a generic iterator for the CharTrie
25 * and the IntTrie, hence to accommodate both types of data, the return
26 * result will be in terms of int (32 bit) values.</p>
27 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
28 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
29 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
30 * sense, the caller will have to pass a object with a callback function
31 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
32 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
33 * codepoints with the same value as determined by
34 * UTrieEnumValue(const void *context, uint32_t value). for each range,
35 * utrie_enum calls the callback function to perform a task. In this way,
36 * icu4c performs the iteration within utrie_enum.
37 * To follow the JDK model, icu4j is slightly different from icu4c.
38 * Instead of requesting the caller to implement an object for a callback.
39 * The caller will have to implement a subclass of TrieIterator, fleshing out
40 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
41 * the caller will have to code his own iteration and flesh out the task
42 * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
44 * <p>There are basically 3 usage scenarios for porting:</p>
45 * <p>1) UTrieEnumValue is the only implemented callback then just implement a
46 * subclass of TrieIterator and override the extract(int) method. The
47 * extract(int) method is analogus to UTrieEnumValue callback.
49 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
50 * a subclass of TrieIterator, override the extract method and iterate, e.g
52 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
56 * class TrieIteratorImpl extends TrieIterator{
57 * public TrieIteratorImpl(Trie data){
60 * public int extract(int value){
61 * // port the implementation of _enumPropertyStartsValue here
65 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
66 * while(fcdIter.next(result)) {
67 * // port the implementation of _enumPropertyStartsRange
71 * <p>3) UTrieEnumRange is the only implemented callback then just implement
72 * the while loop, when utrie_enum is called
74 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
75 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
76 * while(fcdIter.next(result)){
77 * set.add(result.start);
81 * @see com.ibm.icu.impl.Trie
82 * @since release 2.1, Jan 17 2002
84 public class TrieIterator implements RangeValueIterator
87 // public constructor ---------------------------------------------
90 * TrieEnumeration constructor
91 * @param trie to be used
92 * @exception IllegalArgumentException throw when argument is null.
94 public TrieIterator(Trie trie)
97 throw new IllegalArgumentException(
98 "Argument trie cannot be null");
101 // synwee: check that extract belongs to the child class
102 m_initialValue_ = extract(m_trie_.getInitialValue());
106 // public methods -------------------------------------------------
109 * <p>Returns true if we are not at the end of the iteration, false
111 * <p>The next set of codepoints with the same value type will be
112 * calculated during this call and returned in the arguement element.</p>
113 * @param element return result
114 * @return true if we are not at the end of the iteration, false otherwise.
115 * @exception NoSuchElementException - if no more elements exist.
116 * @see com.ibm.icu.util.RangeValueIterator.Element
118 public final boolean next(Element element)
120 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
123 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
124 calculateNextBMPElement(element)) {
127 calculateNextSupplementaryElement(element);
132 * Resets the iterator to the beginning of the iteration
134 public final void reset()
136 m_currentCodepoint_ = 0;
137 m_nextCodepoint_ = 0;
139 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
140 if (m_nextBlock_ == m_trie_.m_dataOffset_) {
141 m_nextValue_ = m_initialValue_;
144 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
146 m_nextBlockIndex_ = 0;
147 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
150 // protected methods ----------------------------------------------
153 * Called by next() to extracts a 32 bit value from a trie value
154 * used for comparison.
155 * This method is to be overwritten if special manipulation is to be done
156 * to retrieve a relevant comparison.
157 * The default function is to return the value as it is.
158 * @param value a value from the trie
159 * @return extracted value
161 protected int extract(int value)
166 // private methods ------------------------------------------------
169 * Set the result values
170 * @param element return result object
171 * @param start codepoint of range
172 * @param limit (end + 1) codepoint of range
173 * @param value common value of range
175 private final void setResult(Element element, int start, int limit,
178 element.start = start;
179 element.limit = limit;
180 element.value = value;
184 * Finding the next element.
185 * This method is called just before returning the result of
187 * We always store the next element before it is requested.
188 * In the case that we have to continue calculations into the
189 * supplementary planes, a false will be returned.
190 * @param element return result object
191 * @return true if the next range is found, false if we have to proceed to
192 * the supplementary range.
194 private final boolean calculateNextBMPElement(Element element)
196 int currentValue = m_nextValue_;
197 m_currentCodepoint_ = m_nextCodepoint_;
199 m_nextBlockIndex_ ++;
200 if (!checkBlockDetail(currentValue)) {
201 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
205 // synwee check that next block index == 0 here
206 // enumerate BMP - the main loop enumerates data blocks
207 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
208 // because of the way the character is split to form the index
209 // the lead surrogate and trail surrogate can not be in the
211 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
212 // skip lead surrogate code units,
213 // go to lead surrogate codepoints
214 m_nextIndex_ = BMP_INDEX_LENGTH_;
216 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
217 // go back to regular BMP code points
218 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
223 m_nextBlockIndex_ = 0;
224 if (!checkBlock(currentValue)) {
225 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
230 m_nextCodepoint_ --; // step one back since this value has not been
231 m_nextBlockIndex_ --; // retrieved yet.
236 * Finds the next supplementary element.
237 * For each entry in the trie, the value to be delivered is passed through
239 * We always store the next element before it is requested.
240 * Called after calculateNextBMP() completes its round of BMP characters.
241 * There is a slight difference in the usage of m_currentCodepoint_
242 * here as compared to calculateNextBMP(). Though both represents the
243 * lower bound of the next element, in calculateNextBMP() it gets set
244 * at the start of any loop, where-else, in calculateNextSupplementary()
245 * since m_currentCodepoint_ already contains the lower bound of the
246 * next element (passed down from calculateNextBMP()), we keep it till
247 * the end before resetting it to the new value.
248 * Note, if there are no more iterations, it will never get to here.
249 * Blocked out by next().
250 * @param element return result object
252 private final void calculateNextSupplementaryElement(Element element)
254 int currentValue = m_nextValue_;
256 m_nextBlockIndex_ ++;
258 if (UTF16.getTrailSurrogate(m_nextCodepoint_)
259 != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
260 // this piece is only called when we are in the middle of a lead
262 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
263 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
265 m_currentCodepoint_ = m_nextCodepoint_;
268 // we have cleared one block
270 m_nextTrailIndexOffset_ ++;
271 if (!checkTrailBlock(currentValue)) {
272 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
274 m_currentCodepoint_ = m_nextCodepoint_;
278 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
279 // enumerate supplementary code points
280 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
281 // lead surrogate access
282 final int leadBlock =
283 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
284 Trie.INDEX_STAGE_2_SHIFT_;
285 if (leadBlock == m_trie_.m_dataOffset_) {
286 // no entries for a whole block of lead surrogates
287 if (currentValue != m_initialValue_) {
288 m_nextValue_ = m_initialValue_;
289 m_nextBlock_ = leadBlock; // == m_trie_.m_dataOffset_
290 m_nextBlockIndex_ = 0;
291 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
293 m_currentCodepoint_ = m_nextCodepoint_;
297 nextLead += DATA_BLOCK_LENGTH_;
298 // number of total affected supplementary codepoints in one
300 // this is not a simple addition of
301 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
302 // that we might have moved some of the codepoints
303 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
305 (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
308 if (m_trie_.m_dataManipulate_ == null) {
309 throw new NullPointerException(
310 "The field DataManipulate in this Trie is null");
312 // enumerate trail surrogates for this lead surrogate
313 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
314 m_trie_.getValue(leadBlock +
315 (nextLead & Trie.INDEX_STAGE_3_MASK_)));
316 if (m_nextIndex_ <= 0) {
317 // no data for this lead surrogate
318 if (currentValue != m_initialValue_) {
319 m_nextValue_ = m_initialValue_;
320 m_nextBlock_ = m_trie_.m_dataOffset_;
321 m_nextBlockIndex_ = 0;
322 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
324 m_currentCodepoint_ = m_nextCodepoint_;
327 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
329 m_nextTrailIndexOffset_ = 0;
330 if (!checkTrailBlock(currentValue)) {
331 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
333 m_currentCodepoint_ = m_nextCodepoint_;
340 // deliver last range
341 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
346 * Internal block value calculations
347 * Performs calculations on a data block to find codepoints in m_nextBlock_
348 * after the index m_nextBlockIndex_ that has the same value.
349 * Note m_*_ variables at this point is the next codepoint whose value
350 * has not been calculated.
351 * But when returned with false, it will be the last codepoint whose
352 * value has been calculated.
353 * @param currentValue the value which other codepoints are tested against
354 * @return true if the whole block has the same value as currentValue or if
355 * the whole block has been calculated, false otherwise.
357 private final boolean checkBlockDetail(int currentValue)
359 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
360 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
362 if (m_nextValue_ != currentValue) {
365 ++ m_nextBlockIndex_;
372 * Internal block value calculations
373 * Performs calculations on a data block to find codepoints in m_nextBlock_
374 * that has the same value.
375 * Will call checkBlockDetail() if highlevel check fails.
376 * Note m_*_ variables at this point is the next codepoint whose value
377 * has not been calculated.
378 * @param currentBlock the initial block containing all currentValue
379 * @param currentValue the value which other codepoints are tested against
380 * @return true if the whole block has the same value as currentValue or if
381 * the whole block has been calculated, false otherwise.
383 private final boolean checkBlock(int currentValue)
385 int currentBlock = m_nextBlock_;
386 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
387 Trie.INDEX_STAGE_2_SHIFT_;
388 if (m_nextBlock_ == currentBlock &&
389 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
390 // the block is the same as the previous one, filled with
392 m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
394 else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
395 // this is the all-initial-value block
396 if (currentValue != m_initialValue_) {
397 m_nextValue_ = m_initialValue_;
398 m_nextBlockIndex_ = 0;
401 m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
404 if (!checkBlockDetail(currentValue)) {
412 * Internal block value calculations
413 * Performs calculations on multiple data blocks for a set of trail
414 * surrogates to find codepoints in m_nextBlock_ that has the same value.
415 * Will call checkBlock() for internal block checks.
416 * Note m_*_ variables at this point is the next codepoint whose value
417 * has not been calculated.
418 * @param currentValue the value which other codepoints are tested against
419 * @return true if the whole block has the same value as currentValue or if
420 * the whole block has been calculated, false otherwise.
422 private final boolean checkTrailBlock(int currentValue)
424 // enumerate code points for this lead surrogate
425 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
427 // if we ever reach here, we are at the start of a new block
428 m_nextBlockIndex_ = 0;
429 // copy of most of the body of the BMP loop
430 if (!checkBlock(currentValue)) {
433 m_nextTrailIndexOffset_ ++;
440 * Checks if we are beginning at the start of a initial block.
441 * If we are then the rest of the codepoints in this initial block
442 * has the same values.
443 * We increment m_nextCodepoint_ and relevant data members if so.
444 * This is used only in for the supplementary codepoints because
445 * the offset to the trail indexes could be 0.
446 * @return true if we are at the start of a initial block.
448 private final boolean checkNullNextTrailIndex()
450 if (m_nextIndex_ <= 0) {
451 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
452 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
454 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
455 Trie.INDEX_STAGE_2_SHIFT_;
456 if (m_trie_.m_dataManipulate_ == null) {
457 throw new NullPointerException(
458 "The field DataManipulate in this Trie is null");
460 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
461 m_trie_.getValue(leadBlock +
462 (nextLead & Trie.INDEX_STAGE_3_MASK_)));
464 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
470 // private data members --------------------------------------------
473 * Size of the stage 1 BMP indexes
475 private static final int BMP_INDEX_LENGTH_ =
476 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
478 * Lead surrogate minimum value
480 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
482 * Trail surrogate minimum value
484 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
486 * Trail surrogate maximum value
488 //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
490 * Number of trail surrogate
492 private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
494 * Number of stage 1 indexes for supplementary calculations that maps to
495 * each lead surrogate character.
496 * See second pass into getRawOffset for the trail surrogate character.
497 * 10 for significant number of bits for trail surrogates, 5 for what we
498 * discard during shifting.
500 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
501 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
503 * Number of data values in a stage 2 (data array) block.
505 private static final int DATA_BLOCK_LENGTH_ =
506 1 << Trie.INDEX_STAGE_1_SHIFT_;
508 // * Number of codepoints in a stage 2 block
510 // private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
511 // DATA_BLOCK_LENGTH_ << 10;
515 private Trie m_trie_;
517 * Initial value for trie values
519 private int m_initialValue_;
521 * Next element results and data.
523 private int m_currentCodepoint_;
524 private int m_nextCodepoint_;
525 private int m_nextValue_;
526 private int m_nextIndex_;
527 private int m_nextBlock_;
528 private int m_nextBlockIndex_;
529 private int m_nextTrailIndexOffset_;