2 ******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 ******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
10 import com.ibm.icu.lang.UCharacter;
\r
11 import com.ibm.icu.text.UTF16;
\r
12 import com.ibm.icu.util.RangeValueIterator;
\r
15 * <p>Class enabling iteration of the values in a Trie.</p>
\r
16 * <p>Result of each iteration contains the interval of codepoints that have
\r
17 * the same value type and the value type itself.</p>
\r
18 * <p>The comparison of each codepoint value is done via extract(), which the
\r
19 * default implementation is to return the value as it is.</p>
\r
20 * <p>Method extract() can be overwritten to perform manipulations on
\r
21 * codepoint values in order to perform specialized comparison.</p>
\r
22 * <p>TrieIterator is designed to be a generic iterator for the CharTrie
\r
23 * and the IntTrie, hence to accommodate both types of data, the return
\r
24 * result will be in terms of int (32 bit) values.</p>
\r
25 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
\r
26 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
\r
27 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
\r
28 * sense, the caller will have to pass a object with a callback function
\r
29 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
\r
30 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
\r
31 * codepoints with the same value as determined by
\r
32 * UTrieEnumValue(const void *context, uint32_t value). for each range,
\r
33 * utrie_enum calls the callback function to perform a task. In this way,
\r
34 * icu4c performs the iteration within utrie_enum.
\r
35 * To follow the JDK model, icu4j is slightly different from icu4c.
\r
36 * Instead of requesting the caller to implement an object for a callback.
\r
37 * The caller will have to implement a subclass of TrieIterator, fleshing out
\r
38 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
\r
39 * the caller will have to code his own iteration and flesh out the task
\r
40 * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
\r
42 * <p>There are basically 3 usage scenarios for porting:</p>
\r
43 * <p>1) UTrieEnumValue is the only implemented callback then just implement a
\r
44 * subclass of TrieIterator and override the extract(int) method. The
\r
45 * extract(int) method is analogus to UTrieEnumValue callback.
\r
47 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
\r
48 * a subclass of TrieIterator, override the extract method and iterate, e.g
\r
50 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
\r
54 * class TrieIteratorImpl extends TrieIterator{
\r
55 * public TrieIteratorImpl(Trie data){
\r
58 * public int extract(int value){
\r
59 * // port the implementation of _enumPropertyStartsValue here
\r
63 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
\r
64 * while(fcdIter.next(result)) {
\r
65 * // port the implementation of _enumPropertyStartsRange
\r
69 * <p>3) UTrieEnumRange is the only implemented callback then just implement
\r
70 * the while loop, when utrie_enum is called
\r
72 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
\r
73 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
\r
74 * while(fcdIter.next(result)){
\r
75 * set.add(result.start);
\r
79 * @see com.ibm.icu.impl.Trie
\r
80 * @see com.ibm.icu.lang.UCharacterTypeIterator
\r
81 * @since release 2.1, Jan 17 2002
\r
83 public class TrieIterator implements RangeValueIterator
\r
86 // public constructor ---------------------------------------------
\r
89 * TrieEnumeration constructor
\r
90 * @param trie to be used
\r
91 * @exception IllegalArgumentException throw when argument is null.
\r
93 public TrieIterator(Trie trie)
\r
96 throw new IllegalArgumentException(
\r
97 "Argument trie cannot be null");
\r
100 // synwee: check that extract belongs to the child class
\r
101 m_initialValue_ = extract(m_trie_.getInitialValue());
\r
105 // public methods -------------------------------------------------
\r
108 * <p>Returns true if we are not at the end of the iteration, false
\r
110 * <p>The next set of codepoints with the same value type will be
\r
111 * calculated during this call and returned in the arguement element.</p>
\r
112 * @param element return result
\r
113 * @return true if we are not at the end of the iteration, false otherwise.
\r
114 * @exception NoSuchElementException - if no more elements exist.
\r
115 * @see com.ibm.icu.util.RangeValueIterator.Element
\r
117 public final boolean next(Element element)
\r
119 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
\r
122 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
\r
123 calculateNextBMPElement(element)) {
\r
126 calculateNextSupplementaryElement(element);
\r
131 * Resets the iterator to the beginning of the iteration
\r
133 public final void reset()
\r
135 m_currentCodepoint_ = 0;
\r
136 m_nextCodepoint_ = 0;
\r
138 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
\r
139 if (m_nextBlock_ == 0) {
\r
140 m_nextValue_ = m_initialValue_;
\r
143 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
\r
145 m_nextBlockIndex_ = 0;
\r
146 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
\r
149 // protected methods ----------------------------------------------
\r
152 * Called by next() to extracts a 32 bit value from a trie value
\r
153 * used for comparison.
\r
154 * This method is to be overwritten if special manipulation is to be done
\r
155 * to retrieve a relevant comparison.
\r
156 * The default function is to return the value as it is.
\r
157 * @param value a value from the trie
\r
158 * @return extracted value
\r
160 protected int extract(int value)
\r
165 // private methods ------------------------------------------------
\r
168 * Set the result values
\r
169 * @param element return result object
\r
170 * @param start codepoint of range
\r
171 * @param limit (end + 1) codepoint of range
\r
172 * @param value common value of range
\r
174 private final void setResult(Element element, int start, int limit,
\r
177 element.start = start;
\r
178 element.limit = limit;
\r
179 element.value = value;
\r
183 * Finding the next element.
\r
184 * This method is called just before returning the result of
\r
186 * We always store the next element before it is requested.
\r
187 * In the case that we have to continue calculations into the
\r
188 * supplementary planes, a false will be returned.
\r
189 * @param element return result object
\r
190 * @return true if the next range is found, false if we have to proceed to
\r
191 * the supplementary range.
\r
193 private final boolean calculateNextBMPElement(Element element)
\r
195 int currentBlock = m_nextBlock_;
\r
196 int currentValue = m_nextValue_;
\r
197 m_currentCodepoint_ = m_nextCodepoint_;
\r
198 m_nextCodepoint_ ++;
\r
199 m_nextBlockIndex_ ++;
\r
200 if (!checkBlockDetail(currentValue)) {
\r
201 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
205 // synwee check that next block index == 0 here
\r
206 // enumerate BMP - the main loop enumerates data blocks
\r
207 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
\r
209 // because of the way the character is split to form the index
\r
210 // the lead surrogate and trail surrogate can not be in the
\r
212 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
\r
213 // skip lead surrogate code units,
\r
214 // go to lead surrogate codepoints
\r
215 m_nextIndex_ = BMP_INDEX_LENGTH_;
\r
217 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
\r
218 // go back to regular BMP code points
\r
219 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
\r
222 m_nextBlockIndex_ = 0;
\r
223 if (!checkBlock(currentBlock, currentValue)) {
\r
224 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
229 m_nextCodepoint_ --; // step one back since this value has not been
\r
230 m_nextBlockIndex_ --; // retrieved yet.
\r
235 * Finds the next supplementary element.
\r
236 * For each entry in the trie, the value to be delivered is passed through
\r
238 * We always store the next element before it is requested.
\r
239 * Called after calculateNextBMP() completes its round of BMP characters.
\r
240 * There is a slight difference in the usage of m_currentCodepoint_
\r
241 * here as compared to calculateNextBMP(). Though both represents the
\r
242 * lower bound of the next element, in calculateNextBMP() it gets set
\r
243 * at the start of any loop, where-else, in calculateNextSupplementary()
\r
244 * since m_currentCodepoint_ already contains the lower bound of the
\r
245 * next element (passed down from calculateNextBMP()), we keep it till
\r
246 * the end before resetting it to the new value.
\r
247 * Note, if there are no more iterations, it will never get to here.
\r
248 * Blocked out by next().
\r
249 * @param element return result object
\r
251 private final void calculateNextSupplementaryElement(Element element)
\r
253 int currentValue = m_nextValue_;
\r
254 int currentBlock = m_nextBlock_;
\r
255 m_nextCodepoint_ ++;
\r
256 m_nextBlockIndex_ ++;
\r
258 if (UTF16.getTrailSurrogate(m_nextCodepoint_)
\r
259 != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
\r
260 // this piece is only called when we are in the middle of a lead
\r
262 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
\r
263 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
265 m_currentCodepoint_ = m_nextCodepoint_;
\r
268 // we have cleared one block
\r
270 m_nextTrailIndexOffset_ ++;
\r
271 if (!checkTrailBlock(currentBlock, currentValue)) {
\r
272 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
274 m_currentCodepoint_ = m_nextCodepoint_;
\r
278 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
\r
279 // enumerate supplementary code points
\r
280 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
\r
281 // lead surrogate access
\r
283 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
\r
284 Trie.INDEX_STAGE_2_SHIFT_;
\r
285 if (leadBlock == m_trie_.m_dataOffset_) {
\r
286 // no entries for a whole block of lead surrogates
\r
287 if (currentValue != m_initialValue_) {
\r
288 m_nextValue_ = m_initialValue_;
\r
290 m_nextBlockIndex_ = 0;
\r
291 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
293 m_currentCodepoint_ = m_nextCodepoint_;
\r
297 nextLead += DATA_BLOCK_LENGTH_;
\r
298 // number of total affected supplementary codepoints in one
\r
300 // this is not a simple addition of
\r
301 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
\r
302 // that we might have moved some of the codepoints
\r
303 m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
\r
305 (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
\r
308 if (m_trie_.m_dataManipulate_ == null) {
\r
309 throw new NullPointerException(
\r
310 "The field DataManipulate in this Trie is null");
\r
312 // enumerate trail surrogates for this lead surrogate
\r
313 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
\r
314 m_trie_.getValue(leadBlock +
\r
315 (nextLead & Trie.INDEX_STAGE_3_MASK_)));
\r
316 if (m_nextIndex_ <= 0) {
\r
317 // no data for this lead surrogate
\r
318 if (currentValue != m_initialValue_) {
\r
319 m_nextValue_ = m_initialValue_;
\r
321 m_nextBlockIndex_ = 0;
\r
322 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
324 m_currentCodepoint_ = m_nextCodepoint_;
\r
327 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
\r
329 m_nextTrailIndexOffset_ = 0;
\r
330 if (!checkTrailBlock(currentBlock, currentValue)) {
\r
331 setResult(element, m_currentCodepoint_, m_nextCodepoint_,
\r
333 m_currentCodepoint_ = m_nextCodepoint_;
\r
340 // deliver last range
\r
341 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
\r
346 * Internal block value calculations
\r
347 * Performs calculations on a data block to find codepoints in m_nextBlock_
\r
348 * after the index m_nextBlockIndex_ that has the same value.
\r
349 * Note m_*_ variables at this point is the next codepoint whose value
\r
350 * has not been calculated.
\r
351 * But when returned with false, it will be the last codepoint whose
\r
352 * value has been calculated.
\r
353 * @param currentValue the value which other codepoints are tested against
\r
354 * @return true if the whole block has the same value as currentValue or if
\r
355 * the whole block has been calculated, false otherwise.
\r
357 private final boolean checkBlockDetail(int currentValue)
\r
359 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
\r
360 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
\r
361 m_nextBlockIndex_));
\r
362 if (m_nextValue_ != currentValue) {
\r
365 ++ m_nextBlockIndex_;
\r
366 ++ m_nextCodepoint_;
\r
372 * Internal block value calculations
\r
373 * Performs calculations on a data block to find codepoints in m_nextBlock_
\r
374 * that has the same value.
\r
375 * Will call checkBlockDetail() if highlevel check fails.
\r
376 * Note m_*_ variables at this point is the next codepoint whose value
\r
377 * has not been calculated.
\r
378 * @param currentBlock the initial block containing all currentValue
\r
379 * @param currentValue the value which other codepoints are tested against
\r
380 * @return true if the whole block has the same value as currentValue or if
\r
381 * the whole block has been calculated, false otherwise.
\r
383 private final boolean checkBlock(int currentBlock, int currentValue)
\r
385 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
\r
386 Trie.INDEX_STAGE_2_SHIFT_;
\r
387 if (m_nextBlock_ == currentBlock &&
\r
388 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
\r
389 // the block is the same as the previous one, filled with
\r
391 m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
\r
393 else if (m_nextBlock_ == 0) {
\r
394 // this is the all-initial-value block
\r
395 if (currentValue != m_initialValue_) {
\r
396 m_nextValue_ = m_initialValue_;
\r
397 m_nextBlockIndex_ = 0;
\r
400 m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
\r
403 if (!checkBlockDetail(currentValue)) {
\r
411 * Internal block value calculations
\r
412 * Performs calculations on multiple data blocks for a set of trail
\r
413 * surrogates to find codepoints in m_nextBlock_ that has the same value.
\r
414 * Will call checkBlock() for internal block checks.
\r
415 * Note m_*_ variables at this point is the next codepoint whose value
\r
416 * has not been calculated.
\r
417 * @param currentBlock the initial block containing all currentValue
\r
418 * @param currentValue the value which other codepoints are tested against
\r
419 * @return true if the whole block has the same value as currentValue or if
\r
420 * the whole block has been calculated, false otherwise.
\r
422 private final boolean checkTrailBlock(int currentBlock,
\r
425 // enumerate code points for this lead surrogate
\r
426 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
\r
428 // if we ever reach here, we are at the start of a new block
\r
429 m_nextBlockIndex_ = 0;
\r
430 // copy of most of the body of the BMP loop
\r
431 if (!checkBlock(currentBlock, currentValue)) {
\r
434 m_nextTrailIndexOffset_ ++;
\r
441 * Checks if we are beginning at the start of a initial block.
\r
442 * If we are then the rest of the codepoints in this initial block
\r
443 * has the same values.
\r
444 * We increment m_nextCodepoint_ and relevant data members if so.
\r
445 * This is used only in for the supplementary codepoints because
\r
446 * the offset to the trail indexes could be 0.
\r
447 * @return true if we are at the start of a initial block.
\r
449 private final boolean checkNullNextTrailIndex()
\r
451 if (m_nextIndex_ <= 0) {
\r
452 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
\r
453 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
\r
455 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
\r
456 Trie.INDEX_STAGE_2_SHIFT_;
\r
457 if (m_trie_.m_dataManipulate_ == null) {
\r
458 throw new NullPointerException(
\r
459 "The field DataManipulate in this Trie is null");
\r
461 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
\r
462 m_trie_.getValue(leadBlock +
\r
463 (nextLead & Trie.INDEX_STAGE_3_MASK_)));
\r
465 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
\r
471 // private data members --------------------------------------------
\r
474 * Size of the stage 1 BMP indexes
\r
476 private static final int BMP_INDEX_LENGTH_ =
\r
477 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
\r
479 * Lead surrogate minimum value
\r
481 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
\r
483 * Trail surrogate minimum value
\r
485 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
\r
487 * Trail surrogate maximum value
\r
489 //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
\r
491 * Number of trail surrogate
\r
493 private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
\r
495 * Number of stage 1 indexes for supplementary calculations that maps to
\r
496 * each lead surrogate character.
\r
497 * See second pass into getRawOffset for the trail surrogate character.
\r
498 * 10 for significant number of bits for trail surrogates, 5 for what we
\r
499 * discard during shifting.
\r
501 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
\r
502 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
\r
504 * Number of data values in a stage 2 (data array) block.
\r
506 private static final int DATA_BLOCK_LENGTH_ =
\r
507 1 << Trie.INDEX_STAGE_1_SHIFT_;
\r
509 // * Number of codepoints in a stage 2 block
\r
511 // private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
\r
512 // DATA_BLOCK_LENGTH_ << 10;
\r
516 private Trie m_trie_;
\r
518 * Initial value for trie values
\r
520 private int m_initialValue_;
\r
522 * Next element results and data.
\r
524 private int m_currentCodepoint_;
\r
525 private int m_nextCodepoint_;
\r
526 private int m_nextValue_;
\r
527 private int m_nextIndex_;
\r
528 private int m_nextBlock_;
\r
529 private int m_nextBlockIndex_;
\r
530 private int m_nextTrailIndexOffset_;
\r