2 *******************************************************************************
\r
3 * Copyright (C) 2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
12 * A Trie2Writable is a modifiable, or build-time Trie2.
\r
13 * Functions for reading data from the Trie are all from class Trie2.
\r
16 public class Trie2Writable extends Trie2 {
\r
20 * Create a new, empty, writable Trie2. 32-bit data values are used.
\r
22 * @param initialValueP the initial value that is set for all code points
\r
23 * @param errorValueP the value for out-of-range code points and illegal UTF-8
\r
25 public Trie2Writable(int initialValueP, int errorValueP) {
\r
26 // This constructor corresponds to utrie2_open() in ICU4C.
\r
27 init(initialValueP, errorValueP);
\r
31 private void init(int initialValueP, int errorValueP) {
\r
32 this.initialValue = initialValueP;
\r
33 this.errorValue = errorValueP;
\r
34 this.highStart = 0x110000;
\r
36 this.data = new int[UNEWTRIE2_INITIAL_DATA_LENGTH];
\r
37 this.dataCapacity = UNEWTRIE2_INITIAL_DATA_LENGTH;
\r
38 this.initialValue = initialValueP;
\r
39 this.errorValue = errorValueP;
\r
40 this.highStart = 0x110000;
\r
41 this.firstFreeBlock = 0; /* no free block in the list */
\r
42 this.isCompacted = false;
\r
45 * preallocate and reset
\r
47 * - the bad-UTF-8-data block
\r
48 * - the null data block
\r
51 for(i=0; i<0x80; ++i) {
\r
52 data[i] = initialValue;
\r
54 for(; i<0xc0; ++i) {
\r
55 data[i] = errorValue;
\r
57 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
\r
58 data[i] = initialValue;
\r
60 dataNullOffset = UNEWTRIE2_DATA_NULL_OFFSET;
\r
61 dataLength = UNEWTRIE2_DATA_START_OFFSET;
\r
63 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
\r
64 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
\r
69 /* reference counts for the bad-UTF-8-data block */
\r
70 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
\r
75 * Reference counts for the null data block: all blocks except for the ASCII blocks.
\r
76 * Plus 1 so that we don't drop this block during compaction.
\r
77 * Plus as many as needed for lead surrogate code points.
\r
79 /* i==newTrie->dataNullOffset */
\r
81 (0x110000>>UTRIE2_SHIFT_2) -
\r
82 (0x80>>UTRIE2_SHIFT_2) +
\r
84 UTRIE2_LSCP_INDEX_2_LENGTH;
\r
85 j += UTRIE2_DATA_BLOCK_LENGTH;
\r
86 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
\r
91 * set the remaining indexes in the BMP index-2 block
\r
92 * to the null data block
\r
94 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
\r
95 index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
\r
99 * Fill the index gap with impossible values so that compaction
\r
100 * does not overlap other index-2 blocks with the gap.
\r
102 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
\r
103 index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
\r
106 /* set the indexes in the null index-2 block */
\r
107 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
\r
108 index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
\r
110 index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
\r
111 index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
\r
113 /* set the index-1 indexes for the linear index-2 block */
\r
115 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
\r
116 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
\r
121 /* set the remaining index-1 indexes to the null index-2 block */
\r
122 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
\r
123 index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
\r
127 * Preallocate and reset data for U+0080..U+07ff,
\r
128 * for 2-byte UTF-8 which will be compacted in 64-blocks
\r
129 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
\r
131 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
\r
132 set(i, initialValue);
\r
139 * Create a new build time (modifiable) Trie2 whose contents are the same as the source Trie2.
\r
141 * @param source the source Trie2. Its contents will be copied into the new Trie2.
\r
143 public Trie2Writable(Trie2 source) {
\r
144 init(source.initialValue, source.errorValue);
\r
146 for (Range r: source) {
\r
152 private boolean isInNullBlock(int c, boolean forLSCP) {
\r
155 if(Character.isHighSurrogate((char)c) && forLSCP) {
\r
156 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
\r
157 (c>>UTRIE2_SHIFT_2);
\r
159 i2=index1[c>>UTRIE2_SHIFT_1]+
\r
160 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
\r
163 return (block==dataNullOffset);
\r
166 private int allocIndex2Block() {
\r
167 int newBlock, newTop;
\r
169 newBlock=index2Length;
\r
170 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
\r
171 if(newTop > index2.length) {
\r
172 throw new IllegalStateException("Internal error in Trie2 creation.");
\r
174 * Should never occur.
\r
175 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
\r
176 * or the code writes more values than should be possible.
\r
179 index2Length=newTop;
\r
180 System.arraycopy(index2, index2NullOffset, index2, newBlock, UTRIE2_INDEX_2_BLOCK_LENGTH);
\r
184 private int getIndex2Block(int c, boolean forLSCP) {
\r
187 if(c>=0xd800 && c<0xdc00 && forLSCP) {
\r
188 return UTRIE2_LSCP_INDEX_2_OFFSET;
\r
191 i1=c>>UTRIE2_SHIFT_1;
\r
193 if(i2==index2NullOffset) {
\r
194 i2=allocIndex2Block();
\r
200 private int allocDataBlock(int copyBlock) {
\r
201 int newBlock, newTop;
\r
203 if(firstFreeBlock!=0) {
\r
204 /* get the first free block */
\r
205 newBlock=firstFreeBlock;
\r
206 firstFreeBlock=-map[newBlock>>UTRIE2_SHIFT_2];
\r
208 /* get a new block from the high end */
\r
209 newBlock=dataLength;
\r
210 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
\r
211 if(newTop>dataCapacity) {
\r
212 /* out of memory in the data array */
\r
216 if(dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
\r
217 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
\r
218 } else if(dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
\r
219 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
\r
222 * Should never occur.
\r
223 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
\r
224 * or the code writes more values than should be possible.
\r
226 throw new IllegalStateException("Internal error in Trie2 creation.");
\r
228 newData = new int[capacity];
\r
229 System.arraycopy(data, 0, newData, 0, dataLength);
\r
231 dataCapacity=capacity;
\r
235 System.arraycopy(data, copyBlock, data, newBlock, UTRIE2_DATA_BLOCK_LENGTH);
\r
236 map[newBlock>>UTRIE2_SHIFT_2]=0;
\r
241 /* call when the block's reference counter reaches 0 */
\r
242 private void releaseDataBlock(int block) {
\r
243 /* put this block at the front of the free-block chain */
\r
244 map[block>>UTRIE2_SHIFT_2]=-firstFreeBlock;
\r
245 firstFreeBlock=block;
\r
249 private boolean isWritableBlock(int block) {
\r
250 return (block!=dataNullOffset && 1==map[block>>UTRIE2_SHIFT_2]);
\r
253 private void setIndex2Entry(int i2, int block) {
\r
255 ++map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
\r
256 oldBlock=index2[i2];
\r
257 if(0 == --map[oldBlock>>UTRIE2_SHIFT_2]) {
\r
258 releaseDataBlock(oldBlock);
\r
265 * No error checking for illegal arguments.
\r
269 private int getDataBlock(int c, boolean forLSCP) {
\r
270 int i2, oldBlock, newBlock;
\r
272 i2=getIndex2Block(c, forLSCP);
\r
274 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
\r
275 oldBlock=index2[i2];
\r
276 if(isWritableBlock(oldBlock)) {
\r
280 /* allocate a new data block */
\r
281 newBlock=allocDataBlock(oldBlock);
\r
282 setIndex2Entry(i2, newBlock);
\r
286 * Set a value for a code point.
\r
288 * @param c the code point
\r
289 * @param value the value
\r
291 public Trie2Writable set(int c, int value) {
\r
292 if (c<0 || c>0x10ffff) {
\r
293 throw new IllegalArgumentException("Invalid code point.");
\r
295 set(c, true, value);
\r
300 private Trie2Writable set(int c, boolean forLSCP, int value) {
\r
305 block = getDataBlock(c, forLSCP);
\r
306 data[block + (c&UTRIE2_DATA_MASK)] = value;
\r
312 * Uncompact a compacted Trie2Writable.
\r
313 * This is needed if a the WritableTrie2 was compacted in preparation for creating a read-only
\r
314 * Trie2, and then is subsequently altered.
\r
316 * The structure is a bit awkward - it would be cleaner to leave the original
\r
317 * Trie2 unaltered - but compacting in place was taken directly from the ICU4C code.
\r
319 * The approach is to create a new (uncompacted) Trie2Writable from this one, then transfer
\r
320 * the guts from the new to the old.
\r
322 private void uncompact() {
\r
323 Trie2Writable tempTrie = new Trie2Writable(this);
\r
325 // Members from Trie2Writable
\r
326 this.index1 = tempTrie.index1;
\r
327 this.index2 = tempTrie.index2;
\r
328 this.data = tempTrie.data;
\r
329 this.index2Length = tempTrie.index2Length;
\r
330 this.dataCapacity = tempTrie.dataCapacity;
\r
331 this.isCompacted = tempTrie.isCompacted;
\r
333 // Members From Trie2
\r
334 this.header = tempTrie.header;
\r
335 this.index = tempTrie.index;
\r
336 this.data16 = tempTrie.data16;
\r
337 this.data32 = tempTrie.data32;
\r
338 this.indexLength = tempTrie.indexLength;
\r
339 this.dataLength = tempTrie.dataLength;
\r
340 this.index2NullOffset = tempTrie.index2NullOffset;
\r
341 this.initialValue = tempTrie.initialValue;
\r
342 this.errorValue = tempTrie.errorValue;
\r
343 this.highStart = tempTrie.highStart;
\r
344 this.highValueIndex = tempTrie.highValueIndex;
\r
345 this.dataNullOffset = tempTrie.dataNullOffset;
\r
349 private void writeBlock(int block, int value) {
\r
350 int limit=block+UTRIE2_DATA_BLOCK_LENGTH;
\r
351 while(block<limit) {
\r
352 data[block++]=value;
\r
357 * initialValue is ignored if overwrite=TRUE
\r
360 private void fillBlock(int block, /*UChar32*/ int start, /*UChar32*/ int limit,
\r
361 int value, int initialValue, boolean overwrite) {
\r
363 int pLimit = block+limit;
\r
365 for (i=block+start; i<pLimit; i++) {
\r
369 for (i=block+start; i<pLimit; i++) {
\r
370 if(data[i]==initialValue) {
\r
378 * Set a value in a range of code points [start..end].
\r
379 * All code points c with start<=c<=end will get the value if
\r
380 * overwrite is TRUE or if the old value is the initial value.
\r
382 * @param start the first code point to get the value
\r
383 * @param end the last code point to get the value (inclusive)
\r
384 * @param value the value
\r
385 * @param overwrite flag for whether old non-initial values are to be overwritten
\r
387 public Trie2Writable setRange(int start, int end,
\r
388 int value, boolean overwrite) {
\r
390 * repeat value in [start..end]
\r
391 * mark index values for repeat-data blocks by setting bit 31 of the index values
\r
392 * fill around existing values if any, if(overwrite)
\r
394 int block, rest, repeatBlock;
\r
395 int /*UChar32*/ limit;
\r
397 if(start>0x10ffff || start<0 || end>0x10ffff || end<0 || start>end) {
\r
398 throw new IllegalArgumentException("Invalid code point range.");
\r
400 if(!overwrite && value==initialValue) {
\r
401 return this; /* nothing to do */
\r
409 if((start&UTRIE2_DATA_MASK) != 0) {
\r
410 int /*UChar32*/ nextStart;
\r
412 /* set partial block at [start..following block boundary[ */
\r
413 block=getDataBlock(start, true);
\r
415 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
\r
416 if(nextStart<=limit) {
\r
417 fillBlock(block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
\r
418 value, initialValue, overwrite);
\r
421 fillBlock(block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
\r
422 value, initialValue, overwrite);
\r
427 /* number of positions in the last, partial block */
\r
428 rest=limit&UTRIE2_DATA_MASK;
\r
430 /* round down limit to a block boundary */
\r
431 limit&=~UTRIE2_DATA_MASK;
\r
433 /* iterate over all-value blocks */
\r
434 if(value==initialValue) {
\r
435 repeatBlock=dataNullOffset;
\r
440 while(start<limit) {
\r
442 boolean setRepeatBlock=false;
\r
444 if(value==initialValue && isInNullBlock(start, true)) {
\r
445 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
\r
449 /* get index value */
\r
450 i2=getIndex2Block(start, true);
\r
451 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
\r
453 if(isWritableBlock(block)) {
\r
454 /* already allocated */
\r
455 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
\r
457 * We overwrite all values, and it's not a
\r
458 * protected (ASCII-linear or 2-byte UTF-8) block:
\r
459 * replace with the repeatBlock.
\r
461 setRepeatBlock=true;
\r
463 /* !overwrite, or protected block: just write the values into this block */
\r
465 0, UTRIE2_DATA_BLOCK_LENGTH,
\r
466 value, initialValue, overwrite);
\r
468 } else if(data[block]!=value && (overwrite || block==dataNullOffset)) {
\r
470 * Set the repeatBlock instead of the null block or previous repeat block:
\r
472 * If !isWritableBlock() then all entries in the block have the same value
\r
473 * because it's the null block or a range block (the repeatBlock from a previous
\r
474 * call to utrie2_setRange32()).
\r
475 * No other blocks are used multiple times before compacting.
\r
477 * The null block is the only non-writable block with the initialValue because
\r
478 * of the repeatBlock initialization above. (If value==initialValue, then
\r
479 * the repeatBlock will be the null data block.)
\r
481 * We set our repeatBlock if the desired value differs from the block's value,
\r
482 * and if we overwrite any data or if the data is all initial values
\r
483 * (which is the same as the block being the null block, see above).
\r
485 setRepeatBlock=true;
\r
487 if(setRepeatBlock) {
\r
488 if(repeatBlock>=0) {
\r
489 setIndex2Entry(i2, repeatBlock);
\r
491 /* create and set and fill the repeatBlock */
\r
492 repeatBlock=getDataBlock(start, true);
\r
493 writeBlock(repeatBlock, value);
\r
497 start+=UTRIE2_DATA_BLOCK_LENGTH;
\r
501 /* set partial block at [last block boundary..limit[ */
\r
502 block=getDataBlock(start, true);
\r
503 fillBlock(block, 0, rest, value, initialValue, overwrite);
\r
510 * Set the values from a Trie2.Range.
\r
512 * All code points within the range will get the value if
\r
513 * overwrite is TRUE or if the old value is the initial value.
\r
515 * Ranges with the lead surrogate flag set will set the alternate
\r
516 * lead-surrogate values in the Trie, rather than the code point values.
\r
518 * This function is intended to work with the ranges produced when iterating
\r
519 * the contents of a source Trie.
\r
521 * @param range contains the range of code points and the value to be set.
\r
522 * @param overwrite flag for whether old non-initial values are to be overwritten
\r
524 public Trie2Writable setRange(Trie2.Range range, boolean overwrite) {
\r
526 if (range.leadSurrogate) {
\r
527 for (int c=range.startCodePoint; c<=range.endCodePoint; c++) {
\r
528 if (overwrite || getFromU16SingleLead((char)c) == this.initialValue) {
\r
529 setForLeadSurrogateCodeUnit((char)c, range.value);
\r
533 setRange(range.startCodePoint, range.endCodePoint, range.value, overwrite);
\r
539 * Set a value for a UTF-16 code unit.
\r
540 * Note that a Trie2 stores separate values for
\r
541 * supplementary code points in the lead surrogate range
\r
542 * (accessed via the plain set() and get() interfaces)
\r
543 * and for lead surrogate code units.
\r
545 * The lead surrogate code unit values are set via this function and
\r
546 * read by the function getFromU16SingleLead().
\r
548 * For code units outside of the lead surrogate range, this function
\r
549 * behaves identically to set().
\r
551 * @param codeUnit A UTF-16 code unit.
\r
552 * @param value the value to be stored in the Trie2.
\r
554 public Trie2Writable setForLeadSurrogateCodeUnit(char codeUnit, int value) {
\r
556 set(codeUnit, false, value);
\r
562 * Get the value for a code point as stored in the Trie2.
\r
564 * @param codePoint the code point
\r
565 * @return the value
\r
568 public int get(int codePoint) {
\r
569 if (codePoint<0 || codePoint>0x10ffff) {
\r
572 return get(codePoint, true);
\r
577 private int get(int c, boolean fromLSCP) {
\r
580 if(c>=highStart && (!(c>=0xd800 && c<0xdc00) || fromLSCP)) {
\r
581 return data[dataLength-UTRIE2_DATA_GRANULARITY];
\r
584 if((c>=0xd800 && c<0xdc00) && fromLSCP) {
\r
585 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
\r
586 (c>>UTRIE2_SHIFT_2);
\r
588 i2=index1[c>>UTRIE2_SHIFT_1]+
\r
589 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
\r
592 return data[block+(c&UTRIE2_DATA_MASK)];
\r
596 * Get a trie value for a UTF-16 code unit.
\r
598 * This function returns the same value as get() if the input
\r
599 * character is outside of the lead surrogate range
\r
601 * There are two values stored in a Trie for inputs in the lead
\r
602 * surrogate range. This function returns the alternate value,
\r
603 * while Trie2.get() returns the main value.
\r
605 * @param c the code point or lead surrogate value.
\r
606 * @return the value
\r
609 public int getFromU16SingleLead(char c) {
\r
610 return get(c, false);
\r
613 /* compaction --------------------------------------------------------------- */
\r
615 private boolean equal_int(int[] a, int s, int t, int length) {
\r
616 for (int i=0; i<length; i++) {
\r
617 if (a[s+i] != a[t+i]) {
\r
625 private int findSameIndex2Block(int index2Length, int otherBlock) {
\r
628 /* ensure that we do not even partially get past index2Length */
\r
629 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
\r
631 for(block=0; block<=index2Length; ++block) {
\r
632 if(equal_int(index2, block, otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
\r
640 private int findSameDataBlock(int dataLength, int otherBlock, int blockLength) {
\r
643 /* ensure that we do not even partially get past dataLength */
\r
644 dataLength-=blockLength;
\r
646 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
\r
647 if(equal_int(data, block, otherBlock, blockLength)) {
\r
655 * Find the start of the last range in the trie by enumerating backward.
\r
656 * Indexes for supplementary code points higher than this will be omitted.
\r
658 private int findHighStart(int highValue) {
\r
662 int i1, i2, j, i2Block, prevI2Block, block, prevBlock;
\r
665 /* set variables for previous range */
\r
666 if(highValue==initialValue) {
\r
667 prevI2Block=index2NullOffset;
\r
668 prevBlock=dataNullOffset;
\r
675 /* enumerate index-2 blocks */
\r
676 i1=UNEWTRIE2_INDEX_1_LENGTH;
\r
679 i2Block=index1[--i1];
\r
680 if(i2Block==prevI2Block) {
\r
681 /* the index-2 block is the same as the previous one, and filled with highValue */
\r
682 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
\r
685 prevI2Block=i2Block;
\r
686 if(i2Block==index2NullOffset) {
\r
687 /* this is the null index-2 block */
\r
688 if(highValue!=initialValue) {
\r
691 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
\r
693 /* enumerate data blocks for one index-2 block */
\r
694 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
\r
695 block=index2[i2Block+ --i2];
\r
696 if(block==prevBlock) {
\r
697 /* the block is the same as the previous one, and filled with highValue */
\r
698 c-=UTRIE2_DATA_BLOCK_LENGTH;
\r
702 if(block==dataNullOffset) {
\r
703 /* this is the null data block */
\r
704 if(highValue!=initialValue) {
\r
707 c-=UTRIE2_DATA_BLOCK_LENGTH;
\r
709 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
\r
710 value=data[block+ --j];
\r
711 if(value!=highValue) {
\r
721 /* deliver last range */
\r
726 * Compact a build-time trie.
\r
729 * - removes blocks that are identical with earlier ones
\r
730 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
\r
731 * - moves blocks in steps of the data granularity
\r
732 * - moves and overlaps blocks that overlap with multiple values in the overlap region
\r
735 * - try to move and overlap blocks that are not already adjacent
\r
737 private void compactData() {
\r
738 int start, newStart, movedStart;
\r
739 int blockLength, overlap;
\r
740 int i, mapIndex, blockCount;
\r
742 /* do not compact linear-ASCII data */
\r
743 newStart=UTRIE2_DATA_START_OFFSET;
\r
744 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
\r
749 * Start with a block length of 64 for 2-byte UTF-8,
\r
750 * then switch to UTRIE2_DATA_BLOCK_LENGTH.
\r
753 blockCount=blockLength>>UTRIE2_SHIFT_2;
\r
754 for(start=newStart; start<dataLength;) {
\r
756 * start: index of first entry of current block
\r
757 * newStart: index where the current block is to be moved
\r
758 * (right after current end of already-compacted data)
\r
760 if(start==UNEWTRIE2_DATA_0800_OFFSET) {
\r
761 blockLength=UTRIE2_DATA_BLOCK_LENGTH;
\r
765 /* skip blocks that are not used */
\r
766 if(map[start>>UTRIE2_SHIFT_2]<=0) {
\r
767 /* advance start to the next block */
\r
768 start+=blockLength;
\r
770 /* leave newStart with the previous block! */
\r
774 /* search for an identical block */
\r
775 movedStart=findSameDataBlock(newStart, start, blockLength);
\r
776 if(movedStart >= 0) {
\r
777 /* found an identical block, set the other block's index value for the current block */
\r
778 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
\r
779 map[mapIndex++]=movedStart;
\r
780 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
\r
783 /* advance start to the next block */
\r
784 start+=blockLength;
\r
786 /* leave newStart with the previous block! */
\r
790 /* see if the beginning of this block can be overlapped with the end of the previous block */
\r
791 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
\r
792 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
\r
793 overlap>0 && !equal_int(data, (newStart-overlap), start, overlap);
\r
794 overlap-=UTRIE2_DATA_GRANULARITY) {}
\r
796 if(overlap>0 || newStart<start) {
\r
797 /* some overlap, or just move the whole block */
\r
798 movedStart=newStart-overlap;
\r
799 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
\r
800 map[mapIndex++]=movedStart;
\r
801 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
\r
804 /* move the non-overlapping indexes to their new positions */
\r
806 for(i=blockLength-overlap; i>0; --i) {
\r
807 data[newStart++]=data[start++];
\r
809 } else /* no overlap && newStart==start */ {
\r
810 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
\r
811 map[mapIndex++]=start;
\r
812 start+=UTRIE2_DATA_BLOCK_LENGTH;
\r
818 /* now adjust the index-2 table */
\r
819 for(i=0; i<index2Length; ++i) {
\r
820 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
\r
821 /* Gap indexes are invalid (-1). Skip over the gap. */
\r
822 i+=UNEWTRIE2_INDEX_GAP_LENGTH;
\r
824 index2[i]=map[index2[i]>>UTRIE2_SHIFT_2];
\r
826 dataNullOffset=map[dataNullOffset>>UTRIE2_SHIFT_2];
\r
828 /* ensure dataLength alignment */
\r
829 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
\r
830 data[newStart++]=initialValue;
\r
833 if (UTRIE2_DEBUG) {
\r
834 /* we saved some space */
\r
835 System.out.printf("compacting UTrie2: count of 32-bit data words %d->%d\n",
\r
836 dataLength, newStart);
\r
839 dataLength=newStart;
\r
842 private void compactIndex2() {
\r
843 int i, start, newStart, movedStart, overlap;
\r
845 /* do not compact linear-BMP index-2 blocks */
\r
846 newStart=UTRIE2_INDEX_2_BMP_LENGTH;
\r
847 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
\r
851 /* Reduce the index table gap to what will be needed at runtime. */
\r
852 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((highStart-0x10000)>>UTRIE2_SHIFT_1);
\r
854 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<index2Length;) {
\r
856 * start: index of first entry of current block
\r
857 * newStart: index where the current block is to be moved
\r
858 * (right after current end of already-compacted data)
\r
861 /* search for an identical block */
\r
862 if( (movedStart=findSameIndex2Block(newStart, start))
\r
865 /* found an identical block, set the other block's index value for the current block */
\r
866 map[start>>UTRIE2_SHIFT_1_2]=movedStart;
\r
868 /* advance start to the next block */
\r
869 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
\r
871 /* leave newStart with the previous block! */
\r
875 /* see if the beginning of this block can be overlapped with the end of the previous block */
\r
876 /* look for maximum overlap with the previous, adjacent block */
\r
877 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
\r
878 overlap>0 && !equal_int(index2, newStart-overlap, start, overlap);
\r
881 if(overlap>0 || newStart<start) {
\r
882 /* some overlap, or just move the whole block */
\r
883 map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
\r
885 /* move the non-overlapping indexes to their new positions */
\r
887 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
\r
888 index2[newStart++]=index2[start++];
\r
890 } else /* no overlap && newStart==start */ {
\r
891 map[start>>UTRIE2_SHIFT_1_2]=start;
\r
892 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
\r
897 /* now adjust the index-1 table */
\r
898 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
\r
899 index1[i]=map[index1[i]>>UTRIE2_SHIFT_1_2];
\r
901 index2NullOffset=map[index2NullOffset>>UTRIE2_SHIFT_1_2];
\r
904 * Ensure data table alignment:
\r
905 * Needs to be granularity-aligned for 16-bit trie
\r
906 * (so that dataMove will be down-shiftable),
\r
907 * and 2-aligned for uint32_t data.
\r
909 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
\r
910 /* Arbitrary value: 0x3fffc not possible for real data. */
\r
911 index2[newStart++]=0x0000ffff<<UTRIE2_INDEX_SHIFT;
\r
914 if (UTRIE2_DEBUG) {
\r
915 /* we saved some space */
\r
916 System.out.printf("compacting UTrie2: count of 16-bit index-2 words %d->%d\n",
\r
917 index2Length, newStart);
\r
920 index2Length=newStart;
\r
923 private void compactTrie() {
\r
924 int localHighStart;
\r
928 /* find highStart and round it up */
\r
929 highValue=get(0x10ffff);
\r
930 localHighStart=findHighStart(highValue);
\r
931 localHighStart=(localHighStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
\r
932 if(localHighStart==0x110000) {
\r
933 highValue=errorValue;
\r
937 * Set trie->highStart only after utrie2_get32(trie, highStart).
\r
938 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
\r
940 this.highStart=localHighStart;
\r
942 if (UTRIE2_DEBUG) {
\r
943 System.out.printf("UTrie2: highStart U+%04x highValue 0x%x initialValue 0x%x\n",
\r
944 highStart, highValue, initialValue);
\r
947 if(highStart<0x110000) {
\r
948 /* Blank out [highStart..10ffff] to release associated data blocks. */
\r
949 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
\r
950 setRange(suppHighStart, 0x10ffff, initialValue, true);
\r
954 if(highStart>0x10000) {
\r
957 if (UTRIE2_DEBUG) {
\r
958 System.out.printf("UTrie2: highStart U+%04x count of 16-bit index-2 words %d->%d\n",
\r
959 highStart, index2Length, UTRIE2_INDEX_1_OFFSET);
\r
964 * Store the highValue in the data array and round up the dataLength.
\r
965 * Must be done after compactData() because that assumes that dataLength
\r
966 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
\r
968 data[dataLength++]=highValue;
\r
969 while((dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
\r
970 data[dataLength++]=initialValue;
\r
978 * Produce an optimized, read-only Trie2_16 from this writable Trie.
\r
979 * The data values outside of the range that will fit in a 16 bit
\r
980 * unsigned value will be truncated.
\r
982 public Trie2_16 toTrie2_16() {
\r
983 Trie2_16 frozenTrie = new Trie2_16();
\r
984 freeze(frozenTrie, ValueWidth.BITS_16);
\r
990 * Produce an optimized, read-only Trie2_32 from this writable Trie.
\r
993 public Trie2_32 toTrie2_32() {
\r
994 Trie2_32 frozenTrie = new Trie2_32();
\r
995 freeze(frozenTrie, ValueWidth.BITS_32);
\r
1001 * Maximum length of the runtime index array.
\r
1002 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
\r
1003 * (The actual maximum length is lower,
\r
1004 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
\r
1006 private static final int UTRIE2_MAX_INDEX_LENGTH = 0xffff;
\r
1009 * Maximum length of the runtime data array.
\r
1010 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
\r
1011 * and by uint16_t UTrie2Header.shiftedDataLength.
\r
1013 private static final int UTRIE2_MAX_DATA_LENGTH = 0xffff<<UTRIE2_INDEX_SHIFT;
\r
1015 /* Compact the data and then populate an optimized read-only Trie. */
\r
1016 private void freeze(Trie2 dest, ValueWidth valueBits) {
\r
1018 int allIndexesLength;
\r
1019 int dataMove; /* >0 if the data is moved to the end of the index array */
\r
1022 /* compact if necessary */
\r
1023 if(!isCompacted) {
\r
1027 if(highStart<=0x10000) {
\r
1028 allIndexesLength=UTRIE2_INDEX_1_OFFSET;
\r
1030 allIndexesLength=index2Length;
\r
1032 if(valueBits==ValueWidth.BITS_16) {
\r
1033 dataMove=allIndexesLength;
\r
1038 /* are indexLength and dataLength within limits? */
\r
1039 if( /* for unshifted indexLength */
\r
1040 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
\r
1041 /* for unshifted dataNullOffset */
\r
1042 (dataMove+dataNullOffset)>0xffff ||
\r
1043 /* for unshifted 2-byte UTF-8 index-2 values */
\r
1044 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
\r
1045 /* for shiftedDataLength */
\r
1046 (dataMove+dataLength)>UTRIE2_MAX_DATA_LENGTH) {
\r
1047 throw new UnsupportedOperationException("Trie2 data is too large.");
\r
1050 /* calculate the sizes of, and allocate, the index and data arrays */
\r
1051 int indexLength = allIndexesLength;
\r
1052 if (valueBits==ValueWidth.BITS_16) {
\r
1053 indexLength += dataLength;
\r
1055 dest.data32 = new int[dataLength];
\r
1057 dest.index = new char[indexLength];
\r
1059 dest.indexLength = allIndexesLength;
\r
1060 dest.dataLength = dataLength;
\r
1061 if(highStart<=0x10000) {
\r
1062 dest.index2NullOffset = 0xffff;
\r
1064 dest.index2NullOffset = UTRIE2_INDEX_2_OFFSET + index2NullOffset;
\r
1066 dest.initialValue = initialValue;
\r
1067 dest.errorValue = errorValue;
\r
1068 dest.highStart = highStart;
\r
1069 dest.highValueIndex = dataMove + dataLength - UTRIE2_DATA_GRANULARITY;
\r
1070 dest.dataNullOffset = (dataMove+dataNullOffset);
\r
1072 // Create a header and set the its fields.
\r
1073 // (This is only used in the event that we serialize the Trie, but is
\r
1074 // convenient to do here.)
\r
1075 dest.header = new Trie2.UTrie2Header();
\r
1076 dest.header.signature = 0x54726932; /* "Tri2" */
\r
1077 dest.header.options = valueBits==ValueWidth.BITS_16 ? 0 : 1;
\r
1078 dest.header.indexLength = dest.indexLength;
\r
1079 dest.header.shiftedDataLength = dest.dataLength>>UTRIE2_INDEX_SHIFT;
\r
1080 dest.header.index2NullOffset = dest.index2NullOffset;
\r
1081 dest.header.dataNullOffset = dest.dataNullOffset;
\r
1082 dest.header.shiftedHighStart = dest.highStart>>UTRIE2_SHIFT_1;
\r
1086 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
\r
1088 for(i=0; i<UTRIE2_INDEX_2_BMP_LENGTH; i++) {
\r
1089 dest.index[destIdx++] = (char)((index2[i]+dataMove) >> UTRIE2_INDEX_SHIFT);
\r
1091 if (UTRIE2_DEBUG) {
\r
1092 System.out.println("\n\nIndex2 for BMP limit is " + Integer.toHexString(destIdx));
\r
1095 /* write UTF-8 2-byte index-2 values, not right-shifted */
\r
1096 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
\r
1097 dest.index[destIdx++] = (char)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
\r
1099 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
\r
1100 dest.index[destIdx++]=(char)(dataMove+index2[i<<(6-UTRIE2_SHIFT_2)]);
\r
1102 if (UTRIE2_DEBUG) {
\r
1103 System.out.println("Index2 for UTF-8 2byte values limit is " + Integer.toHexString(destIdx));
\r
1106 if(highStart>0x10000) {
\r
1107 int index1Length = (highStart-0x10000)>>UTRIE2_SHIFT_1;
\r
1108 int index2Offset = UTRIE2_INDEX_2_BMP_LENGTH + UTRIE2_UTF8_2B_INDEX_2_LENGTH + index1Length;
\r
1110 /* write 16-bit index-1 values for supplementary code points */
\r
1111 //p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
\r
1112 for(i=0; i<index1Length; i++) {
\r
1113 //*dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
\r
1114 dest.index[destIdx++] = (char)(UTRIE2_INDEX_2_OFFSET + index1[i+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH]);
\r
1116 if (UTRIE2_DEBUG) {
\r
1117 System.out.println("Index 1 for supplementals, limit is " + Integer.toHexString(destIdx));
\r
1121 * write the index-2 array values for supplementary code points,
\r
1122 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
\r
1124 for(i=0; i<index2Length-index2Offset; i++) {
\r
1125 dest.index[destIdx++] = (char)((dataMove + index2[index2Offset+i])>>UTRIE2_INDEX_SHIFT);
\r
1127 if (UTRIE2_DEBUG) {
\r
1128 System.out.println("Index 2 for supplementals, limit is " + Integer.toHexString(destIdx));
\r
1132 /* write the 16/32-bit data array */
\r
1133 switch(valueBits) {
\r
1135 /* write 16-bit data values */
\r
1136 assert(destIdx == dataMove);
\r
1137 dest.data16 = destIdx;
\r
1138 for(i=0; i<dataLength; i++) {
\r
1139 dest.index[destIdx++] = (char)data[i];
\r
1143 /* write 32-bit data values */
\r
1144 for (i=0; i<dataLength; i++) {
\r
1145 dest.data32[i] = this.data[i];
\r
1149 // The writable, but compressed, Trie2 stays around unless the caller drops its references to it.
\r
1153 /* Start with allocation of 16k data entries. */
\r
1154 private static final int UNEWTRIE2_INITIAL_DATA_LENGTH = 1<<14;
\r
1156 /* Grow about 8x each time. */
\r
1157 private static final int UNEWTRIE2_MEDIUM_DATA_LENGTH = 1<<17;
\r
1159 /** The null index-2 block, following the gap in the index-2 table. */
\r
1160 private static final int UNEWTRIE2_INDEX_2_NULL_OFFSET = UNEWTRIE2_INDEX_GAP_OFFSET + UNEWTRIE2_INDEX_GAP_LENGTH;
\r
1162 /** The start of allocated index-2 blocks. */
\r
1163 private static final int UNEWTRIE2_INDEX_2_START_OFFSET = UNEWTRIE2_INDEX_2_NULL_OFFSET + UTRIE2_INDEX_2_BLOCK_LENGTH;
\r
1166 * The null data block.
\r
1167 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
\r
1168 * to work with 6-bit trail bytes from 2-byte UTF-8.
\r
1170 private static final int UNEWTRIE2_DATA_NULL_OFFSET = UTRIE2_DATA_START_OFFSET;
\r
1172 /** The start of allocated data blocks. */
\r
1173 private static final int UNEWTRIE2_DATA_START_OFFSET = UNEWTRIE2_DATA_NULL_OFFSET+0x40;
\r
1176 * The start of data blocks for U+0800 and above.
\r
1177 * Below, compaction uses a block length of 64 for 2-byte UTF-8.
\r
1178 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
\r
1179 * Data values for 0x780 code points beyond ASCII.
\r
1181 private static final int UNEWTRIE2_DATA_0800_OFFSET = UNEWTRIE2_DATA_START_OFFSET+0x780;
\r
1184 // Private data members. From struct UNewTrie2 in ICU4C
\r
1186 private int[] index1 = new int[UNEWTRIE2_INDEX_1_LENGTH];
\r
1187 private int[] index2 = new int[UNEWTRIE2_MAX_INDEX_2_LENGTH];
\r
1188 private int[] data;
\r
1190 private int index2Length;
\r
1191 private int dataCapacity;
\r
1192 private int firstFreeBlock;
\r
1193 private int index2NullOffset;
\r
1194 private boolean isCompacted;
\r
1198 * Multi-purpose per-data-block table.
\r
1200 * Before compacting:
\r
1202 * Per-data-block reference counters/free-block list.
\r
1204 * >0: reference counter (number of index-2 entries pointing here)
\r
1205 * <0: next free data block in free-block list
\r
1207 * While compacting:
\r
1209 * Map of adjusted indexes, used in compactData() and compactIndex2().
\r
1210 * Maps from original indexes to new ones.
\r
1212 private int[] map = new int[UNEWTRIE2_MAX_DATA_LENGTH>>UTRIE2_SHIFT_2];
\r
1215 private boolean UTRIE2_DEBUG = false;
\r