2 *******************************************************************************
3 * Copyright (C) 2009, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.impl;
12 * A Trie2Writable is a modifiable, or build-time Trie2.
13 * Functions for reading data from the Trie are all from class Trie2.
16 public class Trie2Writable extends Trie2 {
20 * Create a new, empty, writable Trie2. 32-bit data values are used.
22 * @param initialValueP the initial value that is set for all code points
23 * @param errorValueP the value for out-of-range code points and illegal UTF-8
25 public Trie2Writable(int initialValueP, int errorValueP) {
26 // This constructor corresponds to utrie2_open() in ICU4C.
27 init(initialValueP, errorValueP);
31 private void init(int initialValueP, int errorValueP) {
32 this.initialValue = initialValueP;
33 this.errorValue = errorValueP;
34 this.highStart = 0x110000;
36 this.data = new int[UNEWTRIE2_INITIAL_DATA_LENGTH];
37 this.dataCapacity = UNEWTRIE2_INITIAL_DATA_LENGTH;
38 this.initialValue = initialValueP;
39 this.errorValue = errorValueP;
40 this.highStart = 0x110000;
41 this.firstFreeBlock = 0; /* no free block in the list */
42 this.isCompacted = false;
45 * preallocate and reset
47 * - the bad-UTF-8-data block
48 * - the null data block
51 for(i=0; i<0x80; ++i) {
52 data[i] = initialValue;
57 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
58 data[i] = initialValue;
60 dataNullOffset = UNEWTRIE2_DATA_NULL_OFFSET;
61 dataLength = UNEWTRIE2_DATA_START_OFFSET;
63 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
64 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
69 /* reference counts for the bad-UTF-8-data block */
70 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
75 * Reference counts for the null data block: all blocks except for the ASCII blocks.
76 * Plus 1 so that we don't drop this block during compaction.
77 * Plus as many as needed for lead surrogate code points.
79 /* i==newTrie->dataNullOffset */
81 (0x110000>>UTRIE2_SHIFT_2) -
82 (0x80>>UTRIE2_SHIFT_2) +
84 UTRIE2_LSCP_INDEX_2_LENGTH;
85 j += UTRIE2_DATA_BLOCK_LENGTH;
86 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
91 * set the remaining indexes in the BMP index-2 block
92 * to the null data block
94 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
95 index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
99 * Fill the index gap with impossible values so that compaction
100 * does not overlap other index-2 blocks with the gap.
102 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
103 index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
106 /* set the indexes in the null index-2 block */
107 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
108 index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
110 index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
111 index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
113 /* set the index-1 indexes for the linear index-2 block */
115 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
116 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
121 /* set the remaining index-1 indexes to the null index-2 block */
122 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
123 index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
127 * Preallocate and reset data for U+0080..U+07ff,
128 * for 2-byte UTF-8 which will be compacted in 64-blocks
129 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
131 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
132 set(i, initialValue);
139 * Create a new build time (modifiable) Trie2 whose contents are the same as the source Trie2.
141 * @param source the source Trie2. Its contents will be copied into the new Trie2.
143 public Trie2Writable(Trie2 source) {
144 init(source.initialValue, source.errorValue);
146 for (Range r: source) {
152 private boolean isInNullBlock(int c, boolean forLSCP) {
155 if(Character.isHighSurrogate((char)c) && forLSCP) {
156 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
159 i2=index1[c>>UTRIE2_SHIFT_1]+
160 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
163 return (block==dataNullOffset);
166 private int allocIndex2Block() {
167 int newBlock, newTop;
169 newBlock=index2Length;
170 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
171 if(newTop > index2.length) {
172 throw new IllegalStateException("Internal error in Trie2 creation.");
174 * Should never occur.
175 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
176 * or the code writes more values than should be possible.
180 System.arraycopy(index2, index2NullOffset, index2, newBlock, UTRIE2_INDEX_2_BLOCK_LENGTH);
184 private int getIndex2Block(int c, boolean forLSCP) {
187 if(c>=0xd800 && c<0xdc00 && forLSCP) {
188 return UTRIE2_LSCP_INDEX_2_OFFSET;
191 i1=c>>UTRIE2_SHIFT_1;
193 if(i2==index2NullOffset) {
194 i2=allocIndex2Block();
200 private int allocDataBlock(int copyBlock) {
201 int newBlock, newTop;
203 if(firstFreeBlock!=0) {
204 /* get the first free block */
205 newBlock=firstFreeBlock;
206 firstFreeBlock=-map[newBlock>>UTRIE2_SHIFT_2];
208 /* get a new block from the high end */
210 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
211 if(newTop>dataCapacity) {
212 /* out of memory in the data array */
216 if(dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
217 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
218 } else if(dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
219 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
222 * Should never occur.
223 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
224 * or the code writes more values than should be possible.
226 throw new IllegalStateException("Internal error in Trie2 creation.");
228 newData = new int[capacity];
229 System.arraycopy(data, 0, newData, 0, dataLength);
231 dataCapacity=capacity;
235 System.arraycopy(data, copyBlock, data, newBlock, UTRIE2_DATA_BLOCK_LENGTH);
236 map[newBlock>>UTRIE2_SHIFT_2]=0;
241 /* call when the block's reference counter reaches 0 */
242 private void releaseDataBlock(int block) {
243 /* put this block at the front of the free-block chain */
244 map[block>>UTRIE2_SHIFT_2]=-firstFreeBlock;
245 firstFreeBlock=block;
249 private boolean isWritableBlock(int block) {
250 return (block!=dataNullOffset && 1==map[block>>UTRIE2_SHIFT_2]);
253 private void setIndex2Entry(int i2, int block) {
255 ++map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
257 if(0 == --map[oldBlock>>UTRIE2_SHIFT_2]) {
258 releaseDataBlock(oldBlock);
265 * No error checking for illegal arguments.
269 private int getDataBlock(int c, boolean forLSCP) {
270 int i2, oldBlock, newBlock;
272 i2=getIndex2Block(c, forLSCP);
274 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
276 if(isWritableBlock(oldBlock)) {
280 /* allocate a new data block */
281 newBlock=allocDataBlock(oldBlock);
282 setIndex2Entry(i2, newBlock);
286 * Set a value for a code point.
288 * @param c the code point
289 * @param value the value
291 public Trie2Writable set(int c, int value) {
292 if (c<0 || c>0x10ffff) {
293 throw new IllegalArgumentException("Invalid code point.");
300 private Trie2Writable set(int c, boolean forLSCP, int value) {
305 block = getDataBlock(c, forLSCP);
306 data[block + (c&UTRIE2_DATA_MASK)] = value;
312 * Uncompact a compacted Trie2Writable.
313 * This is needed if a the WritableTrie2 was compacted in preparation for creating a read-only
314 * Trie2, and then is subsequently altered.
316 * The structure is a bit awkward - it would be cleaner to leave the original
317 * Trie2 unaltered - but compacting in place was taken directly from the ICU4C code.
319 * The approach is to create a new (uncompacted) Trie2Writable from this one, then transfer
320 * the guts from the new to the old.
322 private void uncompact() {
323 Trie2Writable tempTrie = new Trie2Writable(this);
325 // Members from Trie2Writable
326 this.index1 = tempTrie.index1;
327 this.index2 = tempTrie.index2;
328 this.data = tempTrie.data;
329 this.index2Length = tempTrie.index2Length;
330 this.dataCapacity = tempTrie.dataCapacity;
331 this.isCompacted = tempTrie.isCompacted;
333 // Members From Trie2
334 this.header = tempTrie.header;
335 this.index = tempTrie.index;
336 this.data16 = tempTrie.data16;
337 this.data32 = tempTrie.data32;
338 this.indexLength = tempTrie.indexLength;
339 this.dataLength = tempTrie.dataLength;
340 this.index2NullOffset = tempTrie.index2NullOffset;
341 this.initialValue = tempTrie.initialValue;
342 this.errorValue = tempTrie.errorValue;
343 this.highStart = tempTrie.highStart;
344 this.highValueIndex = tempTrie.highValueIndex;
345 this.dataNullOffset = tempTrie.dataNullOffset;
349 private void writeBlock(int block, int value) {
350 int limit=block+UTRIE2_DATA_BLOCK_LENGTH;
357 * initialValue is ignored if overwrite=TRUE
360 private void fillBlock(int block, /*UChar32*/ int start, /*UChar32*/ int limit,
361 int value, int initialValue, boolean overwrite) {
363 int pLimit = block+limit;
365 for (i=block+start; i<pLimit; i++) {
369 for (i=block+start; i<pLimit; i++) {
370 if(data[i]==initialValue) {
378 * Set a value in a range of code points [start..end].
379 * All code points c with start<=c<=end will get the value if
380 * overwrite is TRUE or if the old value is the initial value.
382 * @param start the first code point to get the value
383 * @param end the last code point to get the value (inclusive)
384 * @param value the value
385 * @param overwrite flag for whether old non-initial values are to be overwritten
387 public Trie2Writable setRange(int start, int end,
388 int value, boolean overwrite) {
390 * repeat value in [start..end]
391 * mark index values for repeat-data blocks by setting bit 31 of the index values
392 * fill around existing values if any, if(overwrite)
394 int block, rest, repeatBlock;
395 int /*UChar32*/ limit;
397 if(start>0x10ffff || start<0 || end>0x10ffff || end<0 || start>end) {
398 throw new IllegalArgumentException("Invalid code point range.");
400 if(!overwrite && value==initialValue) {
401 return this; /* nothing to do */
409 if((start&UTRIE2_DATA_MASK) != 0) {
410 int /*UChar32*/ nextStart;
412 /* set partial block at [start..following block boundary[ */
413 block=getDataBlock(start, true);
415 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
416 if(nextStart<=limit) {
417 fillBlock(block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
418 value, initialValue, overwrite);
421 fillBlock(block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
422 value, initialValue, overwrite);
427 /* number of positions in the last, partial block */
428 rest=limit&UTRIE2_DATA_MASK;
430 /* round down limit to a block boundary */
431 limit&=~UTRIE2_DATA_MASK;
433 /* iterate over all-value blocks */
434 if(value==initialValue) {
435 repeatBlock=dataNullOffset;
442 boolean setRepeatBlock=false;
444 if(value==initialValue && isInNullBlock(start, true)) {
445 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
449 /* get index value */
450 i2=getIndex2Block(start, true);
451 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
453 if(isWritableBlock(block)) {
454 /* already allocated */
455 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
457 * We overwrite all values, and it's not a
458 * protected (ASCII-linear or 2-byte UTF-8) block:
459 * replace with the repeatBlock.
463 /* !overwrite, or protected block: just write the values into this block */
465 0, UTRIE2_DATA_BLOCK_LENGTH,
466 value, initialValue, overwrite);
468 } else if(data[block]!=value && (overwrite || block==dataNullOffset)) {
470 * Set the repeatBlock instead of the null block or previous repeat block:
472 * If !isWritableBlock() then all entries in the block have the same value
473 * because it's the null block or a range block (the repeatBlock from a previous
474 * call to utrie2_setRange32()).
475 * No other blocks are used multiple times before compacting.
477 * The null block is the only non-writable block with the initialValue because
478 * of the repeatBlock initialization above. (If value==initialValue, then
479 * the repeatBlock will be the null data block.)
481 * We set our repeatBlock if the desired value differs from the block's value,
482 * and if we overwrite any data or if the data is all initial values
483 * (which is the same as the block being the null block, see above).
489 setIndex2Entry(i2, repeatBlock);
491 /* create and set and fill the repeatBlock */
492 repeatBlock=getDataBlock(start, true);
493 writeBlock(repeatBlock, value);
497 start+=UTRIE2_DATA_BLOCK_LENGTH;
501 /* set partial block at [last block boundary..limit[ */
502 block=getDataBlock(start, true);
503 fillBlock(block, 0, rest, value, initialValue, overwrite);
510 * Set the values from a Trie2.Range.
512 * All code points within the range will get the value if
513 * overwrite is TRUE or if the old value is the initial value.
515 * Ranges with the lead surrogate flag set will set the alternate
516 * lead-surrogate values in the Trie, rather than the code point values.
518 * This function is intended to work with the ranges produced when iterating
519 * the contents of a source Trie.
521 * @param range contains the range of code points and the value to be set.
522 * @param overwrite flag for whether old non-initial values are to be overwritten
524 public Trie2Writable setRange(Trie2.Range range, boolean overwrite) {
526 if (range.leadSurrogate) {
527 for (int c=range.startCodePoint; c<=range.endCodePoint; c++) {
528 if (overwrite || getFromU16SingleLead((char)c) == this.initialValue) {
529 setForLeadSurrogateCodeUnit((char)c, range.value);
533 setRange(range.startCodePoint, range.endCodePoint, range.value, overwrite);
539 * Set a value for a UTF-16 code unit.
540 * Note that a Trie2 stores separate values for
541 * supplementary code points in the lead surrogate range
542 * (accessed via the plain set() and get() interfaces)
543 * and for lead surrogate code units.
545 * The lead surrogate code unit values are set via this function and
546 * read by the function getFromU16SingleLead().
548 * For code units outside of the lead surrogate range, this function
549 * behaves identically to set().
551 * @param codeUnit A UTF-16 code unit.
552 * @param value the value to be stored in the Trie2.
554 public Trie2Writable setForLeadSurrogateCodeUnit(char codeUnit, int value) {
556 set(codeUnit, false, value);
562 * Get the value for a code point as stored in the Trie2.
564 * @param codePoint the code point
568 public int get(int codePoint) {
569 if (codePoint<0 || codePoint>0x10ffff) {
572 return get(codePoint, true);
577 private int get(int c, boolean fromLSCP) {
580 if(c>=highStart && (!(c>=0xd800 && c<0xdc00) || fromLSCP)) {
581 return data[dataLength-UTRIE2_DATA_GRANULARITY];
584 if((c>=0xd800 && c<0xdc00) && fromLSCP) {
585 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
588 i2=index1[c>>UTRIE2_SHIFT_1]+
589 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
592 return data[block+(c&UTRIE2_DATA_MASK)];
596 * Get a trie value for a UTF-16 code unit.
598 * This function returns the same value as get() if the input
599 * character is outside of the lead surrogate range
601 * There are two values stored in a Trie for inputs in the lead
602 * surrogate range. This function returns the alternate value,
603 * while Trie2.get() returns the main value.
605 * @param c the code point or lead surrogate value.
609 public int getFromU16SingleLead(char c) {
610 return get(c, false);
613 /* compaction --------------------------------------------------------------- */
615 private boolean equal_int(int[] a, int s, int t, int length) {
616 for (int i=0; i<length; i++) {
617 if (a[s+i] != a[t+i]) {
625 private int findSameIndex2Block(int index2Length, int otherBlock) {
628 /* ensure that we do not even partially get past index2Length */
629 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
631 for(block=0; block<=index2Length; ++block) {
632 if(equal_int(index2, block, otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
640 private int findSameDataBlock(int dataLength, int otherBlock, int blockLength) {
643 /* ensure that we do not even partially get past dataLength */
644 dataLength-=blockLength;
646 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
647 if(equal_int(data, block, otherBlock, blockLength)) {
655 * Find the start of the last range in the trie by enumerating backward.
656 * Indexes for supplementary code points higher than this will be omitted.
658 private int findHighStart(int highValue) {
662 int i1, i2, j, i2Block, prevI2Block, block, prevBlock;
665 /* set variables for previous range */
666 if(highValue==initialValue) {
667 prevI2Block=index2NullOffset;
668 prevBlock=dataNullOffset;
675 /* enumerate index-2 blocks */
676 i1=UNEWTRIE2_INDEX_1_LENGTH;
679 i2Block=index1[--i1];
680 if(i2Block==prevI2Block) {
681 /* the index-2 block is the same as the previous one, and filled with highValue */
682 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
686 if(i2Block==index2NullOffset) {
687 /* this is the null index-2 block */
688 if(highValue!=initialValue) {
691 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
693 /* enumerate data blocks for one index-2 block */
694 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
695 block=index2[i2Block+ --i2];
696 if(block==prevBlock) {
697 /* the block is the same as the previous one, and filled with highValue */
698 c-=UTRIE2_DATA_BLOCK_LENGTH;
702 if(block==dataNullOffset) {
703 /* this is the null data block */
704 if(highValue!=initialValue) {
707 c-=UTRIE2_DATA_BLOCK_LENGTH;
709 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
710 value=data[block+ --j];
711 if(value!=highValue) {
721 /* deliver last range */
726 * Compact a build-time trie.
729 * - removes blocks that are identical with earlier ones
730 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
731 * - moves blocks in steps of the data granularity
732 * - moves and overlaps blocks that overlap with multiple values in the overlap region
735 * - try to move and overlap blocks that are not already adjacent
737 private void compactData() {
738 int start, newStart, movedStart;
739 int blockLength, overlap;
740 int i, mapIndex, blockCount;
742 /* do not compact linear-ASCII data */
743 newStart=UTRIE2_DATA_START_OFFSET;
744 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
749 * Start with a block length of 64 for 2-byte UTF-8,
750 * then switch to UTRIE2_DATA_BLOCK_LENGTH.
753 blockCount=blockLength>>UTRIE2_SHIFT_2;
754 for(start=newStart; start<dataLength;) {
756 * start: index of first entry of current block
757 * newStart: index where the current block is to be moved
758 * (right after current end of already-compacted data)
760 if(start==UNEWTRIE2_DATA_0800_OFFSET) {
761 blockLength=UTRIE2_DATA_BLOCK_LENGTH;
765 /* skip blocks that are not used */
766 if(map[start>>UTRIE2_SHIFT_2]<=0) {
767 /* advance start to the next block */
770 /* leave newStart with the previous block! */
774 /* search for an identical block */
775 movedStart=findSameDataBlock(newStart, start, blockLength);
776 if(movedStart >= 0) {
777 /* found an identical block, set the other block's index value for the current block */
778 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
779 map[mapIndex++]=movedStart;
780 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
783 /* advance start to the next block */
786 /* leave newStart with the previous block! */
790 /* see if the beginning of this block can be overlapped with the end of the previous block */
791 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
792 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
793 overlap>0 && !equal_int(data, (newStart-overlap), start, overlap);
794 overlap-=UTRIE2_DATA_GRANULARITY) {}
796 if(overlap>0 || newStart<start) {
797 /* some overlap, or just move the whole block */
798 movedStart=newStart-overlap;
799 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
800 map[mapIndex++]=movedStart;
801 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
804 /* move the non-overlapping indexes to their new positions */
806 for(i=blockLength-overlap; i>0; --i) {
807 data[newStart++]=data[start++];
809 } else /* no overlap && newStart==start */ {
810 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
811 map[mapIndex++]=start;
812 start+=UTRIE2_DATA_BLOCK_LENGTH;
818 /* now adjust the index-2 table */
819 for(i=0; i<index2Length; ++i) {
820 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
821 /* Gap indexes are invalid (-1). Skip over the gap. */
822 i+=UNEWTRIE2_INDEX_GAP_LENGTH;
824 index2[i]=map[index2[i]>>UTRIE2_SHIFT_2];
826 dataNullOffset=map[dataNullOffset>>UTRIE2_SHIFT_2];
828 /* ensure dataLength alignment */
829 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
830 data[newStart++]=initialValue;
834 /* we saved some space */
835 System.out.printf("compacting UTrie2: count of 32-bit data words %d->%d\n",
836 dataLength, newStart);
842 private void compactIndex2() {
843 int i, start, newStart, movedStart, overlap;
845 /* do not compact linear-BMP index-2 blocks */
846 newStart=UTRIE2_INDEX_2_BMP_LENGTH;
847 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
851 /* Reduce the index table gap to what will be needed at runtime. */
852 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((highStart-0x10000)>>UTRIE2_SHIFT_1);
854 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<index2Length;) {
856 * start: index of first entry of current block
857 * newStart: index where the current block is to be moved
858 * (right after current end of already-compacted data)
861 /* search for an identical block */
862 if( (movedStart=findSameIndex2Block(newStart, start))
865 /* found an identical block, set the other block's index value for the current block */
866 map[start>>UTRIE2_SHIFT_1_2]=movedStart;
868 /* advance start to the next block */
869 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
871 /* leave newStart with the previous block! */
875 /* see if the beginning of this block can be overlapped with the end of the previous block */
876 /* look for maximum overlap with the previous, adjacent block */
877 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
878 overlap>0 && !equal_int(index2, newStart-overlap, start, overlap);
881 if(overlap>0 || newStart<start) {
882 /* some overlap, or just move the whole block */
883 map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
885 /* move the non-overlapping indexes to their new positions */
887 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
888 index2[newStart++]=index2[start++];
890 } else /* no overlap && newStart==start */ {
891 map[start>>UTRIE2_SHIFT_1_2]=start;
892 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
897 /* now adjust the index-1 table */
898 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
899 index1[i]=map[index1[i]>>UTRIE2_SHIFT_1_2];
901 index2NullOffset=map[index2NullOffset>>UTRIE2_SHIFT_1_2];
904 * Ensure data table alignment:
905 * Needs to be granularity-aligned for 16-bit trie
906 * (so that dataMove will be down-shiftable),
907 * and 2-aligned for uint32_t data.
909 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
910 /* Arbitrary value: 0x3fffc not possible for real data. */
911 index2[newStart++]=0x0000ffff<<UTRIE2_INDEX_SHIFT;
915 /* we saved some space */
916 System.out.printf("compacting UTrie2: count of 16-bit index-2 words %d->%d\n",
917 index2Length, newStart);
920 index2Length=newStart;
923 private void compactTrie() {
928 /* find highStart and round it up */
929 highValue=get(0x10ffff);
930 localHighStart=findHighStart(highValue);
931 localHighStart=(localHighStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
932 if(localHighStart==0x110000) {
933 highValue=errorValue;
937 * Set trie->highStart only after utrie2_get32(trie, highStart).
938 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
940 this.highStart=localHighStart;
943 System.out.printf("UTrie2: highStart U+%04x highValue 0x%x initialValue 0x%x\n",
944 highStart, highValue, initialValue);
947 if(highStart<0x110000) {
948 /* Blank out [highStart..10ffff] to release associated data blocks. */
949 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
950 setRange(suppHighStart, 0x10ffff, initialValue, true);
954 if(highStart>0x10000) {
958 System.out.printf("UTrie2: highStart U+%04x count of 16-bit index-2 words %d->%d\n",
959 highStart, index2Length, UTRIE2_INDEX_1_OFFSET);
964 * Store the highValue in the data array and round up the dataLength.
965 * Must be done after compactData() because that assumes that dataLength
966 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
968 data[dataLength++]=highValue;
969 while((dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
970 data[dataLength++]=initialValue;
978 * Produce an optimized, read-only Trie2_16 from this writable Trie.
979 * The data values outside of the range that will fit in a 16 bit
980 * unsigned value will be truncated.
982 public Trie2_16 toTrie2_16() {
983 Trie2_16 frozenTrie = new Trie2_16();
984 freeze(frozenTrie, ValueWidth.BITS_16);
990 * Produce an optimized, read-only Trie2_32 from this writable Trie.
993 public Trie2_32 toTrie2_32() {
994 Trie2_32 frozenTrie = new Trie2_32();
995 freeze(frozenTrie, ValueWidth.BITS_32);
1001 * Maximum length of the runtime index array.
1002 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1003 * (The actual maximum length is lower,
1004 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1006 private static final int UTRIE2_MAX_INDEX_LENGTH = 0xffff;
1009 * Maximum length of the runtime data array.
1010 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1011 * and by uint16_t UTrie2Header.shiftedDataLength.
1013 private static final int UTRIE2_MAX_DATA_LENGTH = 0xffff<<UTRIE2_INDEX_SHIFT;
1015 /* Compact the data and then populate an optimized read-only Trie. */
1016 private void freeze(Trie2 dest, ValueWidth valueBits) {
1018 int allIndexesLength;
1019 int dataMove; /* >0 if the data is moved to the end of the index array */
1022 /* compact if necessary */
1027 if(highStart<=0x10000) {
1028 allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1030 allIndexesLength=index2Length;
1032 if(valueBits==ValueWidth.BITS_16) {
1033 dataMove=allIndexesLength;
1038 /* are indexLength and dataLength within limits? */
1039 if( /* for unshifted indexLength */
1040 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1041 /* for unshifted dataNullOffset */
1042 (dataMove+dataNullOffset)>0xffff ||
1043 /* for unshifted 2-byte UTF-8 index-2 values */
1044 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1045 /* for shiftedDataLength */
1046 (dataMove+dataLength)>UTRIE2_MAX_DATA_LENGTH) {
1047 throw new UnsupportedOperationException("Trie2 data is too large.");
1050 /* calculate the sizes of, and allocate, the index and data arrays */
1051 int indexLength = allIndexesLength;
1052 if (valueBits==ValueWidth.BITS_16) {
1053 indexLength += dataLength;
1055 dest.data32 = new int[dataLength];
1057 dest.index = new char[indexLength];
1059 dest.indexLength = allIndexesLength;
1060 dest.dataLength = dataLength;
1061 if(highStart<=0x10000) {
1062 dest.index2NullOffset = 0xffff;
1064 dest.index2NullOffset = UTRIE2_INDEX_2_OFFSET + index2NullOffset;
1066 dest.initialValue = initialValue;
1067 dest.errorValue = errorValue;
1068 dest.highStart = highStart;
1069 dest.highValueIndex = dataMove + dataLength - UTRIE2_DATA_GRANULARITY;
1070 dest.dataNullOffset = (dataMove+dataNullOffset);
1072 // Create a header and set the its fields.
1073 // (This is only used in the event that we serialize the Trie, but is
1074 // convenient to do here.)
1075 dest.header = new Trie2.UTrie2Header();
1076 dest.header.signature = 0x54726932; /* "Tri2" */
1077 dest.header.options = valueBits==ValueWidth.BITS_16 ? 0 : 1;
1078 dest.header.indexLength = dest.indexLength;
1079 dest.header.shiftedDataLength = dest.dataLength>>UTRIE2_INDEX_SHIFT;
1080 dest.header.index2NullOffset = dest.index2NullOffset;
1081 dest.header.dataNullOffset = dest.dataNullOffset;
1082 dest.header.shiftedHighStart = dest.highStart>>UTRIE2_SHIFT_1;
1086 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1088 for(i=0; i<UTRIE2_INDEX_2_BMP_LENGTH; i++) {
1089 dest.index[destIdx++] = (char)((index2[i]+dataMove) >> UTRIE2_INDEX_SHIFT);
1092 System.out.println("\n\nIndex2 for BMP limit is " + Integer.toHexString(destIdx));
1095 /* write UTF-8 2-byte index-2 values, not right-shifted */
1096 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
1097 dest.index[destIdx++] = (char)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1099 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
1100 dest.index[destIdx++]=(char)(dataMove+index2[i<<(6-UTRIE2_SHIFT_2)]);
1103 System.out.println("Index2 for UTF-8 2byte values limit is " + Integer.toHexString(destIdx));
1106 if(highStart>0x10000) {
1107 int index1Length = (highStart-0x10000)>>UTRIE2_SHIFT_1;
1108 int index2Offset = UTRIE2_INDEX_2_BMP_LENGTH + UTRIE2_UTF8_2B_INDEX_2_LENGTH + index1Length;
1110 /* write 16-bit index-1 values for supplementary code points */
1111 //p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1112 for(i=0; i<index1Length; i++) {
1113 //*dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1114 dest.index[destIdx++] = (char)(UTRIE2_INDEX_2_OFFSET + index1[i+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH]);
1117 System.out.println("Index 1 for supplementals, limit is " + Integer.toHexString(destIdx));
1121 * write the index-2 array values for supplementary code points,
1122 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1124 for(i=0; i<index2Length-index2Offset; i++) {
1125 dest.index[destIdx++] = (char)((dataMove + index2[index2Offset+i])>>UTRIE2_INDEX_SHIFT);
1128 System.out.println("Index 2 for supplementals, limit is " + Integer.toHexString(destIdx));
1132 /* write the 16/32-bit data array */
1135 /* write 16-bit data values */
1136 assert(destIdx == dataMove);
1137 dest.data16 = destIdx;
1138 for(i=0; i<dataLength; i++) {
1139 dest.index[destIdx++] = (char)data[i];
1143 /* write 32-bit data values */
1144 for (i=0; i<dataLength; i++) {
1145 dest.data32[i] = this.data[i];
1149 // The writable, but compressed, Trie2 stays around unless the caller drops its references to it.
1153 /* Start with allocation of 16k data entries. */
1154 private static final int UNEWTRIE2_INITIAL_DATA_LENGTH = 1<<14;
1156 /* Grow about 8x each time. */
1157 private static final int UNEWTRIE2_MEDIUM_DATA_LENGTH = 1<<17;
1159 /** The null index-2 block, following the gap in the index-2 table. */
1160 private static final int UNEWTRIE2_INDEX_2_NULL_OFFSET = UNEWTRIE2_INDEX_GAP_OFFSET + UNEWTRIE2_INDEX_GAP_LENGTH;
1162 /** The start of allocated index-2 blocks. */
1163 private static final int UNEWTRIE2_INDEX_2_START_OFFSET = UNEWTRIE2_INDEX_2_NULL_OFFSET + UTRIE2_INDEX_2_BLOCK_LENGTH;
1166 * The null data block.
1167 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
1168 * to work with 6-bit trail bytes from 2-byte UTF-8.
1170 private static final int UNEWTRIE2_DATA_NULL_OFFSET = UTRIE2_DATA_START_OFFSET;
1172 /** The start of allocated data blocks. */
1173 private static final int UNEWTRIE2_DATA_START_OFFSET = UNEWTRIE2_DATA_NULL_OFFSET+0x40;
1176 * The start of data blocks for U+0800 and above.
1177 * Below, compaction uses a block length of 64 for 2-byte UTF-8.
1178 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
1179 * Data values for 0x780 code points beyond ASCII.
1181 private static final int UNEWTRIE2_DATA_0800_OFFSET = UNEWTRIE2_DATA_START_OFFSET+0x780;
1184 // Private data members. From struct UNewTrie2 in ICU4C
1186 private int[] index1 = new int[UNEWTRIE2_INDEX_1_LENGTH];
1187 private int[] index2 = new int[UNEWTRIE2_MAX_INDEX_2_LENGTH];
1190 private int index2Length;
1191 private int dataCapacity;
1192 private int firstFreeBlock;
1193 private int index2NullOffset;
1194 private boolean isCompacted;
1198 * Multi-purpose per-data-block table.
1200 * Before compacting:
1202 * Per-data-block reference counters/free-block list.
1204 * >0: reference counter (number of index-2 entries pointing here)
1205 * <0: next free data block in free-block list
1209 * Map of adjusted indexes, used in compactData() and compactIndex2().
1210 * Maps from original indexes to new ones.
1212 private int[] map = new int[UNEWTRIE2_MAX_DATA_LENGTH>>UTRIE2_SHIFT_2];
1215 private boolean UTRIE2_DEBUG = false;