2 ******************************************************************************
\r
4 * Copyright (C) 2009-2010, International Business Machines
\r
5 * Corporation and others. All Rights Reserved.
\r
7 ******************************************************************************
\r
10 package com.ibm.icu.impl;
\r
12 import com.ibm.icu.text.UnicodeSet.SpanCondition;
\r
15 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
\r
17 * Latin-1: Look up bytes. 2-byte characters: Bits organized vertically. 3-byte characters: Use zero/one/mixed data
\r
18 * per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. Supplementary characters: Call contains() on the
\r
21 public final class BMPSet {
\r
22 public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
\r
25 * One boolean ('true' or 'false') per Latin-1 character.
\r
27 private boolean[] latin1Contains;
\r
30 * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
\r
31 * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
\r
32 * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
\r
34 * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
\r
37 private int[] table7FF;
\r
40 * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
\r
41 * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
\r
42 * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
\r
43 * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
\r
44 * and set.contains(c) must be called.
\r
46 * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
\r
47 * validity checking at runtime.
\r
49 private int[] bmpBlockBits;
\r
52 * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
\r
53 * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
\r
54 * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
\r
56 private int[] list4kStarts;
\r
59 * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
\r
60 * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
\r
62 private final int[] list;
\r
63 private final int listLength; // length used; list may be longer to minimize reallocs
\r
65 public BMPSet(final int[] parentList, int parentListLength) {
\r
67 listLength = parentListLength;
\r
68 latin1Contains = new boolean[0x100];
\r
69 table7FF = new int[64];
\r
70 bmpBlockBits = new int[64];
\r
71 list4kStarts = new int[18];
\r
74 * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
\r
75 * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
\r
76 * indexes is for finding supplementary code points.
\r
78 list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
\r
80 for (i = 1; i <= 0x10; ++i) {
\r
81 list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
\r
83 list4kStarts[0x11] = listLength - 1;
\r
88 public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) {
\r
89 list = newParentList;
\r
90 listLength = newParentListLength;
\r
91 latin1Contains = otherBMPSet.latin1Contains.clone();
\r
92 table7FF = otherBMPSet.table7FF.clone();
\r
93 bmpBlockBits = otherBMPSet.bmpBlockBits.clone();
\r
94 list4kStarts = otherBMPSet.list4kStarts.clone();
\r
97 public boolean contains(int c) {
\r
99 return (latin1Contains[c]);
\r
100 } else if (c <= 0x7ff) {
\r
101 return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
\r
102 } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
\r
103 int lead = c >> 12;
\r
104 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
\r
105 if (twoBits <= 1) {
\r
106 // All 64 code points with the same bits 15..6
\r
107 // are either in the set or not.
\r
108 return (0 != twoBits);
\r
110 // Look up the code point in its 4k block of code points.
\r
111 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
\r
113 } else if (c <= 0x10ffff) {
\r
114 // surrogate or supplementary code point
\r
115 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
\r
117 // Out-of-range code points get false, consistent with long-standing
\r
118 // behavior of UnicodeSet.contains(c).
\r
124 * Span the initial substring for which each character c has spanCondition==contains(c). It must be
\r
125 * spanCondition==0 or 1.
\r
127 * @param start The start index
\r
128 * @param end The end index
\r
129 * @return The length of the span.
\r
131 * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
\r
132 * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
\r
135 public final int span(CharSequence s, int start, int end, SpanCondition spanCondition) {
\r
138 int limit = Math.min(s.length(), end);
\r
139 if (SpanCondition.NOT_CONTAINED != spanCondition) {
\r
141 while (i < limit) {
\r
144 if (!latin1Contains[c]) {
\r
147 } else if (c <= 0x7ff) {
\r
148 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
\r
151 } else if (c < 0xd800 ||
\r
152 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
\r
153 int lead = c >> 12;
\r
154 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
\r
155 if (twoBits <= 1) {
\r
156 // All 64 code points with the same bits 15..6
\r
157 // are either in the set or not.
\r
158 if (twoBits == 0) {
\r
162 // Look up the code point in its 4k block of code points.
\r
163 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
\r
169 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
\r
170 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
\r
179 while (i < limit) {
\r
182 if (latin1Contains[c]) {
\r
185 } else if (c <= 0x7ff) {
\r
186 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
\r
189 } else if (c < 0xd800 ||
\r
190 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
\r
191 int lead = c >> 12;
\r
192 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
\r
193 if (twoBits <= 1) {
\r
194 // All 64 code points with the same bits 15..6
\r
195 // are either in the set or not.
\r
196 if (twoBits != 0) {
\r
200 // Look up the code point in its 4k block of code points.
\r
201 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
\r
207 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
\r
208 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
\r
220 * Symmetrical with span().
\r
221 * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
\r
222 * limit and spanCondition==0 or 1.
\r
224 * @return The string index which starts the span (i.e. inclusive).
\r
226 public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
\r
229 limit = Math.min(s.length(), limit);
\r
230 if (SpanCondition.NOT_CONTAINED != spanCondition) {
\r
233 c = s.charAt(--limit);
\r
235 if (!latin1Contains[c]) {
\r
238 } else if (c <= 0x7ff) {
\r
239 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
\r
242 } else if (c < 0xd800 ||
\r
243 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
\r
244 int lead = c >> 12;
\r
245 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
\r
246 if (twoBits <= 1) {
\r
247 // All 64 code points with the same bits 15..6
\r
248 // are either in the set or not.
\r
249 if (twoBits == 0) {
\r
253 // Look up the code point in its 4k block of code points.
\r
254 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
\r
260 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
\r
261 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
\r
273 c = s.charAt(--limit);
\r
275 if (latin1Contains[c]) {
\r
278 } else if (c <= 0x7ff) {
\r
279 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
\r
282 } else if (c < 0xd800 ||
\r
283 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
\r
284 int lead = c >> 12;
\r
285 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
\r
286 if (twoBits <= 1) {
\r
287 // All 64 code points with the same bits 15..6
\r
288 // are either in the set or not.
\r
289 if (twoBits != 0) {
\r
293 // Look up the code point in its 4k block of code points.
\r
294 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
\r
300 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
\r
301 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
\r
315 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
\r
317 private static void set32x64Bits(int[] table, int start, int limit) {
\r
318 assert (64 == table.length);
\r
319 int lead = start >> 6;
\r
320 int trail = start & 0x3f;
\r
322 // Set one bit indicating an all-one block.
\r
323 int bits = 1 << lead;
\r
324 if ((start + 1) == limit) { // Single-character shortcut.
\r
325 table[trail] |= bits;
\r
329 int limitLead = limit >> 6;
\r
330 int limitTrail = limit & 0x3f;
\r
332 if (lead == limitLead) {
\r
333 // Partial vertical bit column.
\r
334 while (trail < limitTrail) {
\r
335 table[trail++] |= bits;
\r
338 // Partial vertical bit column,
\r
339 // followed by a bit rectangle,
\r
340 // followed by another partial vertical bit column.
\r
343 table[trail++] |= bits;
\r
344 } while (trail < 64);
\r
347 if (lead < limitLead) {
\r
348 bits = ~((1 << lead) - 1);
\r
349 if (limitLead < 0x20) {
\r
350 bits &= (1 << limitLead) - 1;
\r
352 for (trail = 0; trail < 64; ++trail) {
\r
353 table[trail] |= bits;
\r
356 bits = 1 << limitLead;
\r
357 for (trail = 0; trail < limitTrail; ++trail) {
\r
358 table[trail] |= bits;
\r
363 private void initBits() {
\r
367 // Set latin1Contains[].
\r
369 start = list[listIndex++];
\r
370 if (listIndex < listLength) {
\r
371 limit = list[listIndex++];
\r
375 if (start >= 0x100) {
\r
379 latin1Contains[start++] = true;
\r
380 } while (start < limit && start < 0x100);
\r
381 } while (limit <= 0x100);
\r
384 while (start < 0x800) {
\r
385 set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
\r
386 if (limit > 0x800) {
\r
391 start = list[listIndex++];
\r
392 if (listIndex < listLength) {
\r
393 limit = list[listIndex++];
\r
399 // Set bmpBlockBits[].
\r
400 int minStart = 0x800;
\r
401 while (start < 0x10000) {
\r
402 if (limit > 0x10000) {
\r
406 if (start < minStart) {
\r
409 if (start < limit) { // Else: Another range entirely in a known mixed-value block.
\r
410 if (0 != (start & 0x3f)) {
\r
411 // Mixed-value block of 64 code points.
\r
413 bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
\r
414 start = (start + 1) << 6; // Round up to the next block boundary.
\r
415 minStart = start; // Ignore further ranges in this block.
\r
417 if (start < limit) {
\r
418 if (start < (limit & ~0x3f)) {
\r
419 // Multiple all-ones blocks of 64 code points each.
\r
420 set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
\r
423 if (0 != (limit & 0x3f)) {
\r
424 // Mixed-value block of 64 code points.
\r
426 bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
\r
427 limit = (limit + 1) << 6; // Round up to the next block boundary.
\r
428 minStart = limit; // Ignore further ranges in this block.
\r
433 if (limit == 0x10000) {
\r
437 start = list[listIndex++];
\r
438 if (listIndex < listLength) {
\r
439 limit = list[listIndex++];
\r
448 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
\r
449 * points in a certain range.
\r
451 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
\r
452 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
\r
455 * a character in a subrange of MIN_VALUE..MAX_VALUE
\r
457 * The lowest index to be returned.
\r
459 * The highest index to be returned.
\r
460 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
\r
462 private int findCodePoint(int c, int lo, int hi) {
\r
465 set list[] c=0 1 3 4 7 8
\r
466 === ============== ===========
\r
467 [] [110000] 0 0 0 0 0 0
\r
468 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
\r
469 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
\r
470 [:Any:] [0, 110000] 1 1 1 1 1 1
\r
473 // Return the smallest i such that c < list[i]. Assume
\r
474 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
\r
477 // High runner test. c is often after the last range, so an
\r
478 // initial check for this condition pays off.
\r
479 if (lo >= hi || c >= list[hi - 1])
\r
481 // invariant: c >= list[lo]
\r
482 // invariant: c < list[hi]
\r
484 int i = (lo + hi) >> 1;
\r
487 } else if (c < list[i]) {
\r
496 private final boolean containsSlow(int c, int lo, int hi) {
\r
497 return (0 != (findCodePoint(c, lo, hi) & 1));
\r