/*
 *******************************************************************************
 * Copyright (C) 2004-2012, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
7 package com.ibm.icu.impl;
/**
 * For generation of Implicit CEs.
 *
 * Cleaned up so that changes can be made more easily.
 *
 * # First Implicit: E26A792D
 * # Last Implicit: E3DC70C0
 * # First CJK_A: E0A9DF00
 * # Last CJK_A: E0DE3100
 */
23 public class ImplicitCEGenerator {
28 static final boolean DEBUG = false;
30 static final long topByte = 0xFF000000L;
31 static final long bottomByte = 0xFFL;
32 static final long fourBytes = 0xFFFFFFFFL;
34 static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
//    public static final int CJK_BASE = 0x4E00;
//    public static final int CJK_LIMIT = 0x9FFF+1;
//    public static final int CJK_COMPAT_USED_BASE = 0xFA0E;
//    public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F+1;
//    public static final int CJK_A_BASE = 0x3400;
//    public static final int CJK_A_LIMIT = 0x4DBF+1;
//    public static final int CJK_B_BASE = 0x20000;
//    public static final int CJK_B_LIMIT = 0x2A6DF+1;
45 public static final int
46 // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
47 // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; (Unicode 6.1)
51 CJK_COMPAT_USED_BASE = 0xFA0E,
52 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
54 //3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
55 //4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
58 CJK_A_LIMIT = 0x4DB5+1,
60 //20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
61 //2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
64 CJK_B_LIMIT = 0x2A6D6+1,
66 //2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
67 //2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
70 CJK_C_LIMIT = 0x2B734+1,
72 //2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
73 //2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
76 CJK_D_LIMIT = 0x2B81D+1
78 // when adding to this list, look for all occurrences (in project) of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
//    private void throwError(String title, int cp) {
//        throw new IllegalArgumentException(title + "\t" + Utility.hex(cp, 6) + "\t" +
//                                           Utility.hex(getImplicitFromRaw(cp) & fourBytes));
//    }
//
//    private void throwError(String title, long ce) {
//        throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
//    }
//
//    private void show(int i) {
//        if (i >= 0 && i <= MAX_INPUT) {
//            System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
//        }
//    }
97 * Precomputed by constructor
100 int final4Multiplier;
113 public int getGap4() {
114 return final4Multiplier - 1;
117 public int getGap3() {
118 return final3Multiplier - 1;
// we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
// we shift so that HAN all has the same first primary, for compression.
// for the 4 byte case, we make the gap as large as we can fit.
/**
 * Supply parameters for generating implicit CEs.
 * Uses trail bytes 0x04..0xFE, a 3-byte gap of 1, and a single 3-byte primary.
 *
 * @param minPrimary The minimum primary (lead byte) value.
 * @param maxPrimary The maximum primary (lead byte) value.
 * @throws IllegalArgumentException if the lead-byte range is invalid
 */
public ImplicitCEGenerator(int minPrimary, int maxPrimary) {
    // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
    this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
}
136 * Set up to generate implicits.
137 * @param minPrimary The minimum primary value.
138 * @param maxPrimary The maximum primary value.
139 * @param minTrail final byte
140 * @param maxTrail final byte
141 * @param gap3 the gap we leave for tailoring for 3-byte forms
142 * @param primaries3count number of 3-byte primarys we can use (normally 1)
144 public ImplicitCEGenerator(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
145 // some simple parameter checks
146 if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
147 throw new IllegalArgumentException("bad lead bytes");
149 if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
150 throw new IllegalArgumentException("bad trail bytes");
152 if (primaries3count < 1) {
153 throw new IllegalArgumentException("bad three-byte primaries");
156 this.minTrail = minTrail;
157 this.maxTrail = maxTrail;
159 min3Primary = minPrimary;
160 max4Primary = maxPrimary;
161 // compute constants for use later.
162 // number of values we can use in trailing bytes
163 // leave room for empty values between AND above, e.g. if gap = 2
164 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
165 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
166 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
167 final3Multiplier = gap3 + 1;
168 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
169 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
171 // medials can use full range
172 medialCount = (maxTrail - minTrail + 1);
173 // find out how many values fit in each form
174 int threeByteCount = medialCount * final3Count;
175 // now determine where the 3/4 boundary is.
176 // we use 3 bytes below the boundary, and 4 above
177 int primariesAvailable = maxPrimary - minPrimary + 1;
178 int primaries4count = primariesAvailable - primaries3count;
180 int min3ByteCoverage = primaries3count * threeByteCount;
181 min4Primary = minPrimary + primaries3count;
182 min4Boundary = min3ByteCoverage;
183 // Now expand out the multiplier for the 4 bytes, and redo.
185 int totalNeeded = MAX_INPUT - min4Boundary;
186 int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
187 if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
189 int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
190 if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
192 int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
193 if (DEBUG) System.out.println("expandedGap: " + gap4);
194 if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
196 final4Multiplier = gap4 + 1;
197 final4Count = neededPerFinalByte;
198 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
200 if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
201 throw new IllegalArgumentException("internal error");
204 System.out.println("final4Count: " + final4Count);
205 for (int counter = 0; counter < final4Count; ++counter) {
206 int value = minTrail + (1 + counter)*final4Multiplier;
207 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
212 static public int divideAndRoundUp(int a, int b) {
217 * Converts implicit CE into raw integer
218 * @param implicit The implicit value passed.
219 * @return -1 if illegal format
221 public int getRawFromImplicit(int implicit) {
223 int b3 = implicit & 0xFF;
225 int b2 = implicit & 0xFF;
227 int b1 = implicit & 0xFF;
229 int b0 = implicit & 0xFF;
231 // simple parameter checks
232 if (b0 < min3Primary || b0 > max4Primary
233 || b1 < minTrail || b1 > maxTrail) return -1;
237 // take care of the final values, and compose
238 if (b0 < min4Primary) {
239 if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
241 int remainder = b2 % final3Multiplier;
242 if (remainder != 0) return -1;
244 b2 /= final3Multiplier;
245 result = ((b0 * medialCount) + b1) * final3Count + b2;
247 if (b2 < minTrail || b2 > maxTrail
248 || b3 < minTrail || b3 > max4Trail) return -1;
251 int remainder = b3 % final4Multiplier;
252 if (remainder != 0) return -1;
253 b3 /= final4Multiplier;
255 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
258 if (result < 0 || result > MAX_INPUT) return -1;
263 * Generate the implicit CE, from raw integer.
264 * Left shifted to put the first byte at the top of an int.
265 * @param cp code point
266 * @return Primary implicit weight
268 public int getImplicitFromRaw(int cp) {
269 if (cp < 0 || cp > MAX_INPUT) {
270 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
272 int last0 = cp - min4Boundary;
274 int last1 = cp / final3Count;
275 last0 = cp % final3Count;
277 int last2 = last1 / medialCount;
278 last1 %= medialCount;
280 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
281 last1 = minTrail + last1; // offset
282 last2 = min3Primary + last2; // offset
284 if (last2 >= min4Primary) {
285 throw new IllegalArgumentException("4-byte out of range: " +
286 Utility.hex(cp) + ", " + Utility.hex(last2));
289 return (last2 << 24) + (last1 << 16) + (last0 << 8);
291 int last1 = last0 / final4Count;
292 last0 %= final4Count;
294 int last2 = last1 / medialCount;
295 last1 %= medialCount;
297 int last3 = last2 / medialCount;
298 last2 %= medialCount;
300 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
301 last1 = minTrail + last1; // offset
302 last2 = minTrail + last2; // offset
303 last3 = min4Primary + last3; // offset
305 if (last3 > max4Primary) {
306 throw new IllegalArgumentException("4-byte out of range: " +
307 Utility.hex(cp) + ", " + Utility.hex(last3));
310 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
315 * Gets an Implicit from a code point. Internally,
316 * swaps (which produces a raw value 0..220000,
317 * then converts raw to implicit.
318 * @param cp The code point to convert to implicit.
319 * @return Primary implicit weight
321 public int getImplicitFromCodePoint(int cp) {
322 if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
325 // note, we add 1 so that the first value is always empty!!
326 cp = ImplicitCEGenerator.swapCJK(cp) + 1;
327 // we now have a range of numbers from 0 to 220000.
329 if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
331 return getImplicitFromRaw(cp);
336 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
337 * b) bump any non-CJK characters by 10FFFF.
338 * The relevant blocks are:
339 * A: 4E00..9FFF; CJK Unified Ideographs
340 * F900..FAFF; CJK Compatibility Ideographs
341 * B: 3400..4DBF; CJK Unified Ideographs Extension A
342 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
344 * no new B characters are allocated between 4E00 and FAFF, and
345 * no new A characters are outside of this range,
346 * (very high probability) this simple code will work.
347 * The reordered blocks are:
349 * Block2 is CJK_COMPAT_USED
352 * Any other CJK gets its normal code point
353 * Any non-CJK gets +10FFFF
354 * When we reorder Block1, we make sure that it is at the very start,
355 * so that it will use a 3-byte form.
356 * Warning: the we only pick up the compatibility characters that are
357 * NOT decomposed, so that block is smaller!
360 static int NON_CJK_OFFSET = 0x110000;
362 public static int swapCJK(int i) {
365 if (i < CJK_LIMIT) return i - CJK_BASE;
367 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
369 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
370 + (CJK_LIMIT - CJK_BASE);
371 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
373 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
375 if (i < CJK_C_BASE) return i + NON_CJK_OFFSET;
377 if (i < CJK_C_LIMIT) return i; // non-BMP-CJK
379 if (i < CJK_D_BASE) return i + NON_CJK_OFFSET;
381 if (i < CJK_D_LIMIT) return i; // non-BMP-CJK
383 return i + NON_CJK_OFFSET; // non-CJK
385 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
387 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
388 + (CJK_LIMIT - CJK_BASE)
389 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
390 return i + NON_CJK_OFFSET; // non-CJK
395 * @return Minimal trail value
397 public int getMinTrail() {
402 * @return Maximal trail value
404 public int getMaxTrail() {
408 public int getCodePointFromRaw(int i) {
411 if(i >= NON_CJK_OFFSET) {
412 result = i - NON_CJK_OFFSET;
413 } else if(i >= CJK_B_BASE) {
415 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
416 // rest of CJKs, compacted
417 if(i < CJK_LIMIT - CJK_BASE) {
418 result = i + CJK_BASE;
419 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
420 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
422 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
430 public int getRawFromCodePoint(int i) {