2 *******************************************************************************
\r
3 * Copyright (C) 2004-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
10 * For generation of Implicit CEs
\r
11 * @author Mark Davis
\r
13 * Cleaned up so that changes can be made more easily.
\r
15 # First Implicit: E26A792D
\r
16 # Last Implicit: E3DC70C0
\r
17 # First CJK: E0030300
\r
18 # Last CJK: E0A9DD00
\r
19 # First CJK_A: E0A9DF00
\r
20 # Last CJK_A: E0DE3100
\r
23 public class ImplicitCEGenerator {
\r
28 static final boolean DEBUG = false;
\r
30 static final long topByte = 0xFF000000L;
\r
31 static final long bottomByte = 0xFFL;
\r
32 static final long fourBytes = 0xFFFFFFFFL;
\r
34 static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
\r
36 public static final int CJK_BASE = 0x4E00;
\r
37 public static final int CJK_LIMIT = 0x9FFF+1;
\r
38 public static final int CJK_COMPAT_USED_BASE = 0xFA0E;
\r
39 public static final int CJK_COMPAT_USED_LIMIT = 0xFA2F+1;
\r
40 public static final int CJK_A_BASE = 0x3400;
\r
41 public static final int CJK_A_LIMIT = 0x4DBF+1;
\r
42 public static final int CJK_B_BASE = 0x20000;
\r
43 public static final int CJK_B_LIMIT = 0x2A6DF+1;
\r
45 // private void throwError(String title, int cp) {
\r
46 // throw new IllegalArgumentException(title + "\t" + Utility.hex(cp, 6) + "\t" +
\r
47 // Utility.hex(getImplicitFromRaw(cp) & fourBytes));
\r
50 // private void throwError(String title, long ce) {
\r
51 // throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
\r
54 // private void show(int i) {
\r
55 // if (i >= 0 && i <= MAX_INPUT) {
\r
56 // System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
\r
61 * Precomputed by constructor
\r
63 int final3Multiplier;
\r
64 int final4Multiplier;
\r
77 public int getGap4() {
\r
78 return final4Multiplier - 1;
\r
81 public int getGap3() {
\r
82 return final3Multiplier - 1;
\r
86 // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
\r
87 // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
\r
88 // we shift so that HAN all has the same first primary, for compression.
\r
89 // for the 4 byte case, we make the gap as large as we can fit.
\r
92 * Supply parameters for generating implicit CEs
\r
94 public ImplicitCEGenerator(int minPrimary, int maxPrimary) {
\r
95 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
\r
96 this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
\r
100 * Set up to generate implicits.
\r
101 * @param minPrimary The minimum primary value.
\r
102 * @param maxPrimary The maximum primary value.
\r
103 * @param minTrail final byte
\r
104 * @param maxTrail final byte
\r
105 * @param gap3 the gap we leave for tailoring for 3-byte forms
\r
106 * @param primaries3count number of 3-byte primaries we can use (normally 1)
\r
108 public ImplicitCEGenerator(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
\r
109 // some simple parameter checks
\r
110 if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
\r
111 throw new IllegalArgumentException("bad lead bytes");
\r
113 if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
\r
114 throw new IllegalArgumentException("bad trail bytes");
\r
116 if (primaries3count < 1) {
\r
117 throw new IllegalArgumentException("bad three-byte primaries");
\r
120 this.minTrail = minTrail;
\r
121 this.maxTrail = maxTrail;
\r
123 min3Primary = minPrimary;
\r
124 max4Primary = maxPrimary;
\r
125 // compute constants for use later.
\r
126 // number of values we can use in trailing bytes
\r
127 // leave room for empty values between AND above, e.g. if gap = 2
\r
128 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
\r
129 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
\r
130 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
\r
131 final3Multiplier = gap3 + 1;
\r
132 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
\r
133 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
\r
135 // medials can use full range
\r
136 medialCount = (maxTrail - minTrail + 1);
\r
137 // find out how many values fit in each form
\r
138 int threeByteCount = medialCount * final3Count;
\r
139 // now determine where the 3/4 boundary is.
\r
140 // we use 3 bytes below the boundary, and 4 above
\r
141 int primariesAvailable = maxPrimary - minPrimary + 1;
\r
142 int primaries4count = primariesAvailable - primaries3count;
\r
144 int min3ByteCoverage = primaries3count * threeByteCount;
\r
145 min4Primary = minPrimary + primaries3count;
\r
146 min4Boundary = min3ByteCoverage;
\r
147 // Now expand out the multiplier for the 4 bytes, and redo.
\r
149 int totalNeeded = MAX_INPUT - min4Boundary;
\r
150 int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
\r
151 if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
\r
153 int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
\r
154 if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
\r
156 int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
\r
157 if (DEBUG) System.out.println("expandedGap: " + gap4);
\r
158 if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
\r
160 final4Multiplier = gap4 + 1;
\r
161 final4Count = neededPerFinalByte;
\r
162 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
\r
164 if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
\r
165 throw new IllegalArgumentException("internal error");
\r
168 System.out.println("final4Count: " + final4Count);
\r
169 for (int counter = 0; counter < final4Count; ++counter) {
\r
170 int value = minTrail + (1 + counter)*final4Multiplier;
\r
171 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
\r
176 static public int divideAndRoundUp(int a, int b) {
\r
177 return 1 + (a-1)/b;
\r
181 * Converts implicit CE into raw integer
\r
182 * @param implicit The implicit value passed.
\r
183 * @return -1 if illegal format
\r
185 public int getRawFromImplicit(int implicit) {
\r
187 int b3 = implicit & 0xFF;
\r
189 int b2 = implicit & 0xFF;
\r
191 int b1 = implicit & 0xFF;
\r
193 int b0 = implicit & 0xFF;
\r
195 // simple parameter checks
\r
196 if (b0 < min3Primary || b0 > max4Primary
\r
197 || b1 < minTrail || b1 > maxTrail) return -1;
\r
201 // take care of the final values, and compose
\r
202 if (b0 < min4Primary) {
\r
203 if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
\r
205 int remainder = b2 % final3Multiplier;
\r
206 if (remainder != 0) return -1;
\r
208 b2 /= final3Multiplier;
\r
209 result = ((b0 * medialCount) + b1) * final3Count + b2;
\r
211 if (b2 < minTrail || b2 > maxTrail
\r
212 || b3 < minTrail || b3 > max4Trail) return -1;
\r
215 int remainder = b3 % final4Multiplier;
\r
216 if (remainder != 0) return -1;
\r
217 b3 /= final4Multiplier;
\r
219 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
\r
222 if (result < 0 || result > MAX_INPUT) return -1;
\r
227 * Generate the implicit CE, from raw integer.
\r
228 * Left shifted to put the first byte at the top of an int.
\r
229 * @param cp code point
\r
230 * @return Primary implicit weight
\r
232 public int getImplicitFromRaw(int cp) {
\r
233 if (cp < 0 || cp > MAX_INPUT) {
\r
234 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
\r
236 int last0 = cp - min4Boundary;
\r
238 int last1 = cp / final3Count;
\r
239 last0 = cp % final3Count;
\r
241 int last2 = last1 / medialCount;
\r
242 last1 %= medialCount;
\r
244 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
\r
245 last1 = minTrail + last1; // offset
\r
246 last2 = min3Primary + last2; // offset
\r
248 if (last2 >= min4Primary) {
\r
249 throw new IllegalArgumentException("4-byte out of range: " +
\r
250 Utility.hex(cp) + ", " + Utility.hex(last2));
\r
253 return (last2 << 24) + (last1 << 16) + (last0 << 8);
\r
255 int last1 = last0 / final4Count;
\r
256 last0 %= final4Count;
\r
258 int last2 = last1 / medialCount;
\r
259 last1 %= medialCount;
\r
261 int last3 = last2 / medialCount;
\r
262 last2 %= medialCount;
\r
264 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
\r
265 last1 = minTrail + last1; // offset
\r
266 last2 = minTrail + last2; // offset
\r
267 last3 = min4Primary + last3; // offset
\r
269 if (last3 > max4Primary) {
\r
270 throw new IllegalArgumentException("4-byte out of range: " +
\r
271 Utility.hex(cp) + ", " + Utility.hex(last3));
\r
274 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
\r
279 * Gets an Implicit from a code point. Internally,
\r
280 * swaps the CJK blocks (which produces a raw value 0..220000),
\r
281 * then converts raw to implicit.
\r
282 * @param cp The code point to convert to implicit.
\r
283 * @return Primary implicit weight
\r
285 public int getImplicitFromCodePoint(int cp) {
\r
286 if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
\r
288 // Produce Raw value
\r
289 // note, we add 1 so that the first value is always empty!!
\r
290 cp = ImplicitCEGenerator.swapCJK(cp) + 1;
\r
291 // we now have a range of numbers from 0 to 220000.
\r
293 if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
\r
295 return getImplicitFromRaw(cp);
\r
299 * Function used to:
\r
300 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
\r
301 * b) bump any non-CJK characters by 110000 (NON_CJK_OFFSET).
\r
302 * The relevant blocks are:
\r
303 * A: 4E00..9FFF; CJK Unified Ideographs
\r
304 * F900..FAFF; CJK Compatibility Ideographs
\r
305 * B: 3400..4DBF; CJK Unified Ideographs Extension A
\r
306 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
\r
308 * no new B characters are allocated between 4E00 and FAFF, and
\r
309 * no new A characters are outside of this range,
\r
310 * (very high probability) this simple code will work.
\r
311 * The reordered blocks are:
\r
313 * Block2 is CJK_COMPAT_USED
\r
316 * Any other CJK gets its normal code point
\r
317 * Any non-CJK gets +110000
\r
318 * When we reorder Block1, we make sure that it is at the very start,
\r
319 * so that it will use a 3-byte form.
\r
320 * Warning: we only pick up the compatibility characters that are
\r
321 * NOT decomposed, so that block is smaller!
\r
324 static int NON_CJK_OFFSET = 0x110000;
\r
326 static int swapCJK(int i) {
\r
328 if (i >= CJK_BASE) {
\r
329 if (i < CJK_LIMIT) return i - CJK_BASE;
\r
331 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
\r
333 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
\r
334 + (CJK_LIMIT - CJK_BASE);
\r
335 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
\r
337 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
\r
339 return i + NON_CJK_OFFSET; // non-CJK
\r
341 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
\r
343 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
\r
344 + (CJK_LIMIT - CJK_BASE)
\r
345 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
\r
346 return i + NON_CJK_OFFSET; // non-CJK
\r
351 * @return Minimal trail value
\r
353 public int getMinTrail() {
\r
358 * @return Maximal trail value
\r
360 public int getMaxTrail() {
\r
364 public int getCodePointFromRaw(int i) {
\r
367 if(i >= NON_CJK_OFFSET) {
\r
368 result = i - NON_CJK_OFFSET;
\r
369 } else if(i >= CJK_B_BASE) {
\r
371 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
\r
372 // rest of CJKs, compacted
\r
373 if(i < CJK_LIMIT - CJK_BASE) {
\r
374 result = i + CJK_BASE;
\r
375 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
\r
376 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
\r
378 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
\r
386 public int getRawFromCodePoint(int i) {
\r
387 return swapCJK(i)+1;
\r