2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.impl;
\r
9 import java.io.ByteArrayInputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.BufferedInputStream;
\r
12 import java.io.InputStream;
\r
13 import java.util.MissingResourceException;
\r
15 import com.ibm.icu.text.Normalizer;
\r
16 import com.ibm.icu.text.UTF16;
\r
17 import com.ibm.icu.text.UnicodeSet;
\r
18 import com.ibm.icu.text.UnicodeSetIterator;
\r
19 import com.ibm.icu.util.RangeValueIterator;
\r
20 import com.ibm.icu.util.VersionInfo;
\r
21 import com.ibm.icu.lang.UCharacter;
\r
25 * @author Ram Viswanadha
\r
27 public final class NormalizerImpl {
\r
28 // Static block for the class to initialize its own self
\r
29 static final NormalizerImpl IMPL;
\r
35 IMPL = new NormalizerImpl();
\r
39 throw new MissingResourceException(e.getMessage(), "", "");
\r
43 static final int UNSIGNED_BYTE_MASK =0xFF;
\r
44 static final long UNSIGNED_INT_MASK = 0xffffffffL;
\r
46 * This new implementation of the normalization code loads its data from
\r
47 * unorm.icu, which is generated with the gennorm tool.
\r
48 * The format of that file is described at the end of this file.
\r
50 private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/unorm.icu";
\r
52 // norm32 value constants
\r
54 // quick check flags 0..3 set mean "no" for their forms
\r
55 public static final int QC_NFC=0x11; /* no|maybe */
\r
56 public static final int QC_NFKC=0x22; /* no|maybe */
\r
57 public static final int QC_NFD=4; /* no */
\r
58 public static final int QC_NFKD=8; /* no */
\r
60 public static final int QC_ANY_NO=0xf;
\r
62 /* quick check flags 4..5 mean "maybe" for their forms;
\r
63 * test flags>=QC_MAYBE
\r
65 public static final int QC_MAYBE=0x10;
\r
66 public static final int QC_ANY_MAYBE=0x30;
\r
68 public static final int QC_MASK=0x3f;
\r
70 private static final int COMBINES_FWD=0x40;
\r
71 private static final int COMBINES_BACK=0x80;
\r
72 public static final int COMBINES_ANY=0xc0;
\r
73 // UnicodeData.txt combining class in bits 15.
\r
74 private static final int CC_SHIFT=8;
\r
75 public static final int CC_MASK=0xff00;
\r
76 // 16 bits for the index to UChars and other extra data
\r
77 private static final int EXTRA_SHIFT=16;
\r
78 // start of surrogate specials after shift
\r
79 //private static final int EXTRA_INDEX_TOP=0xfc00;
\r
81 //private static final int EXTRA_SURROGATE_MASK=0x3ff;
\r
82 //private static final int EXTRA_SURROGATE_TOP=0x3f0; /* hangul etc. */
\r
84 //private static final int EXTRA_HANGUL=EXTRA_SURROGATE_TOP;
\r
85 //private static final int EXTRA_JAMO_L=EXTRA_SURROGATE_TOP+1;/* ### not used */
\r
86 //private static final int EXTRA_JAMO_V=EXTRA_SURROGATE_TOP+2;
\r
87 //private static final int EXTRA_JAMO_T=EXTRA_SURROGATE_TOP+3;
\r
89 /* norm32 value constants using >16 bits */
\r
90 private static final long MIN_SPECIAL = (long)(0xfc000000 & UNSIGNED_INT_MASK);
\r
91 private static final long SURROGATES_TOP = (long)(0xfff00000 & UNSIGNED_INT_MASK);
\r
92 private static final long MIN_HANGUL = (long)(0xfff00000 & UNSIGNED_INT_MASK);
\r
93 //private static final long MIN_JAMO_V = (long)(0xfff20000 & UNSIGNED_INT_MASK);
\r
94 private static final long JAMO_V_TOP = (long)(0xfff30000 & UNSIGNED_INT_MASK);
\r
97 /* indexes[] value names */
\r
98 /* number of bytes in normalization trie */
\r
99 static final int INDEX_TRIE_SIZE = 0;
\r
100 /* number of chars in extra data */
\r
101 static final int INDEX_CHAR_COUNT = 1;
\r
102 /* number of uint16_t words for combining data */
\r
103 static final int INDEX_COMBINE_DATA_COUNT = 2;
\r
104 /* number of code points that combine forward */
\r
105 static final int INDEX_COMBINE_FWD_COUNT = 3;
\r
106 /* number of code points that combine forward and backward */
\r
107 static final int INDEX_COMBINE_BOTH_COUNT = 4;
\r
108 /* number of code points that combine backward */
\r
109 static final int INDEX_COMBINE_BACK_COUNT = 5;
\r
110 /* first code point with quick check NFC NO/MAYBE */
\r
111 public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
\r
112 /* first code point with quick check NFKC NO/MAYBE */
\r
113 public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
\r
114 /* first code point with quick check NFD NO/MAYBE */
\r
115 public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
\r
116 /* first code point with quick check NFKD NO/MAYBE */
\r
117 public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
\r
118 /* number of bytes in FCD trie */
\r
119 static final int INDEX_FCD_TRIE_SIZE = 10;
\r
120 /* number of bytes in the auxiliary trie */
\r
121 static final int INDEX_AUX_TRIE_SIZE = 11;
\r
122 /* number of uint16_t in the array of serialized USet */
\r
123 static final int INDEX_CANON_SET_COUNT = 12;
\r
124 /* changing this requires a new formatVersion */
\r
125 static final int INDEX_TOP = 32;
\r
128 /* AUX constants */
\r
129 /* value constants for auxTrie */
\r
130 private static final int AUX_UNSAFE_SHIFT = 11;
\r
131 private static final int AUX_COMP_EX_SHIFT = 10;
\r
132 private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
\r
134 private static final int AUX_MAX_FNC = ((int)1<<AUX_COMP_EX_SHIFT);
\r
135 private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
\r
136 private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
\r
137 private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
\r
138 private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);
\r
140 /* canonStartSets[0..31] contains indexes for what is in the array */
\r
141 /* number of uint16_t in canonical starter sets */
\r
142 static final int SET_INDEX_CANON_SETS_LENGTH = 0;
\r
143 /* number of uint16_t in the BMP search table (contains pairs) */
\r
144 static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
\r
145 /* number of uint16_t in the supplementary search table(contains triplets)*/
\r
146 static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
\r
147 /* changing this requires a new formatVersion */
\r
148 static final int SET_INDEX_TOP = 32;
\r
150 static final int CANON_SET_INDICIES_INDEX = 0;
\r
151 static final int CANON_SET_START_SETS_INDEX = 1;
\r
152 static final int CANON_SET_BMP_TABLE_INDEX = 2;
\r
153 static final int CANON_SET_SUPP_TABLE_INDEX = 3;
\r
154 /* 14 bit indexes to canonical USerializedSets */
\r
155 static final int CANON_SET_MAX_CANON_SETS = 0x4000;
\r
156 /* single-code point BMP sets are encoded directly in the search table
\r
157 * except if result=0x4000..0x7fff
\r
159 static final int CANON_SET_BMP_MASK = 0xc000;
\r
160 static final int CANON_SET_BMP_IS_INDEX = 0x4000;
\r
162 private static final int MAX_BUFFER_SIZE = 20;
\r
165 * Internal option for cmpEquivFold() for decomposing.
\r
166 * If not set, just do strcasecmp().
\r
169 public static final int COMPARE_EQUIV = 0x80000;
\r
171 /*******************************/
\r
173 /* Wrappers for Trie implementations */
\r
174 static final class NormTrieImpl implements Trie.DataManipulate{
\r
175 static IntTrie normTrie= null;
\r
177 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
\r
178 * data the index array offset of the indexes for that lead surrogate.
\r
179 * @param property data value for a surrogate from the trie, including
\r
180 * the folding offset
\r
181 * @return data offset or 0 if there is no data for the lead surrogate
\r
183 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
\r
184 public int getFoldingOffset(int value){
\r
185 return BMP_INDEX_LENGTH+
\r
186 ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
\r
187 (0x3ff<<SURROGATE_BLOCK_BITS));
\r
191 static final class FCDTrieImpl implements Trie.DataManipulate{
\r
192 static CharTrie fcdTrie=null;
\r
194 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
\r
195 * data the index array offset of the indexes for that lead surrogate.
\r
196 * @param property data value for a surrogate from the trie, including
\r
197 * the folding offset
\r
198 * @return data offset or 0 if there is no data for the lead surrogate
\r
200 /* fcdTrie: the folding offset is the lead FCD value itself */
\r
201 public int getFoldingOffset(int value){
\r
206 static final class AuxTrieImpl implements Trie.DataManipulate{
\r
207 static CharTrie auxTrie = null;
\r
209 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
\r
210 * data the index array offset of the indexes for that lead surrogate.
\r
211 * @param property data value for a surrogate from the trie, including
\r
212 * the folding offset
\r
213 * @return data offset or 0 if there is no data for the lead surrogate
\r
215 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
\r
216 public int getFoldingOffset(int value){
\r
217 return (int)(value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
\r
221 /****************************************************/
\r
224 private static FCDTrieImpl fcdTrieImpl;
\r
225 private static NormTrieImpl normTrieImpl;
\r
226 private static AuxTrieImpl auxTrieImpl;
\r
227 private static int[] indexes;
\r
228 private static char[] combiningTable;
\r
229 private static char[] extraData;
\r
230 private static Object[] canonStartSets;
\r
232 private static boolean isDataLoaded;
\r
233 private static boolean isFormatVersion_2_1;
\r
234 private static boolean isFormatVersion_2_2;
\r
235 private static byte[] unicodeVersion;
\r
238 * Default buffer size of datafile
\r
240 private static final int DATA_BUFFER_SIZE = 25000;
\r
243 * FCD check: everything below this code point is known to have a 0
\r
244 * lead combining class
\r
246 public static final int MIN_WITH_LEAD_CC=0x300;
\r
250 * Bit 7 of the length byte for a decomposition string in extra data is
\r
251 * a flag indicating whether the decomposition string is
\r
252 * preceded by a 16-bit word with the leading and trailing cc
\r
253 * of the decomposition (like for A-umlaut);
\r
254 * if not, then both cc's are zero (like for compatibility ideographs).
\r
256 private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
\r
258 * Bits 6..0 of the length byte contain the actual length.
\r
260 private static final int DECOMP_LENGTH_MASK=0x7f;
\r
262 /** Length of the BMP portion of the index (stage 1) array. */
\r
263 private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
\r
264 /** Number of bits of a trail surrogate that are used in index table
\r
267 private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
\r
271 public static int getFromIndexesArr(int index){
\r
272 return indexes[index];
\r
275 // protected constructor ---------------------------------------------
\r
279 * @exception thrown when data reading fails or data corrupted
\r
281 private NormalizerImpl() throws IOException {
\r
282 //data should be loaded only once
\r
286 InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
\r
287 BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
\r
288 NormalizerDataReader reader = new NormalizerDataReader(b);
\r
290 // read the indexes
\r
291 indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
\r
293 byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
\r
295 int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
\r
296 combiningTable = new char[combiningTableTop];
\r
298 int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
\r
299 extraData = new char[extraDataTop];
\r
301 byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
\r
302 byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
\r
303 canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
\r
305 fcdTrieImpl = new FCDTrieImpl();
\r
306 normTrieImpl = new NormTrieImpl();
\r
307 auxTrieImpl = new AuxTrieImpl();
\r
309 // load the rest of the data data and initialize the data members
\r
310 reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable,
\r
313 NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );
\r
314 FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl );
\r
315 AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
\r
317 // we reached here without any exceptions so the data is fully
\r
318 // loaded set the variable to true
\r
319 isDataLoaded = true;
\r
321 // get the data format version
\r
322 byte[] formatVersion = reader.getDataFormatVersion();
\r
324 isFormatVersion_2_1 =( formatVersion[0]>2
\r
326 (formatVersion[0]==2 && formatVersion[1]>=1)
\r
328 isFormatVersion_2_2 =( formatVersion[0]>2
\r
330 (formatVersion[0]==2 && formatVersion[1]>=2)
\r
332 unicodeVersion = reader.getUnicodeVersion();
\r
337 /* ---------------------------------------------------------------------- */
\r
339 /* Korean Hangul and Jamo constants */
\r
341 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
\r
342 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
\r
343 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
\r
345 public static final int HANGUL_BASE=0xac00;
\r
347 public static final int JAMO_L_COUNT=19;
\r
348 public static final int JAMO_V_COUNT=21;
\r
349 public static final int JAMO_T_COUNT=28;
\r
350 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
\r
352 private static boolean isHangulWithoutJamoT(char c) {
\r
354 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
\r
357 /* norm32 helpers */
\r
359 /* is this a norm32 with a regular index? */
\r
360 private static boolean isNorm32Regular(long norm32) {
\r
361 return norm32<MIN_SPECIAL;
\r
364 /* is this a norm32 with a special index for a lead surrogate? */
\r
365 private static boolean isNorm32LeadSurrogate(long norm32) {
\r
366 return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
\r
369 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
\r
370 private static boolean isNorm32HangulOrJamo(long norm32) {
\r
371 return norm32>=MIN_HANGUL;
\r
375 * Given isNorm32HangulOrJamo(),
\r
376 * is this a Hangul syllable or a Jamo?
\r
378 // private static boolean isHangulJamoNorm32HangulOrJamoL(long norm32) {
\r
379 // return norm32<MIN_JAMO_V;
\r
383 * Given norm32 for Jamo V or T,
\r
384 * is this a Jamo V?
\r
386 private static boolean isJamoVTNorm32JamoV(long norm32) {
\r
387 return norm32<JAMO_V_TOP;
\r
390 /* data access primitives ----------------------------------------------- */
\r
392 public static long/*unsigned*/ getNorm32(char c) {
\r
393 return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
\r
396 public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
\r
399 * the surrogate index in norm32 stores only the number of the surrogate
\r
400 * index block see gennorm/store.c/getFoldedNormValue()
\r
402 return ((UNSIGNED_INT_MASK) &
\r
403 NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
\r
405 private static long getNorm32(int c){
\r
406 return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
\r
409 // private static long getNorm32(int c,int mask){
\r
410 // long/*unsigned*/ norm32= getNorm32(UTF16.getLeadSurrogate(c));
\r
411 // if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
\r
412 // /* c is a lead surrogate, get the real norm32 */
\r
413 // norm32=getNorm32FromSurrogatePair(norm32,UTF16.getTrailSurrogate(c));
\r
419 * get a norm32 from text with complete code points
\r
420 * (like from decompositions)
\r
422 private static long/*unsigned*/ getNorm32(char[] p,int start,
\r
423 int/*unsigned*/ mask) {
\r
424 long/*unsigned*/ norm32= getNorm32(p[start]);
\r
425 if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
\r
426 /* *p is a lead surrogate, get the real norm32 */
\r
427 norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
\r
431 public static VersionInfo getUnicodeVersion(){
\r
432 return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
\r
433 unicodeVersion[2], unicodeVersion[3]);
\r
435 public static char getFCD16(char c) {
\r
436 return FCDTrieImpl.fcdTrie.getLeadValue(c);
\r
439 public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
\r
440 /* the surrogate index in fcd16 is an absolute offset over the
\r
441 * start of stage 1
\r
443 return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
\r
445 public static int getFCD16(int c) {
\r
446 return FCDTrieImpl.fcdTrie.getCodePointValue(c);
\r
449 private static int getExtraDataIndex(long norm32) {
\r
450 return (int)(norm32>>EXTRA_SHIFT);
\r
453 private static final class DecomposeArgs{
\r
454 int /*unsigned byte*/ cc;
\r
455 int /*unsigned byte*/ trailCC;
\r
460 * get the canonical or compatibility decomposition for one character
\r
462 * @return index into the extraData array
\r
464 private static int/*index*/ decompose(long/*unsigned*/ norm32,
\r
465 int/*unsigned*/ qcMask,
\r
466 DecomposeArgs args) {
\r
467 int p= getExtraDataIndex(norm32);
\r
468 args.length=extraData[p++];
\r
470 if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
\r
471 /* use compatibility decomposition, skip canonical data */
\r
472 p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
\r
476 if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
\r
477 /* get the lead and trail cc's */
\r
478 char bothCCs=extraData[p++];
\r
479 args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
\r
480 args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
\r
482 /* lead and trail cc's are both 0 */
\r
483 args.cc=args.trailCC=0;
\r
486 args.length&=DECOMP_LENGTH_MASK;
\r
492 * get the canonical decomposition for one character
\r
493 * @return index into the extraData array
\r
495 private static int decompose(long/*unsigned*/ norm32,
\r
496 DecomposeArgs args) {
\r
498 int p= getExtraDataIndex(norm32);
\r
499 args.length=extraData[p++];
\r
501 if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
\r
502 /* get the lead and trail cc's */
\r
503 char bothCCs=extraData[p++];
\r
504 args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
\r
505 args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
\r
507 /* lead and trail cc's are both 0 */
\r
508 args.cc=args.trailCC=0;
\r
511 args.length&=DECOMP_LENGTH_MASK;
\r
516 private static final class NextCCArgs{
\r
525 * get the combining class of (c, c2)= args.source[args.next++]
\r
526 * before: args.next<args.limit after: args.next<=args.limit
\r
527 * if only one code unit is used, then c2==0
\r
529 private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
\r
530 long /*unsigned*/ norm32;
\r
532 args.c=args.source[args.next++];
\r
534 norm32= getNorm32(args.c);
\r
535 if((norm32 & CC_MASK)==0) {
\r
539 if(!isNorm32LeadSurrogate(norm32)) {
\r
542 /* c is a lead surrogate, get the real norm32 */
\r
543 if(args.next!=args.limit &&
\r
544 UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
\r
546 norm32=getNorm32FromSurrogatePair(norm32, args.c2);
\r
553 return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
\r
557 private static final class PrevArgs{
\r
566 * read backwards and get norm32
\r
567 * return 0 if the character is <minC
\r
568 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
\r
569 * surrogate but read second!)
\r
571 private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
\r
572 int/*unsigned*/ minC,
\r
573 int/*unsigned*/ mask) {
\r
574 long/*unsigned*/ norm32;
\r
576 args.c=args.src[--args.current];
\r
579 /* check for a surrogate before getting norm32 to see if we need to
\r
580 * predecrement further
\r
584 } else if(!UTF16.isSurrogate(args.c)) {
\r
585 return getNorm32(args.c);
\r
586 } else if(UTF16.isLeadSurrogate(args.c)) {
\r
587 /* unpaired first surrogate */
\r
589 } else if(args.current!=args.start &&
\r
590 UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
\r
592 norm32=getNorm32(args.c2);
\r
594 if((norm32&mask)==0) {
\r
595 /* all surrogate pairs with this lead surrogate have
\r
596 * only irrelevant data
\r
600 /* norm32 must be a surrogate special */
\r
601 return getNorm32FromSurrogatePair(norm32, args.c);
\r
604 /* unpaired second surrogate */
\r
611 * get the combining class of (c, c2)=*--p
\r
612 * before: start<p after: start<=p
\r
614 private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {
\r
616 return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
\r
617 CC_MASK)>>CC_SHIFT));
\r
621 * is this a safe boundary character for NF*D?
\r
624 public static boolean isNFDSafe(long/*unsigned*/ norm32,
\r
625 int/*unsigned*/ccOrQCMask,
\r
626 int/*unsigned*/ decompQCMask) {
\r
627 if((norm32&ccOrQCMask)==0) {
\r
628 return true; /* cc==0 and no decomposition: this is NF*D safe */
\r
631 /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
\r
632 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
\r
633 DecomposeArgs args=new DecomposeArgs();
\r
634 /* decomposes, get everything from the variable-length extra data */
\r
635 decompose(norm32, decompQCMask, args);
\r
638 /* no decomposition (or Hangul), test the cc directly */
\r
639 return (norm32&CC_MASK)==0;
\r
644 * is this (or does its decomposition begin with) a "true starter"?
\r
645 * (cc==0 and NF*C_YES)
\r
647 public static boolean isTrueStarter(long/*unsigned*/ norm32,
\r
648 int/*unsigned*/ ccOrQCMask,
\r
649 int/*unsigned*/ decompQCMask) {
\r
650 if((norm32&ccOrQCMask)==0) {
\r
651 return true; /* this is a true starter (could be Hangul or Jamo L)*/
\r
654 /* inspect its decomposition - not a Hangul or a surrogate here */
\r
655 if((norm32&decompQCMask)!=0) {
\r
656 int p; /* index into extra data array */
\r
657 DecomposeArgs args=new DecomposeArgs();
\r
658 /* decomposes, get everything from the variable-length extra data */
\r
659 p=decompose(norm32, decompQCMask, args);
\r
662 int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;
\r
664 /* does it begin with NFC_YES? */
\r
665 if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
\r
666 /* yes, the decomposition begins with a true starter */
\r
674 /* reorder UTF-16 in-place ---------------------------------------------- */
\r
677 * simpler, single-character version of mergeOrdered() -
\r
678 * bubble-insert one single code point into the preceding string
\r
679 * which is already canonically ordered
\r
680 * (c, c2) may or may not yet have been inserted at src[current]..src[p]
\r
682 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
\r
684 * before: src[start]..src[current] is already ordered, and
\r
685 * src[current]..src[p] may or may not hold (c, c2) but
\r
686 * must be exactly the same length as (c, c2)
\r
687 * after: src[start]..src[p] is ordered
\r
689 * @return the trailing combining class
\r
691 private static int/*unsigned byte*/ insertOrdered(char[] source,
\r
693 int current, int p,
\r
695 int/*unsigned byte*/ cc) {
\r
698 int prevCC, trailCC=cc;
\r
700 if(start<current && cc!=0) {
\r
701 // search for the insertion point where cc>=prevCC
\r
702 preBack=back=current;
\r
703 PrevArgs prevArgs = new PrevArgs();
\r
704 prevArgs.current = current;
\r
705 prevArgs.start = start;
\r
706 prevArgs.src = source;
\r
708 prevCC=getPrevCC(prevArgs);
\r
709 preBack = prevArgs.current;
\r
712 // this will be the last code point, so keep its cc
\r
715 while(start<preBack) {
\r
716 prevCC=getPrevCC(prevArgs);
\r
717 preBack=prevArgs.current;
\r
725 // this is where we are right now with all these indicies:
\r
726 // [start]..[pPreBack] 0..? code points that we can ignore
\r
727 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
\r
728 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
\r
729 // [current]..[p] 1 code point (c, c2) with cc
\r
731 // move the code units in between up
\r
734 source[--r]=source[--current];
\r
735 } while(back!=current);
\r
742 source[(current+1)]=c2;
\r
745 // we know the cc of the last code point
\r
750 * merge two UTF-16 string parts together
\r
751 * to canonically order (order by combining classes) their concatenation
\r
753 * the two strings may already be adjacent, so that the merging is done
\r
754 * in-place if the two strings are not adjacent, then the buffer holding the
\r
755 * first one must be large enough
\r
756 * the second string may or may not be ordered in itself
\r
758 * before: [start]..[current] is already ordered, and
\r
759 * [next]..[limit] may be ordered in itself, but
\r
760 * is not in relation to [start..current[
\r
761 * after: [start..current+(limit-next)[ is ordered
\r
763 * the algorithm is a simple bubble-sort that takes the characters from
\r
764 * src[next++] and inserts them in correct combining class order into the
\r
765 * preceding part of the string
\r
767 * since this function is called much less often than the single-code point
\r
768 * insertOrdered(), it just uses that for easier maintenance
\r
770 * @return the trailing combining class
\r
772 private static int /*unsigned byte*/ mergeOrdered(char[] source,
\r
778 boolean isOrdered) {
\r
780 int /*unsigned byte*/ cc, trailCC=0;
\r
783 adjacent= current==next;
\r
784 NextCCArgs ncArgs = new NextCCArgs();
\r
785 ncArgs.source = data;
\r
786 ncArgs.next = next;
\r
787 ncArgs.limit = limit;
\r
789 if(start!=current || !isOrdered) {
\r
791 while(ncArgs.next<ncArgs.limit) {
\r
792 cc=getNextCC(ncArgs);
\r
794 // does not bubble back
\r
797 current=ncArgs.next;
\r
799 data[current++]=ncArgs.c;
\r
801 data[current++]=ncArgs.c2;
\r
810 r=current+(ncArgs.c2==0 ? 1 : 2);
\r
811 trailCC=insertOrdered(source,start, current, r,
\r
812 ncArgs.c, ncArgs.c2, cc);
\r
818 if(ncArgs.next==ncArgs.limit) {
\r
819 // we know the cc of the last code point
\r
823 // copy the second string part
\r
825 source[current++]=data[ncArgs.next++];
\r
826 } while(ncArgs.next!=ncArgs.limit);
\r
827 ncArgs.limit=current;
\r
829 PrevArgs prevArgs = new PrevArgs();
\r
830 prevArgs.src = data;
\r
831 prevArgs.start = start;
\r
832 prevArgs.current = ncArgs.limit;
\r
833 return getPrevCC(prevArgs);
\r
837 private static int /*unsigned byte*/ mergeOrdered(char[] source,
\r
843 return mergeOrdered(source,start,current,data,next,limit,true);
\r
848 public static boolean checkFCD(char[] src,int srcStart, int srcLimit,
\r
853 int i =srcStart, length = srcLimit;
\r
859 } else if((c=src[i++])<MIN_WITH_LEAD_CC) {
\r
861 } else if((fcd16=getFCD16(c))==0) {
\r
868 // check one above-minimum, relevant code unit
\r
869 if(UTF16.isLeadSurrogate(c)) {
\r
870 // c is a lead surrogate, get the real fcd16
\r
871 if(i!=length && UTF16.isTrailSurrogate(c2=src[i])) {
\r
873 fcd16=getFCD16FromSurrogatePair(fcd16, c2);
\r
882 if(nx_contains(nx, c, c2)) {
\r
883 prevCC=0; /* excluded: fcd16==0 */
\r
887 // prevCC has values from the following ranges:
\r
888 // 0..0xff -the previous trail combining class
\r
889 // <0 -the negative value of the previous code unit;
\r
890 // that code unit was <MIN_WITH_LEAD_CC and its getFCD16()
\r
891 // was deferred so that average text is checked faster
\r
894 // check the combining order
\r
895 cc=(int)(fcd16>>8);
\r
898 // the previous character was <_NORM_MIN_WITH_LEAD_CC,
\r
899 // we need to get its trail cc
\r
901 if(!nx_contains(nx, (int)-prevCC)) {
\r
902 prevCC=(int)(FCDTrieImpl.fcdTrie.getBMPValue(
\r
903 (char)-prevCC)&0xff
\r
906 prevCC=0; /* excluded: fcd16==0 */
\r
915 prevCC=(int)(fcd16&0xff);
\r
919 public static Normalizer.QuickCheckResult quickCheck(char[] src,
\r
925 boolean allowMaybe,
\r
933 Normalizer.QuickCheckResult result;
\r
934 ComposePartArgs args = new ComposePartArgs();
\r
936 int start = srcStart;
\r
938 if(!isDataLoaded) {
\r
939 return Normalizer.MAYBE;
\r
942 ccOrQCMask=CC_MASK|qcMask;
\r
943 result=Normalizer.YES;
\r
948 if(srcStart==srcLimit) {
\r
950 } else if((c=src[srcStart++])>=minNoMaybe &&
\r
951 (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
\r
958 // check one above-minimum, relevant code unit
\r
959 if(isNorm32LeadSurrogate(norm32)) {
\r
960 // c is a lead surrogate, get the real norm32
\r
961 if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
\r
963 norm32=getNorm32FromSurrogatePair(norm32,c2);
\r
971 if(nx_contains(nx, c, c2)) {
\r
972 /* excluded: norm32==0 */
\r
976 // check the combining order
\r
977 cc=(char)((norm32>>CC_SHIFT)&0xFF);
\r
978 if(cc!=0 && cc<prevCC) {
\r
979 return Normalizer.NO;
\r
983 // check for "no" or "maybe" quick check flags
\r
984 qcNorm32 = norm32 & qcMask;
\r
985 if((qcNorm32& QC_ANY_NO)>=1) {
\r
986 result= Normalizer.NO;
\r
988 } else if(qcNorm32!=0) {
\r
989 // "maybe" can only occur for NFC and NFKC
\r
991 result=Normalizer.MAYBE;
\r
993 // normalize a section around here to see if it is really
\r
994 // normalized or not
\r
996 int/*unsigned*/ decompQCMask;
\r
998 decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask
\r
1000 // find the previous starter
\r
1002 // set prevStarter to the beginning of the current character
\r
1003 prevStarter=srcStart-1;
\r
1004 if(UTF16.isTrailSurrogate(src[prevStarter])) {
\r
1005 // safe because unpaired surrogates do not result
\r
1010 prevStarter=findPreviousStarter(src, start, prevStarter,
\r
1011 ccOrQCMask, decompQCMask,
\r
1012 (char)minNoMaybe);
\r
1014 // find the next true starter in [src..limit[ - modifies
\r
1015 // src to point to the next starter
\r
1016 srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
\r
1017 decompQCMask,(char) minNoMaybe);
\r
1019 //set the args for compose part
\r
1020 args.prevCC = prevCC;
\r
1022 // decompose and recompose [prevStarter..src[
\r
1023 buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
\r
1025 // compare the normalized version with the original
\r
1026 if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
\r
1027 result=Normalizer.NO; // normalization differs
\r
1031 // continue after the next starter
\r
1039 //------------------------------------------------------
\r
1040 // make NFD & NFKD
\r
1041 //------------------------------------------------------
\r
1042 public static int getDecomposition(int c /*UTF-32*/ ,
\r
1046 int destCapacity) {
\r
1048 if( (UNSIGNED_INT_MASK & c)<=0x10ffff) {
\r
1049 long /*unsigned*/ norm32;
\r
1056 minNoMaybe=(int)indexes[INDEX_MIN_NFD_NO_MAYBE];
\r
1059 minNoMaybe=(int)indexes[INDEX_MIN_NFKD_NO_MAYBE];
\r
1063 if(c<minNoMaybe) {
\r
1065 if(destCapacity>0) {
\r
1072 norm32=getNorm32(c);
\r
1073 if((norm32&qcMask)==0) {
\r
1074 /* simple case: no decomposition */
\r
1076 if(destCapacity>0) {
\r
1081 if(destCapacity>=2) {
\r
1082 dest[0]=UTF16.getLeadSurrogate(c);
\r
1083 dest[1]=UTF16.getTrailSurrogate(c);
\r
1087 } else if(isNorm32HangulOrJamo(norm32)) {
\r
1088 /* Hangul syllable: decompose algorithmically */
\r
1093 c2=(char)(c%JAMO_T_COUNT);
\r
1096 if(destCapacity>=3) {
\r
1097 dest[2]=(char)(JAMO_T_BASE+c2);
\r
1104 if(destCapacity>=2) {
\r
1105 dest[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
\r
1106 dest[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
\r
1110 /* c decomposes, get everything from the variable-length extra
\r
1114 DecomposeArgs args = new DecomposeArgs();
\r
1115 /* the index into extra data array*/
\r
1116 p=decompose(norm32, qcMask, args);
\r
1117 if(args.length<=destCapacity) {
\r
1118 limit=p+args.length;
\r
1120 dest[destStart++]=extraData[p++];
\r
1123 return args.length;
\r
1131 public static int decompose(char[] src,int srcStart,int srcLimit,
\r
1132 char[] dest,int destStart,int destLimit,
\r
1133 boolean compat,int[] outTrailCC,
\r
1136 char[] buffer = new char[3];
\r
1139 int ccOrQCMask, qcMask;
\r
1140 int reorderStartIndex, length;
\r
1141 char c, c2, minNoMaybe;
\r
1142 int/*unsigned byte*/ cc, prevCC, trailCC;
\r
1145 int destIndex = destStart;
\r
1146 int srcIndex = srcStart;
\r
1148 minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
\r
1151 minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
\r
1156 ccOrQCMask=CC_MASK|qcMask;
\r
1157 reorderStartIndex=0;
\r
1163 cc=trailCC=-1;//initialize to bogus value
\r
1166 /* count code units below the minimum or with irrelevant data for
\r
1167 * the quick check
\r
1171 while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
\r
1172 ((norm32=getNorm32(c))&ccOrQCMask)==0)){
\r
1177 /* copy these code units all at once */
\r
1178 if(srcIndex!=prevSrc) {
\r
1179 length=(int)(srcIndex-prevSrc);
\r
1180 if((destIndex+length)<=destLimit) {
\r
1181 System.arraycopy(src,prevSrc,dest,destIndex,length);
\r
1184 destIndex+=length;
\r
1185 reorderStartIndex=destIndex;
\r
1188 /* end of source reached? */
\r
1189 if(srcIndex==srcLimit) {
\r
1193 /* c already contains *src and norm32 is set for it, increment src*/
\r
1196 /* check one above-minimum, relevant code unit */
\r
1198 * generally, set p and length to the decomposition string
\r
1199 * in simple cases, p==NULL and (c, c2) will hold the length code
\r
1200 * units to append in all cases, set cc to the lead and trailCC to
\r
1201 * the trail combining class
\r
1203 * the following merge-sort of the current character into the
\r
1204 * preceding, canonically ordered result text will use the
\r
1205 * optimized insertOrdered()
\r
1206 * if there is only one single code point to process;
\r
1207 * this is indicated with p==NULL, and (c, c2) is the character to
\r
1209 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
\r
1210 * for a supplementary character)
\r
1211 * otherwise, p[length] is merged in with _mergeOrdered()
\r
1213 if(isNorm32HangulOrJamo(norm32)) {
\r
1214 if(nx_contains(nx, c)) {
\r
1219 // Hangul syllable: decompose algorithmically
\r
1226 c2=(char)(c%JAMO_T_COUNT);
\r
1229 buffer[2]=(char)(JAMO_T_BASE+c2);
\r
1235 buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
\r
1236 buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
\r
1239 if(isNorm32Regular(norm32)) {
\r
1243 // c is a lead surrogate, get the real norm32
\r
1244 if(srcIndex!=srcLimit &&
\r
1245 UTF16.isTrailSurrogate(c2=src[srcIndex])) {
\r
1248 norm32=getNorm32FromSurrogatePair(norm32, c2);
\r
1256 /* get the decomposition and the lead and trail cc's */
\r
1257 if(nx_contains(nx, c, c2)) {
\r
1258 /* excluded: norm32==0 */
\r
1261 } else if((norm32&qcMask)==0) {
\r
1262 /* c does not decompose */
\r
1263 cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
\r
1267 DecomposeArgs arg = new DecomposeArgs();
\r
1268 /* c decomposes, get everything from the variable-length
\r
1271 pStart=decompose(norm32, qcMask, arg);
\r
1273 length=arg.length;
\r
1275 trailCC=arg.trailCC;
\r
1277 /* fastpath a single code unit from decomposition */
\r
1286 /* append the decomposition to the destination buffer, assume
\r
1289 if((destIndex+length)<=destLimit) {
\r
1290 int reorderSplit=destIndex;
\r
1292 /* fastpath: single code point */
\r
1293 if(cc!=0 && cc<prevCC) {
\r
1294 /* (c, c2) is out of order with respect to the preceding
\r
1297 destIndex+=length;
\r
1298 trailCC=insertOrdered(dest,reorderStartIndex,
\r
1299 reorderSplit, destIndex, c, c2, cc);
\r
1301 /* just append (c, c2) */
\r
1302 dest[destIndex++]=c;
\r
1304 dest[destIndex++]=c2;
\r
1308 /* general: multiple code points (ordered by themselves)
\r
1309 * from decomposition
\r
1311 if(cc!=0 && cc<prevCC) {
\r
1312 /* the decomposition is out of order with respect to the
\r
1315 destIndex+=length;
\r
1316 trailCC=mergeOrdered(dest,reorderStartIndex,
\r
1317 reorderSplit,p, pStart,pStart+length);
\r
1319 /* just append the decomposition */
\r
1321 dest[destIndex++]=p[pStart++];
\r
1322 } while(--length>0);
\r
1326 /* buffer overflow */
\r
1327 /* keep incrementing the destIndex for preflighting */
\r
1328 destIndex+=length;
\r
1333 reorderStartIndex=destIndex;
\r
1337 outTrailCC[0]=prevCC;
\r
1339 return destIndex - destStart;
\r
1342 /* make NFC & NFKC ------------------------------------------------------ */
\r
1343 private static final class NextCombiningArgs{
\r
1349 int/*unsigned*/ combiningIndex;
\r
1350 char /*unsigned byte*/ cc;
\r
1353 /* get the composition properties of the next character */
\r
1354 private static int /*unsigned*/ getNextCombining(NextCombiningArgs args,
\r
1357 long/*unsigned*/ norm32;
\r
1359 /* get properties */
\r
1360 args.c=args.source[args.start++];
\r
1361 norm32=getNorm32(args.c);
\r
1363 /* preset output values for most characters */
\r
1365 args.combiningIndex=0;
\r
1368 if((norm32&(CC_MASK|COMBINES_ANY))==0) {
\r
1371 if(isNorm32Regular(norm32)) {
\r
1372 /* set cc etc. below */
\r
1373 } else if(isNorm32HangulOrJamo(norm32)) {
\r
1374 /* a compatibility decomposition contained Jamos */
\r
1375 args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
\r
1376 (norm32>>EXTRA_SHIFT)));
\r
1377 return (int)(norm32&COMBINES_ANY);
\r
1379 /* c is a lead surrogate, get the real norm32 */
\r
1380 if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
\r
1381 args.source[args.start])) {
\r
1383 norm32=getNorm32FromSurrogatePair(norm32, args.c2);
\r
1390 if(nx_contains(nx, args.c, args.c2)) {
\r
1391 return 0; /* excluded: norm32==0 */
\r
1394 args.cc= (char)((norm32>>CC_SHIFT)&0xff);
\r
1396 combineFlags=(int)(norm32&COMBINES_ANY);
\r
1397 if(combineFlags!=0) {
\r
1398 int index = getExtraDataIndex(norm32);
\r
1399 args.combiningIndex=index>0 ? extraData[(index-1)] :0;
\r
1402 return combineFlags;
\r
1407 * given a composition-result starter (c, c2) - which means its cc==0,
\r
1408 * it combines forward, it has extra data, its norm32!=0,
\r
1409 * it is not a Hangul or Jamo,
\r
1410 * get just its combineFwdIndex
\r
1412 * norm32(c) is special if and only if c2!=0
\r
1414 private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
\r
1415 long/*unsigned*/ norm32;
\r
1417 norm32=getNorm32(c);
\r
1419 norm32=getNorm32FromSurrogatePair(norm32, c2);
\r
1421 return extraData[(getExtraDataIndex(norm32)-1)];
\r
1425 * Find the recomposition result for
\r
1426 * a forward-combining character
\r
1427 * (specified with a pointer to its part of the combiningTable[])
\r
1428 * and a backward-combining character
\r
1429 * (specified with its combineBackIndex).
\r
1431 * If these two characters combine, then set (value, value2)
\r
1432 * with the code unit(s) of the composition character.
\r
1435 * 0 do not combine
\r
1437 * >1 combine, and the composition is a forward-combining starter
\r
1439 * See unormimp.h for a description of the composition table format.
\r
1441 private static int/*unsigned*/ combine(char[]table,int tableStart,
\r
1442 int/*unsinged*/ combineBackIndex,
\r
1443 int[] outValues) {
\r
1444 int/*unsigned*/ key;
\r
1447 if(outValues.length<2){
\r
1448 throw new IllegalArgumentException();
\r
1451 /* search in the starter's composition table */
\r
1453 key=table[tableStart++];
\r
1454 if(key>=combineBackIndex) {
\r
1457 tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
\r
1460 /* mask off bit 15, the last-entry-in-the-list flag */
\r
1461 if((key&0x7fff)==combineBackIndex) {
\r
1462 /* found! combine! */
\r
1463 value=table[tableStart];
\r
1465 /* is the composition a starter that combines forward? */
\r
1466 key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));
\r
1468 /* get the composition result code point from the variable-length
\r
1471 if((value&0x8000) != 0) {
\r
1472 if((value&0x4000) != 0) {
\r
1473 /* surrogate pair composition result */
\r
1474 value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
\r
1475 value2=table[tableStart+1];
\r
1477 /* BMP composition result U+2000..U+ffff */
\r
1478 value=table[tableStart+1];
\r
1482 /* BMP composition result U+0000..U+1fff */
\r
1486 outValues[0]=value;
\r
1487 outValues[1]=value2;
\r
1496 private static final class RecomposeArgs{
\r
1502 * recompose the characters in [p..limit[
\r
1503 * (which is in NFD - decomposed and canonically ordered),
\r
1504 * adjust limit, and return the trailing cc
\r
1506 * since for NFKC we may get Jamos in decompositions, we need to
\r
1507 * recompose those too
\r
1509 * note that recomposition never lengthens the text:
\r
1510 * any character consists of either one or two code units;
\r
1511 * a composition may contain at most one more code unit than the original
\r
1512 * starter, while the combining mark that is removed has at least one code
\r
1515 private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
\r
1517 int /*unsigned*/ combineFlags;
\r
1518 int /*unsigned*/ combineFwdIndex, combineBackIndex;
\r
1519 int /*unsigned*/ result, value=0, value2=0;
\r
1520 int /*unsigned byte*/ prevCC;
\r
1521 boolean starterIsSupplementary;
\r
1523 int[] outValues = new int[2];
\r
1524 starter=-1; /* no starter */
\r
1525 combineFwdIndex=0; /* will not be used until starter!=NULL */
\r
1526 starterIsSupplementary=false; /* will not be used until starter!=NULL */
\r
1529 NextCombiningArgs ncArg = new NextCombiningArgs();
\r
1530 ncArg.source = args.source;
\r
1536 ncArg.start = args.start;
\r
1537 combineFlags=getNextCombining(ncArg,args.limit,nx);
\r
1538 combineBackIndex=ncArg.combiningIndex;
\r
1539 args.start = ncArg.start;
\r
1541 if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
\r
1542 if((combineBackIndex&0x8000)!=0) {
\r
1543 /* c is a Jamo V/T, see if we can compose it with the
\r
1544 * previous character
\r
1546 /* for the PRI #29 fix, check that there is no intervening combining mark */
\r
1547 if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
\r
1548 remove=-1; /* NULL while no Hangul composition */
\r
1550 ncArg.c2=args.source[starter];
\r
1551 if(combineBackIndex==0xfff2) {
\r
1552 /* Jamo V, compose with previous Jamo L and following
\r
1555 ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
\r
1556 if(ncArg.c2<JAMO_L_COUNT) {
\r
1557 remove=args.start-1;
\r
1558 ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
\r
1559 (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
\r
1560 if(args.start!=args.limit &&
\r
1561 (ncArg.c2=(char)(args.source[args.start]
\r
1562 -JAMO_T_BASE))<JAMO_T_COUNT) {
\r
1564 ncArg.c+=ncArg.c2;
\r
1566 /* the result is an LV syllable, which is a starter (unlike LVT) */
\r
1567 combineFlags=COMBINES_FWD;
\r
1569 if(!nx_contains(nx, ncArg.c)) {
\r
1570 args.source[starter]=ncArg.c;
\r
1573 if(!isHangulWithoutJamoT(ncArg.c)) {
\r
1574 --args.start; /* undo the ++args.start from reading the Jamo T */
\r
1576 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
\r
1577 remove=args.start;
\r
1582 * Normally, the following can not occur:
\r
1583 * Since the input is in NFD, there are no Hangul LV syllables that
\r
1584 * a Jamo T could combine with.
\r
1585 * All Jamo Ts are combined above when handling Jamo Vs.
\r
1587 * However, before the PRI #29 fix, this can occur due to
\r
1588 * an intervening combining mark between the Hangul LV and the Jamo T.
\r
1591 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
\r
1592 if(isHangulWithoutJamoT(ncArg.c2)) {
\r
1593 ncArg.c2+=ncArg.c-JAMO_T_BASE;
\r
1594 if(!nx_contains(nx, ncArg.c2)) {
\r
1595 remove=args.start-1;
\r
1596 args.source[starter]=ncArg.c2;
\r
1602 /* remove the Jamo(s) */
\r
1605 while(r<args.limit) {
\r
1606 args.source[q++]=args.source[r++];
\r
1608 args.start=remove;
\r
1612 ncArg.c2=0; /* c2 held *starter temporarily */
\r
1614 if(combineFlags!=0) {
\r
1616 * not starter=NULL because the composition is a Hangul LV syllable
\r
1617 * and might combine once more (but only before the PRI #29 fix)
\r
1621 if(args.start==args.limit) {
\r
1622 return (char)prevCC;
\r
1625 /* the composition is a Hangul LV syllable which is a starter that combines forward */
\r
1626 combineFwdIndex=0xfff0;
\r
1628 /* we combined; continue with looking for compositions */
\r
1634 * now: cc==0 and the combining index does not include
\r
1635 * "forward" -> the rest of the loop body will reset starter
\r
1636 * to NULL; technically, a composed Hangul syllable is a
\r
1637 * starter, but it does not combine forward now that we have
\r
1638 * consumed all eligible Jamos; for Jamo V/T, combineFlags
\r
1639 * does not contain _NORM_COMBINES_FWD
\r
1643 /* the starter is not a Hangul LV or Jamo V/T and */
\r
1644 !((combineFwdIndex&0x8000)!=0) &&
\r
1645 /* the combining mark is not blocked and */
\r
1646 ((options&BEFORE_PRI_29)!=0 ?
\r
1647 (prevCC!=ncArg.cc || prevCC==0) :
\r
1648 (prevCC<ncArg.cc || prevCC==0)) &&
\r
1649 /* the starter and the combining mark (c, c2) do combine */
\r
1650 0!=(result=combine(combiningTable,combineFwdIndex,
\r
1651 combineBackIndex, outValues)) &&
\r
1652 /* the composition result is not excluded */
\r
1653 !nx_contains(nx, (char)value, (char)value2)
\r
1655 value=outValues[0];
\r
1656 value2=outValues[1];
\r
1657 /* replace the starter with the composition, remove the
\r
1660 remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
\r
1662 /* replace the starter with the composition */
\r
1663 args.source[starter]=(char)value;
\r
1664 if(starterIsSupplementary) {
\r
1666 /* both are supplementary */
\r
1667 args.source[starter+1]=(char)value2;
\r
1669 /* the composition is shorter than the starter,
\r
1670 * move the intermediate characters forward one */
\r
1671 starterIsSupplementary=false;
\r
1675 args.source[q++]=args.source[r++];
\r
1679 } else if(value2!=0) {
\r
1680 /* the composition is longer than the starter,
\r
1681 * move the intermediate characters back one */
\r
1682 starterIsSupplementary=true;
\r
1683 /* temporarily increment for the loop boundary */
\r
1687 while(starter<q) {
\r
1688 args.source[--r]=args.source[--q];
\r
1690 args.source[starter]=(char)value2;
\r
1691 --starter; /* undo the temporary increment */
\r
1692 /* } else { both are on the BMP, nothing more to do */
\r
1695 /* remove the combining mark by moving the following text
\r
1697 if(remove<args.start) {
\r
1700 while(r<args.limit) {
\r
1701 args.source[q++]=args.source[r++];
\r
1703 args.start=remove;
\r
1707 /* keep prevCC because we removed the combining mark */
\r
1710 if(args.start==args.limit) {
\r
1711 return (char)prevCC;
\r
1714 /* is the composition a starter that combines forward? */
\r
1716 combineFwdIndex=getCombiningIndexFromStarter((char)value,
\r
1722 /* we combined; continue with looking for compositions */
\r
1727 /* no combination this time */
\r
1729 if(args.start==args.limit) {
\r
1730 return (char)prevCC;
\r
1733 /* if (c, c2) did not combine, then check if it is a starter */
\r
1735 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
\r
1736 if((combineFlags&COMBINES_FWD)!=0) {
\r
1737 /* it may combine with something, prepare for it */
\r
1739 starterIsSupplementary=false;
\r
1740 starter=args.start-1;
\r
1742 starterIsSupplementary=false;
\r
1743 starter=args.start-2;
\r
1745 combineFwdIndex=combineBackIndex;
\r
1747 /* it will not combine with anything */
\r
1750 } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
\r
1751 /* FCC: no discontiguous compositions; any intervening character blocks */
\r
1757 // find the last true starter between src[start]....src[current] going
\r
1758 // backwards and return its index
\r
1759 private static int findPreviousStarter(char[]src, int srcStart, int current,
\r
1760 int/*unsigned*/ ccOrQCMask,
\r
1761 int/*unsigned*/ decompQCMask,
\r
1762 char minNoMaybe) {
\r
1764 PrevArgs args = new PrevArgs();
\r
1766 args.start = srcStart;
\r
1767 args.current = current;
\r
1769 while(args.start<args.current) {
\r
1770 norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
\r
1771 if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
\r
1775 return args.current;
\r
1778 /* find the first true starter in [src..limit[ and return the
\r
1781 private static int/*index*/ findNextStarter(char[] src,int start,int limit,
\r
1782 int/*unsigned*/ qcMask,
\r
1783 int/*unsigned*/ decompQCMask,
\r
1784 char minNoMaybe) {
\r
1786 long/*unsigned*/ norm32;
\r
1790 ccOrQCMask=CC_MASK|qcMask;
\r
1792 DecomposeArgs decompArgs = new DecomposeArgs();
\r
1795 if(start==limit) {
\r
1796 break; /* end of string */
\r
1799 if(c<minNoMaybe) {
\r
1800 break; /* catches NUL terminater, too */
\r
1803 norm32=getNorm32(c);
\r
1804 if((norm32&ccOrQCMask)==0) {
\r
1805 break; /* true starter */
\r
1808 if(isNorm32LeadSurrogate(norm32)) {
\r
1809 /* c is a lead surrogate, get the real norm32 */
\r
1810 if((start+1)==limit ||
\r
1811 !UTF16.isTrailSurrogate(c2=(src[start+1]))){
\r
1812 /* unmatched first surrogate: counts as a true starter */
\r
1815 norm32=getNorm32FromSurrogatePair(norm32, c2);
\r
1817 if((norm32&ccOrQCMask)==0) {
\r
1818 break; /* true starter */
\r
1824 /* (c, c2) is not a true starter but its decomposition may be */
\r
1825 if((norm32&decompQCMask)!=0) {
\r
1826 /* (c, c2) decomposes, get everything from the variable-length
\r
1828 p=decompose(norm32, decompQCMask, decompArgs);
\r
1830 /* get the first character's norm32 to check if it is a true
\r
1832 if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
\r
1833 break; /* true starter */
\r
1837 start+= c2==0 ? 1 : 2; /* not a true starter, continue */
\r
1844 private static final class ComposePartArgs{
\r
1846 int length; /* length of decomposed part */
\r
1849 /* decompose and recompose [prevStarter..src[ */
\r
1850 private static char[] composePart(ComposePartArgs args,
\r
1852 char[] src, int start, int limit,
\r
1855 int recomposeLimit;
\r
1856 boolean compat =((options&OPTIONS_COMPAT)!=0);
\r
1858 /* decompose [prevStarter..src[ */
\r
1859 int[] outTrailCC = new int[1];
\r
1860 char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];
\r
1863 args.length=decompose(src,prevStarter,(start),
\r
1864 buffer,0,buffer.length,
\r
1865 compat,outTrailCC,nx);
\r
1866 if(args.length<=buffer.length){
\r
1869 buffer = new char[args.length];
\r
1873 /* recompose the decomposition */
\r
1874 recomposeLimit=args.length;
\r
1876 if(args.length>=2) {
\r
1877 RecomposeArgs rcArgs = new RecomposeArgs();
\r
1878 rcArgs.source = buffer;
\r
1880 rcArgs.limit = recomposeLimit;
\r
1881 args.prevCC=recompose(rcArgs, options, nx);
\r
1882 recomposeLimit = rcArgs.limit;
\r
1885 /* return with a pointer to the recomposition and its length */
\r
1886 args.length=recomposeLimit;
\r
1890 private static boolean composeHangul(char prev, char c,
\r
1891 long/*unsigned*/ norm32,
\r
1892 char[] src,int[] srcIndex, int limit,
\r
1894 char[] dest,int destIndex,
\r
1896 int start=srcIndex[0];
\r
1897 if(isJamoVTNorm32JamoV(norm32)) {
\r
1898 /* c is a Jamo V, compose with previous Jamo L and
\r
1899 * following Jamo T */
\r
1900 prev=(char)(prev-JAMO_L_BASE);
\r
1901 if(prev<JAMO_L_COUNT) {
\r
1902 c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
\r
1903 (c-JAMO_V_BASE))*JAMO_T_COUNT);
\r
1905 /* check if the next character is a Jamo T (normal or
\r
1906 * compatibility) */
\r
1907 if(start!=limit) {
\r
1911 if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
\r
1912 /* normal Jamo T */
\r
1915 } else if(compat) {
\r
1916 /* if NFKC, then check for compatibility Jamo T
\r
1918 norm32=getNorm32(next);
\r
1919 if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
\r
1920 int p /*index into extra data array*/;
\r
1921 DecomposeArgs dcArgs = new DecomposeArgs();
\r
1922 p=decompose(norm32, QC_NFKD, dcArgs);
\r
1923 if(dcArgs.length==1 &&
\r
1924 (t=(char)(extraData[p]-JAMO_T_BASE))
\r
1926 /* compatibility Jamo T */
\r
1933 if(nx_contains(nx, c)) {
\r
1934 if(!isHangulWithoutJamoT(c)) {
\r
1935 --start; /* undo ++start from reading the Jamo T */
\r
1939 dest[destIndex]=c;
\r
1940 srcIndex[0]=start;
\r
1943 } else if(isHangulWithoutJamoT(prev)) {
\r
1944 /* c is a Jamo T, compose with previous Hangul LV that does not
\r
1945 * contain a Jamo T */
\r
1946 c=(char)(prev+(c-JAMO_T_BASE));
\r
1947 if(nx_contains(nx, c)) {
\r
1950 dest[destIndex]=c;
\r
1951 srcIndex[0]=start;
\r
1957 public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
\r
1958 return compose(src,0,src.length,dest,0,dest.length,compat, nx);
\r
1962 public static int compose(char[] src, int srcStart, int srcLimit,
\r
1963 char[] dest,int destStart,int destLimit,
\r
1964 int options,UnicodeSet nx) {
\r
1966 int prevSrc, prevStarter;
\r
1967 long/*unsigned*/ norm32;
\r
1968 int ccOrQCMask, qcMask;
\r
1969 int reorderStartIndex, length;
\r
1970 char c, c2, minNoMaybe;
\r
1971 int/*unsigned byte*/ cc, prevCC;
\r
1972 int[] ioIndex = new int[1];
\r
1973 int destIndex = destStart;
\r
1974 int srcIndex = srcStart;
\r
1976 if((options&OPTIONS_COMPAT)!=0) {
\r
1977 minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
\r
1980 minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
\r
1985 * prevStarter points to the last character before the current one
\r
1986 * that is a "true" starter with cc==0 and quick check "yes".
\r
1988 * prevStarter will be used instead of looking for a true starter
\r
1989 * while incrementally decomposing [prevStarter..prevSrc[
\r
1990 * in _composePart(). Having a good prevStarter allows to just decompose
\r
1991 * the entire [prevStarter..prevSrc[.
\r
1993 * When _composePart() backs out from prevSrc back to prevStarter,
\r
1994 * then it also backs out destIndex by the same amount.
\r
1995 * Therefore, at all times, the (prevSrc-prevStarter) source units
\r
1996 * must correspond 1:1 to destination units counted with destIndex,
\r
1997 * except for reordering.
\r
1998 * This is true for the qc "yes" characters copied in the fast loop,
\r
1999 * and for pure reordering.
\r
2000 * prevStarter must be set forward to src when this is not true:
\r
2001 * In _composePart() and after composing a Hangul syllable.
\r
2003 * This mechanism relies on the assumption that the decomposition of a
\r
2004 * true starter also begins with a true starter. gennorm/store.c checks
\r
2007 prevStarter=srcIndex;
\r
2009 ccOrQCMask=CC_MASK|qcMask;
\r
2010 /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
\r
2013 /* avoid compiler warnings */
\r
2018 /* count code units below the minimum or with irrelevant data for
\r
2019 * the quick check */
\r
2022 while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
\r
2023 ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
\r
2029 /* copy these code units all at once */
\r
2030 if(srcIndex!=prevSrc) {
\r
2031 length=(int)(srcIndex-prevSrc);
\r
2032 if((destIndex+length)<=destLimit) {
\r
2033 System.arraycopy(src,prevSrc,dest,destIndex,length);
\r
2035 destIndex+=length;
\r
2036 reorderStartIndex=destIndex;
\r
2038 /* set prevStarter to the last character in the quick check
\r
2040 prevStarter=srcIndex-1;
\r
2041 if(UTF16.isTrailSurrogate(src[prevStarter]) &&
\r
2042 prevSrc<prevStarter &&
\r
2043 UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
\r
2050 /* end of source reached? */
\r
2051 if(srcIndex==srcLimit) {
\r
2055 /* c already contains *src and norm32 is set for it, increment src*/
\r
2059 * source buffer pointers:
\r
2061 * all done quick check current char not yet
\r
2062 * "yes" but (c, c2) processed
\r
2065 * [-------------[-------------[-------------[-------------[
\r
2067 * start prevStarter prevSrc src limit
\r
2070 * destination buffer pointers and indexes:
\r
2072 * all done might take not filled yet
\r
2075 * [-------------[-------------[-------------[
\r
2077 * dest reorderStartIndex destIndex destCapacity
\r
2080 /* check one above-minimum, relevant code unit */
\r
2082 * norm32 is for c=*(src-1), and the quick check flag is "no" or
\r
2083 * "maybe", and/or cc!=0
\r
2084 * check for Jamo V/T, then for surrogates and regular characters
\r
2085 * c is not a Hangul syllable or Jamo L because
\r
2086 * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
\r
2088 if(isNorm32HangulOrJamo(norm32)) {
\r
2090 * c is a Jamo V/T:
\r
2091 * try to compose with the previous character, Jamo V also with
\r
2092 * a following Jamo T, and set values here right now in case we
\r
2093 * just continue with the main loop
\r
2096 reorderStartIndex=destIndex;
\r
2097 ioIndex[0]=srcIndex;
\r
2100 composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
\r
2101 srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
\r
2102 destIndex<=destLimit ? destIndex-1: 0,
\r
2105 srcIndex=ioIndex[0];
\r
2106 prevStarter=srcIndex;
\r
2110 srcIndex = ioIndex[0];
\r
2112 /* the Jamo V/T did not compose into a Hangul syllable, just
\r
2113 * append to dest */
\r
2116 prevStarter=prevSrc;
\r
2118 if(isNorm32Regular(norm32)) {
\r
2122 /* c is a lead surrogate, get the real norm32 */
\r
2123 if(srcIndex!=srcLimit &&
\r
2124 UTF16.isTrailSurrogate(c2=src[srcIndex])) {
\r
2127 norm32=getNorm32FromSurrogatePair(norm32, c2);
\r
2129 /* c is an unpaired lead surrogate, nothing to do */
\r
2135 ComposePartArgs args =new ComposePartArgs();
\r
2137 /* we are looking at the character (c, c2) at [prevSrc..src[ */
\r
2138 if(nx_contains(nx, c, c2)) {
\r
2139 /* excluded: norm32==0 */
\r
2141 } else if((norm32&qcMask)==0) {
\r
2142 cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
\r
2147 * find appropriate boundaries around this character,
\r
2148 * decompose the source text from between the boundaries,
\r
2149 * and recompose it
\r
2151 * this puts the intermediate text into the side buffer because
\r
2152 * it might be longer than the recomposition end result,
\r
2153 * or the destination buffer may be too short or missing
\r
2155 * note that destIndex may be adjusted backwards to account
\r
2156 * for source text that passed the quick check but needed to
\r
2157 * take part in the recomposition
\r
2159 int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
\r
2161 * find the last true starter in [prevStarter..src[
\r
2162 * it is either the decomposition of the current character (at prevSrc),
\r
2165 if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
\r
2166 prevStarter=prevSrc;
\r
2168 /* adjust destIndex: back out what had been copied with qc "yes" */
\r
2169 destIndex-=prevSrc-prevStarter;
\r
2172 /* find the next true starter in [src..limit[ */
\r
2173 srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
\r
2174 decompQCMask, minNoMaybe);
\r
2175 //args.prevStarter = prevStarter;
\r
2176 args.prevCC = prevCC;
\r
2177 //args.destIndex = destIndex;
\r
2178 args.length = length;
\r
2179 p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);
\r
2182 /* an error occurred (out of memory) */
\r
2186 prevCC = args.prevCC;
\r
2187 length = args.length;
\r
2189 /* append the recomposed buffer contents to the destination
\r
2191 if((destIndex+args.length)<=destLimit) {
\r
2193 while(i<args.length) {
\r
2194 dest[destIndex++]=p[i++];
\r
2198 /* buffer overflow */
\r
2199 /* keep incrementing the destIndex for preflighting */
\r
2200 destIndex+=length;
\r
2203 prevStarter=srcIndex;
\r
2208 /* append the single code point (c, c2) to the destination buffer */
\r
2209 if((destIndex+length)<=destLimit) {
\r
2210 if(cc!=0 && cc<prevCC) {
\r
2211 /* (c, c2) is out of order with respect to the preceding
\r
2213 int reorderSplit= destIndex;
\r
2214 destIndex+=length;
\r
2215 prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
\r
2216 destIndex, c, c2, cc);
\r
2218 /* just append (c, c2) */
\r
2219 dest[destIndex++]=c;
\r
2221 dest[destIndex++]=c2;
\r
2226 /* buffer overflow */
\r
2227 /* keep incrementing the destIndex for preflighting */
\r
2228 destIndex+=length;
\r
2233 return destIndex - destStart;
\r
2235 /* make FCD --------------------------------------------------------------*/
\r
2237 private static int/*index*/ findSafeFCD(char[] src, int start, int limit,
\r
2242 * find the first position in [src..limit[ after some cc==0 according
\r
2245 * at the beginning of the loop, we have fcd16 from before src
\r
2247 * stop at positions:
\r
2248 * - after trail cc==0
\r
2249 * - at the end of the source
\r
2250 * - before lead cc==0
\r
2253 /* stop if trail cc==0 for the previous character */
\r
2254 if((fcd16&0xff)==0) {
\r
2258 /* get c=*src - stop at end of string */
\r
2259 if(start==limit) {
\r
2264 /* stop if lead cc==0 for this character */
\r
2265 if(c<MIN_WITH_LEAD_CC || (fcd16=getFCD16(c))==0) {
\r
2266 break; /* catches terminating NUL, too */
\r
2269 if(!UTF16.isLeadSurrogate(c)) {
\r
2274 } else if(start+1!=limit &&
\r
2275 (UTF16.isTrailSurrogate(c2=src[start+1]))) {
\r
2276 /* c is a lead surrogate, get the real fcd16 */
\r
2277 fcd16=getFCD16FromSurrogatePair(fcd16, c2);
\r
2283 /* c is an unpaired first surrogate, lead cc==0 */
\r
2291 private static int/*unsigned byte*/ decomposeFCD(char[] src,
\r
2292 int start,int decompLimit,
\r
2294 int[] destIndexArr,
\r
2299 long /*unsigned int*/ norm32;
\r
2300 int reorderStartIndex;
\r
2302 int/*unsigned byte*/ prevCC;
\r
2303 DecomposeArgs args = new DecomposeArgs();
\r
2304 int destIndex = destIndexArr[0];
\r
2306 * canonically decompose [src..decompLimit[
\r
2308 * all characters in this range have some non-zero cc,
\r
2309 * directly or in decomposition,
\r
2310 * so that we do not need to check in the following for quick-check
\r
2313 * there _are_ _no_ Hangul syllables or Jamos in here because they are
\r
2314 * FCD-safe (cc==0)!
\r
2316 * we also do not need to check for c==0 because we have an established
\r
2319 reorderStartIndex=destIndex;
\r
2322 while(start<decompLimit) {
\r
2324 norm32=getNorm32(c);
\r
2325 if(isNorm32Regular(norm32)) {
\r
2330 * reminder: this function is called with [src..decompLimit[
\r
2331 * not containing any Hangul/Jamo characters,
\r
2332 * therefore the only specials are lead surrogates
\r
2334 /* c is a lead surrogate, get the real norm32 */
\r
2335 if(start!=decompLimit && UTF16.isTrailSurrogate(c2=src[start])){
\r
2338 norm32=getNorm32FromSurrogatePair(norm32, c2);
\r
2346 /* get the decomposition and the lead and trail cc's */
\r
2347 if(nx_contains(nx, c, c2)) {
\r
2348 /* excluded: norm32==0 */
\r
2349 args.cc=args.trailCC=0;
\r
2351 } else if((norm32&QC_NFD)==0) {
\r
2352 /* c does not decompose */
\r
2353 args.cc=args.trailCC=(int)((UNSIGNED_BYTE_MASK)&
\r
2354 (norm32>>CC_SHIFT));
\r
2357 /* c decomposes, get everything from the variable-length extra
\r
2359 pStart=decompose(norm32, args);
\r
2361 if(args.length==1) {
\r
2362 /* fastpath a single code unit from decomposition */
\r
2369 /* append the decomposition to the destination buffer, assume
\r
2371 if((destIndex+args.length)<=dest.length) {
\r
2372 int reorderSplit=destIndex;
\r
2374 /* fastpath: single code point */
\r
2375 if(args.cc!=0 && args.cc<prevCC) {
\r
2376 /* (c, c2) is out of order with respect to the preceding
\r
2378 destIndex+=args.length;
\r
2379 args.trailCC=insertOrdered(dest,reorderStartIndex,
\r
2380 reorderSplit, destIndex,
\r
2383 /* just append (c, c2) */
\r
2384 dest[destIndex++]=c;
\r
2386 dest[destIndex++]=c2;
\r
2390 /* general: multiple code points (ordered by themselves)
\r
2391 * from decomposition */
\r
2392 if(args.cc!=0 && args.cc<prevCC) {
\r
2393 /* the decomposition is out of order with respect to
\r
2394 * the preceding text */
\r
2395 destIndex+=args.length;
\r
2396 args.trailCC=mergeOrdered(dest,reorderStartIndex,
\r
2397 reorderSplit, p, pStart,
\r
2398 pStart+args.length);
\r
2400 /* just append the decomposition */
\r
2402 dest[destIndex++]=p[pStart++];
\r
2403 } while(--args.length>0);
\r
2407 /* buffer overflow */
\r
2408 /* keep incrementing the destIndex for preflighting */
\r
2409 destIndex+=args.length;
\r
2412 prevCC=args.trailCC;
\r
2414 reorderStartIndex=destIndex;
\r
2417 destIndexArr[0]=destIndex;
\r
2421 public static int makeFCD(char[] src, int srcStart, int srcLimit,
\r
2422 char[] dest, int destStart, int destLimit,
\r
2425 int prevSrc, decompStart;
\r
2426 int destIndex, length;
\r
2428 int /* unsigned int*/ fcd16;
\r
2432 decompStart=srcStart;
\r
2433 destIndex=destStart;
\r
2437 int[] destIndexArr = new int[1];
\r
2438 destIndexArr[0]=destIndex;
\r
2441 /* skip a run of code units below the minimum or with irrelevant
\r
2442 * data for the FCD check */
\r
2446 if(srcStart==srcLimit) {
\r
2448 } else if((c=src[srcStart])<MIN_WITH_LEAD_CC) {
\r
2450 } else if((fcd16=getFCD16(c))==0) {
\r
2460 * prevCC has values from the following ranges:
\r
2461 * 0..0xff - the previous trail combining class
\r
2462 * <0 - the negative value of the previous code unit;
\r
2463 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its
\r
2465 * was deferred so that average text is checked faster
\r
2468 /* copy these code units all at once */
\r
2469 if(srcStart!=prevSrc) {
\r
2470 length=(int)(srcStart-prevSrc);
\r
2471 if((destIndex+length)<=destLimit) {
\r
2472 System.arraycopy(src,prevSrc,dest,destIndex,length);
\r
2474 destIndex+=length;
\r
2477 /* prevCC<0 is only possible from the above loop, i.e., only if
\r
2480 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we
\r
2481 * need to get its trail cc */
\r
2482 if(!nx_contains(nx, (int)-prevCC)) {
\r
2483 prevCC=(int)(getFCD16((int)-prevCC)&0xff);
\r
2485 prevCC=0; /* excluded: fcd16==0 */
\r
2488 * set a pointer to this below-U+0300 character;
\r
2489 * if prevCC==0 then it will moved to after this character
\r
2492 decompStart=prevSrc-1;
\r
2497 * prevSrc==src - used later to adjust destIndex before
\r
2502 /* end of source reached? */
\r
2503 if(srcStart==srcLimit) {
\r
2507 /* set a pointer to after the last source position where prevCC==0*/
\r
2509 decompStart=prevSrc;
\r
2512 /* c already contains *src and fcd16 is set for it, increment src */
\r
2515 /* check one above-minimum, relevant code unit */
\r
2516 if(UTF16.isLeadSurrogate(c)) {
\r
2517 /* c is a lead surrogate, get the real fcd16 */
\r
2518 if(srcStart!=srcLimit &&
\r
2519 UTF16.isTrailSurrogate(c2=src[srcStart])) {
\r
2521 fcd16=getFCD16FromSurrogatePair((char)fcd16, c2);
\r
2530 /* we are looking at the character (c, c2) at [prevSrc..src[ */
\r
2531 if(nx_contains(nx, c, c2)) {
\r
2532 fcd16=0; /* excluded: fcd16==0 */
\r
2534 /* check the combining order, get the lead cc */
\r
2535 cc=(int)(fcd16>>8);
\r
2536 if(cc==0 || cc>=prevCC) {
\r
2537 /* the order is ok */
\r
2539 decompStart=prevSrc;
\r
2541 prevCC=(int)(fcd16&0xff);
\r
2543 /* just append (c, c2) */
\r
2544 length= c2==0 ? 1 : 2;
\r
2545 if((destIndex+length)<=destLimit) {
\r
2546 dest[destIndex++]=c;
\r
2548 dest[destIndex++]=c2;
\r
2551 destIndex+=length;
\r
2555 * back out the part of the source that we copied already but
\r
2556 * is now going to be decomposed;
\r
2557 * prevSrc is set to after what was copied
\r
2559 destIndex-=(int)(prevSrc-decompStart);
\r
2562 * find the part of the source that needs to be decomposed;
\r
2563 * to be safe and simple, decompose to before the next character
\r
2566 srcStart=findSafeFCD(src,srcStart, srcLimit, (char)fcd16);
\r
2569 * the source text does not fulfill the conditions for FCD;
\r
2570 * decompose and reorder a limited piece of the text
\r
2572 destIndexArr[0] = destIndex;
\r
2573 prevCC=decomposeFCD(src,decompStart, srcStart,dest,
\r
2575 decompStart=srcStart;
\r
2576 destIndex=destIndexArr[0];
\r
2580 return destIndex - destStart;
\r
2584 public static int getCombiningClass(int c) {
\r
2586 norm32=getNorm32(c);
\r
2587 return (char)((norm32>>CC_SHIFT)&0xFF);
\r
2590 public static boolean isFullCompositionExclusion(int c) {
\r
2591 if(isFormatVersion_2_1) {
\r
2592 int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
\r
2593 return (boolean)((aux & AUX_COMP_EX_MASK)!=0);
\r
2599 public static boolean isCanonSafeStart(int c) {
\r
2600 if(isFormatVersion_2_1) {
\r
2601 int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
\r
2602 return (boolean)((aux & AUX_UNSAFE_MASK)==0);
\r
2608 public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
\r
2610 if(fillSet!=null && canonStartSets!=null) {
\r
2612 * binary search for c
\r
2614 * There are two search tables,
\r
2615 * one for BMP code points and one for supplementary ones.
\r
2616 * See unormimp.h for details.
\r
2619 int i=0, start, limit;
\r
2621 int[] idxs = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
\r
2622 char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
\r
2625 table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
\r
2627 limit=table.length;
\r
2629 /* each entry is a pair { c, result } */
\r
2630 while(start<limit-2) {
\r
2631 i=(char)(((start+limit)/4)*2);
\r
2638 //System.out.println(i);
\r
2640 if(c==table[start]) {
\r
2642 if((i & CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
\r
2643 /* result 01xxxxxx xxxxxx contains index x to a
\r
2644 * USerializedSet */
\r
2645 i&=(CANON_SET_MAX_CANON_SETS-1);
\r
2646 return fillSet.getSet(startSets,(i-idxs.length));
\r
2648 /* other result values are BMP code points for
\r
2649 * single-code point sets */
\r
2650 fillSet.setToOne(i);
\r
2655 char high, low, h,j=0;
\r
2657 table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
\r
2659 limit=table.length;
\r
2661 high=(char)(c>>16);
\r
2664 /* each entry is a triplet { high(c), low(c), result } */
\r
2665 while(start<limit-3) {
\r
2666 /* (start+limit)/2 and address triplets */
\r
2667 i=(char)(((start+limit)/6)*3);
\r
2668 j=(char)(table[i]&0x1f); /* high word */
\r
2669 int tableVal = table[i+1];
\r
2671 if(high<j || ((tableVal>lowInt) && (high==j))) {
\r
2677 //System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
\r
2679 // KLUDGE: IBM JIT in 1.4.0 is sooo broken
\r
2680 // The below lines make TestExhaustive pass
\r
2681 if(ICUDebug.enabled()){
\r
2682 System.err.println("\t\t j = " + Utility.hex(j,4) +
\r
2683 "\t i = " + Utility.hex(i,4) +
\r
2684 "\t high = "+ Utility.hex(high) +
\r
2685 "\t low = " + Utility.hex(lowInt,4) +
\r
2686 "\t table[i+1]: "+ Utility.hex(tableVal,4)
\r
2695 //System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
\r
2696 int tableVal1 = table[start+1];
\r
2699 if(high==(h&0x1f) && lowInt==tableVal1) {
\r
2700 int tableVal2 = table[start+2];
\r
2702 if((h&0x8000)==0) {
\r
2703 /* the result is an index to a USerializedSet */
\r
2704 return fillSet.getSet(startSets,(i-idxs.length));
\r
2707 * single-code point set {x} in
\r
2708 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
\r
2710 //i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
\r
2711 int temp = ((int)h & 0x1f00)<<8;
\r
2712 i|=temp; /* add high bits from high(c) */
\r
2713 fillSet.setToOne((int)i);
\r
2720 return false; /* not found */
\r
2723 public static int getFC_NFKC_Closure(int c, char[] dest) {
\r
2730 destCapacity = dest.length;
\r
2733 int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
\r
2735 aux&= AUX_FNC_MASK;
\r
2741 s =extraData[index];
\r
2743 /* s points to the single-unit string */
\r
2749 if(0<length && length<=destCapacity) {
\r
2750 System.arraycopy(extraData,index,dest,0,length);
\r
2759 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
\r
2760 public static boolean isNFSkippable(int c, Normalizer.Mode mode, long mask) {
\r
2761 long /*unsigned int*/ norm32;
\r
2762 mask = mask & UNSIGNED_INT_MASK;
\r
2765 /* check conditions (a)..(e), see unormimp.h */
\r
2766 norm32 = getNorm32(c);
\r
2768 if((norm32&mask)!=0) {
\r
2769 return false; /* fails (a)..(e), not skippable */
\r
2772 if(mode == Normalizer.NFD || mode == Normalizer.NFKD || mode == Normalizer.NONE){
\r
2773 return true; /* NF*D, passed (a)..(c), is skippable */
\r
2775 /* check conditions (a)..(e), see unormimp.h */
\r
2777 /* NF*C/FCC, passed (a)..(e) */
\r
2778 if((norm32& QC_NFD)==0) {
\r
2779 return true; /* no canonical decomposition, is skippable */
\r
2782 /* check Hangul syllables algorithmically */
\r
2783 if(isNorm32HangulOrJamo(norm32)) {
\r
2784 /* Jamo passed (a)..(e) above, must be Hangul */
\r
2785 return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
\r
2788 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
\r
2789 /* NF*C, test (f) flag */
\r
2790 if(!isFormatVersion_2_2) {
\r
2791 return false; /* no (f) data, say not skippable to be safe */
\r
2795 aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
\r
2796 return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
\r
2798 /* } else { FCC, test fcd<=1 instead of the above } */
\r
2802 private static final boolean
\r
2803 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
\r
2804 // add the start code point to the USet
\r
2805 uset_add((USet *)context, start);
\r
2810 public static UnicodeSet addPropertyStarts(UnicodeSet set) {
\r
2813 /* add the start code point of each same-value range of each trie */
\r
2814 //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
\r
2815 TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
\r
2816 RangeValueIterator.Element normResult = new RangeValueIterator.Element();
\r
2818 while(normIter.next(normResult)){
\r
2819 set.add(normResult.start);
\r
2822 //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
\r
2823 TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie);
\r
2824 RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();
\r
2826 while(fcdIter.next(fcdResult)){
\r
2827 set.add(fcdResult.start);
\r
2830 if(isFormatVersion_2_1){
\r
2831 //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
\r
2832 TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
\r
2833 RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
\r
2834 while(auxIter.next(auxResult)){
\r
2835 set.add(auxResult.start);
\r
2838 /* add Hangul LV syllables and LV+1 because of skippables */
\r
2839 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
\r
2843 set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
\r
2844 return set; // for chaining
\r
2848 * Internal API, used in UCharacter.getIntPropertyValue().
\r
2850 * @param c code point
\r
2851 * @param modeValue numeric value compatible with Mode
\r
2852 * @return numeric value compatible with QuickCheck
\r
2854 public static final int quickCheck(int c, int modeValue) {
\r
2855 final int qcMask[/*UNORM_MODE_COUNT*/]={
\r
2856 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
\r
2859 int norm32=(int)getNorm32(c)&qcMask[modeValue];
\r
2863 } else if((norm32&QC_ANY_NO)!=0) {
\r
2865 } else /* _NORM_QC_ANY_MAYBE */ {
\r
2866 return 2; // MAYBE;
\r
2871 * Internal API, used by collation code.
\r
2872 * Get access to the internal FCD trie table to be able to perform
\r
2873 * incremental, per-code unit, FCD checks in collation.
\r
2874 * One pointer is sufficient because the trie index values are offset
\r
2875 * by the index size, so that the same pointer is used to access the trie
\r
2880 public CharTrie getFCDTrie(){
\r
2881 return FCDTrieImpl.fcdTrie;
\r
2887 /* compare canonically equivalent ---------------------------------------- */
\r
2890 * Compare two strings for canonical equivalence.
\r
2891 * Further options include case-insensitive comparison and
\r
2892 * code point order (as opposed to code unit order).
\r
2894 * In this function, canonical equivalence is optional as well.
\r
2895 * If canonical equivalence is tested, then both strings must fulfill
\r
2898 * Semantically, this is equivalent to
\r
2899 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
\r
2900 * where code point order, NFD and foldCase are all optional.
\r
2902 * String comparisons almost always yield results before processing both
\r
2903 * strings completely.
\r
2904 * They are generally more efficient working incrementally instead of
\r
2905 * performing the sub-processing (strlen, normalization, case-folding)
\r
2906 * on the entire strings first.
\r
2908 * It is also unnecessary to not normalize identical characters.
\r
2910 * This function works in principle as follows:
\r
2913 * get one code unit c1 from s1 (-1 if end of source)
\r
2914 * get one code unit c2 from s2 (-1 if end of source)
\r
2916 * if(either string finished) {
\r
2924 * try to decompose/case-fold c1/c2, and continue if one does;
\r
2926 * // still c1!=c2 and neither decomposes/case-folds, return result
\r
2930 * When a character decomposes, then the pointer for that source changes to
\r
2931 * the decomposition, pushing the previous pointer onto a stack.
\r
2932 * When the end of the decomposition is reached, then the code unit reader
\r
2933 * pops the previous source from the stack.
\r
2934 * (Same for case-folding.)
\r
2936 * This is complicated further by operating on variable-width UTF-16.
\r
2937 * The top part of the loop works on code units, while lookups for decomposition
\r
2938 * and case-folding need code points.
\r
2939 * Code points are assembled after the equality/end-of-source part.
\r
2940 * The source pointer is only advanced beyond all code units when the code point
\r
2941 * actually decomposes/case-folds.
\r
2943 * If we were on a trail surrogate unit when assembling a code point,
\r
2944 * and the code point decomposes/case-folds, then the decomposition/folding
\r
2945 * result must be compared with the part of the other string that corresponds to
\r
2946 * this string's lead surrogate.
\r
2947 * Since we only assemble a code point when hitting a trail unit when the
\r
2948 * preceding lead units were identical, we back up the other string by one unit
\r
2951 * The optional code point order comparison at the end works with
\r
2952 * the same fix-up as the other code point order comparison functions.
\r
2953 * See ustring.c and the comment near the end of this function.
\r
2955 * Assumption: A decomposition or case-folding result string never contains
\r
2956 * a single surrogate. This is a safe assumption in the Unicode Standard.
\r
2957 * Therefore, we do not need to check for surrogate pairs across
\r
2958 * decomposition/case-folding boundaries.
\r
2959 * Further assumptions (see verifications tstnorm.cpp):
\r
2960 * The API function checks for FCD first, while the core function
\r
2961 * first case-folds and then decomposes. This requires that case-folding does not
\r
2962 * un-FCD any strings.
\r
2964 * The API function may also NFD the input and turn off decomposition.
\r
2965 * This requires that case-folding does not un-NFD strings either.
\r
2967 * TODO If any of the above two assumptions is violated,
\r
2968 * then this entire code must be re-thought.
\r
2969 * If this happens, then a simple solution is to case-fold both strings up front
\r
2970 * and to turn off UNORM_INPUT_IS_FCD.
\r
2971 * We already do this when not both strings are in FCD because makeFCD
\r
2972 * would be a partial NFD before the case folding, which does not work.
\r
2973 * Note that all of this is only a problem when case-folding _and_
\r
2974 * canonical equivalence come together.
\r
2976 * This function could be moved to a different source file, at increased cost
\r
2977 * for calling the decomposition access function.
\r
2980 // stack element for previous-level source/decomposition pointers
\r
2981 private static class CmpEquivLevel {
\r
2989 * Get the canonical decomposition for one code point.
\r
2990 * @param c code point
\r
2991 * @param buffer out-only buffer for algorithmic decompositions of Hangul
\r
2992 * @param length out-only, takes the length of the decomposition, if any
\r
2993 * @return index into the extraData array, or 0 if none
\r
2996 private static int decompose(int c, char[] buffer) {
\r
3000 norm32 = (long) ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie.getCodePointValue(c));
\r
3001 if((norm32 & QC_NFD)!=0) {
\r
3002 if(isNorm32HangulOrJamo(norm32)) {
\r
3003 /* Hangul syllable: decompose algorithmically */
\r
3008 c2=(char)(c%JAMO_T_COUNT);
\r
3011 buffer[2]=(char)(JAMO_T_BASE+c2);
\r
3016 buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
\r
3017 buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
\r
3020 /* normal decomposition */
\r
3021 DecomposeArgs args = new DecomposeArgs();
\r
3022 int index = decompose(norm32, args);
\r
3023 System.arraycopy(extraData,index,buffer,0,args.length);
\r
3024 return args.length ;
\r
3031 private static int foldCase(int c, char[] dest, int destStart, int destLimit,
\r
3033 String src = UTF16.valueOf(c);
\r
3034 String foldedStr = UCharacter.foldCase(src,options);
\r
3035 char[] foldedC = foldedStr.toCharArray();
\r
3036 for(int i=0;i<foldedC.length;i++){
\r
3037 if(destStart<destLimit){
\r
3038 dest[destStart]=foldedC[i];
\r
3040 // always increment destStart so that we can return
\r
3041 // the required length
\r
3044 return (c==UTF16.charAt(foldedStr,0)) ? -destStart : destStart;
\r
3048 private static int foldCase(char[] src,int srcStart,int srcLimit,
\r
3049 char[] dest, int destStart, int destLimit,
\r
3051 String source =new String(src,srcStart,(srcLimit-srcStart));
\r
3052 String foldedStr = UCharacter.foldCase(source,options);
\r
3053 char[] foldedC = foldedStr.toCharArray();
\r
3054 for(int i=0;i<foldedC.length;i++){
\r
3055 if(destStart<destLimit){
\r
3056 dest[destStart]=foldedC[i];
\r
3058 // always increment destStart so that we can return
\r
3059 // the required length
\r
3066 public static int cmpEquivFold(String s1, String s2,int options){
\r
3067 if ((options & Normalizer.COMPARE_IGNORE_CASE) !=0) {
\r
3068 return cmpSimpleEquivFold(s1, s2, options);
\r
3071 return cmpEquivFold(s1.toCharArray(),0,s1.length(),
\r
3072 s2.toCharArray(),0,s2.length(),
\r
3078 private static int cmpSimpleEquivFold(String s1, String s2, int options) {
\r
3081 String foldS1=null;
\r
3082 String foldS2=null;
\r
3085 while ((i+offset1<=s1.length() && j+offset2<=s2.length())) {
\r
3086 if ((cmp!=0) || (s1.charAt(i) != s2.charAt(j))) {
\r
3088 (UTF16.isLeadSurrogate((char)s1.charAt(i-1)) ||
\r
3089 UTF16.isLeadSurrogate((char)s2.charAt(j-1)))) {
\r
3090 // Current codepoint may be the low surrogate pair.
\r
3091 return cmpEquivFold(s1.toCharArray(),i-1,s1.length(),
\r
3092 s2.toCharArray(),j-1,s2.length(),
\r
3095 else if (UTF16.isLeadSurrogate((char)s1.charAt(i))||
\r
3096 UTF16.isLeadSurrogate((char)s2.charAt(j))) {
\r
3097 return cmpEquivFold(s1.toCharArray(),i,s1.length(),
\r
3098 s2.toCharArray(),j,s2.length(),
\r
3102 if ( offset1 > 0 ) {
\r
3103 foldS1 = UCharacter.foldCase(s1.substring(i, i+offset1),options);
\r
3105 if ( offset2 > 0 ) {
\r
3106 foldS2 = UCharacter.foldCase(s2.substring(j, j+offset2),options);
\r
3108 cmp = foldS1.compareTo(foldS2);
\r
3110 i = moveToNext(i, offset1);
\r
3111 j = moveToNext(j, offset2);
\r
3112 offset1 = offset2 = 1;
\r
3116 if (foldS1.length()==foldS2.length()) {
\r
3119 if (foldS1.length()<foldS2.length()) {
\r
3135 if (i+offset1-1==s1.length()) {
\r
3136 if (j+offset2-1==s2.length()) {
\r
3148 private static int moveToNext(int pos, int offset) {
\r
3150 return pos+offset;
\r
3158 // internal function
\r
3159 public static int cmpEquivFold(char[] s1, int s1Start,int s1Limit,
\r
3160 char[] s2, int s2Start,int s2Limit,
\r
3162 // current-level start/limit - s1/s2 as current
\r
3163 int start1, start2, limit1, limit2;
\r
3164 char[] cSource1, cSource2;
\r
3168 // decomposition variables
\r
3171 // stacks of previous-level start/current/limit
\r
3172 CmpEquivLevel[] stack1 = new CmpEquivLevel[]{
\r
3173 new CmpEquivLevel(),
\r
3174 new CmpEquivLevel()
\r
3176 CmpEquivLevel[] stack2 = new CmpEquivLevel[]{
\r
3177 new CmpEquivLevel(),
\r
3178 new CmpEquivLevel()
\r
3181 // decomposition buffers for Hangul
\r
3182 char[] decomp1 = new char[8];
\r
3183 char[] decomp2 = new char[8];
\r
3185 // case folding buffers, only use current-level start/limit
\r
3186 char[] fold1 = new char[32];
\r
3187 char[] fold2 = new char[32];
\r
3189 // track which is the current level per string
\r
3190 int level1, level2;
\r
3192 // current code units, and code points for lookups
\r
3196 // no argument error checking because this itself is not an API
\r
3198 // assume that at least one of the options COMPARE_EQUIV and
\r
3199 // COMPARE_IGNORE_CASE is set
\r
3200 // otherwise this function must behave exactly as uprv_strCompare()
\r
3201 // not checking for that here makes testing this function easier
\r
3214 // comparison loop
\r
3216 // here a code unit value of -1 means "get another code unit"
\r
3217 // below it will mean "this source is finished"
\r
3220 // get next code unit from string 1, post-increment
\r
3222 if(s1Start>=limit1) {
\r
3228 c1=cSource1[s1Start];
\r
3233 // reached end of level buffer, pop one level
\r
3236 start1=stack1[level1].start;
\r
3237 } while(start1==-1); //###### check this
\r
3238 s1Start=stack1[level1].s;
\r
3239 limit1=stack1[level1].limit;
\r
3240 cSource1=stack1[level1].source;
\r
3245 // get next code unit from string 2, post-increment
\r
3247 if(s2Start>=limit2) {
\r
3253 c2=cSource2[s2Start];
\r
3258 // reached end of level buffer, pop one level
\r
3261 start2=stack2[level2].start;
\r
3262 } while(start2==-1);
\r
3263 s2Start=stack2[level2].s;
\r
3264 limit2=stack2[level2].limit;
\r
3265 cSource2=stack2[level2].source;
\r
3269 // compare c1 and c2
\r
3270 // either variable c1, c2 is -1 only if the corresponding string
\r
3274 return 0; // c1==c2==-1 indicating end of strings
\r
3276 c1=c2=-1; // make us fetch new code units
\r
3279 return -1; // string 1 ends before string 2
\r
3281 return 1; // string 2 ends before string 1
\r
3283 // c1!=c2 && c1>=0 && c2>=0
\r
3285 // get complete code points for c1, c2 for lookups if either is a
\r
3288 if(UTF16.isSurrogate((char)c1)) {
\r
3291 if(UTF16.isLeadSurrogate((char)c1)) {
\r
3292 if( s1Start!=limit1 &&
\r
3293 UTF16.isTrailSurrogate(c=cSource1[s1Start])
\r
3295 // advance ++s1; only below if cp1 decomposes/case-folds
\r
3296 cp1=UCharacterProperty.getRawSupplementary((char)c1, c);
\r
3298 } else /* isTrail(c1) */ {
\r
3299 if( start1<=(s1Start-2) &&
\r
3300 UTF16.isLeadSurrogate(c=cSource1[(s1Start-2)])
\r
3302 cp1=UCharacterProperty.getRawSupplementary(c, (char)c1);
\r
3307 if(UTF16.isSurrogate((char)c2)) {
\r
3310 if(UTF16.isLeadSurrogate((char)c2)) {
\r
3311 if( s2Start!=limit2 &&
\r
3312 UTF16.isTrailSurrogate(c=cSource2[s2Start])
\r
3314 // advance ++s2; only below if cp2 decomposes/case-folds
\r
3315 cp2=UCharacterProperty.getRawSupplementary((char)c2, c);
\r
3317 } else /* isTrail(c2) */ {
\r
3318 if( start2<=(s2Start-2) &&
\r
3319 UTF16.isLeadSurrogate(c=cSource2[s2Start-2])
\r
3321 cp2=UCharacterProperty.getRawSupplementary(c, (char)c2);
\r
3326 // go down one level for each string
\r
3327 // continue with the main loop as soon as there is a real change
\r
3328 if( level1<2 && ((options & Normalizer.COMPARE_IGNORE_CASE)!=0)&&
\r
3329 (length=foldCase(cp1, fold1, 0,32,options))>=0
\r
3331 // cp1 case-folds to fold1[length]
\r
3332 if(UTF16.isSurrogate((char)c1)) {
\r
3333 if(UTF16.isLeadSurrogate((char)c1)) {
\r
3334 // advance beyond source surrogate pair if it
\r
3337 } else /* isTrail(c1) */ {
\r
3338 // we got a supplementary code point when hitting its
\r
3339 // trail surrogate, therefore the lead surrogate must
\r
3340 // have been the same as in the other string;
\r
3341 // compare this decomposition with the lead surrogate
\r
3342 // in the other string
\r
3344 c2=cSource2[(s2Start-1)];
\r
3348 // push current level pointers
\r
3349 stack1[0].start=start1;
\r
3350 stack1[0].s=s1Start;
\r
3351 stack1[0].limit=limit1;
\r
3352 stack1[0].source=cSource1;
\r
3359 // get ready to read from decomposition, continue with loop
\r
3364 if( level2<2 && ((options& Normalizer.COMPARE_IGNORE_CASE)!=0) &&
\r
3365 (length=foldCase(cp2, fold2,0,32, options))>=0
\r
3367 // cp2 case-folds to fold2[length]
\r
3368 if(UTF16.isSurrogate((char)c2)) {
\r
3369 if(UTF16.isLeadSurrogate((char)c2)) {
\r
3370 // advance beyond source surrogate pair if it
\r
3373 } else /* isTrail(c2) */ {
\r
3374 // we got a supplementary code point when hitting its
\r
3375 // trail surrogate, therefore the lead surrogate must
\r
3376 // have been the same as in the other string;
\r
3377 // compare this decomposition with the lead surrogate
\r
3378 // in the other string
\r
3380 c1=cSource1[(s1Start-1)];
\r
3384 // push current level pointers
\r
3385 stack2[0].start=start2;
\r
3386 stack2[0].s=s2Start;
\r
3387 stack2[0].limit=limit2;
\r
3388 stack2[0].source=cSource2;
\r
3395 // get ready to read from decomposition, continue with loop
\r
3400 if( level1<2 && ((options&COMPARE_EQUIV)!=0) &&
\r
3401 0!=(length=decompose(cp1,decomp1))
\r
3403 // cp1 decomposes into p[length]
\r
3404 if(UTF16.isSurrogate((char)c1)) {
\r
3405 if(UTF16.isLeadSurrogate((char)c1)) {
\r
3406 // advance beyond source surrogate pair if it
\r
3409 } else /* isTrail(c1) */ {
\r
3410 // we got a supplementary code point when hitting
\r
3411 // its trail surrogate, therefore the lead surrogate
\r
3412 // must have been the same as in the other string;
\r
3413 // compare this decomposition with the lead surrogate
\r
3414 // in the other string
\r
3416 c2=cSource2[(s2Start-1)];
\r
3420 // push current level pointers
\r
3421 stack1[level1].start=start1;
\r
3422 stack1[level1].s=s1Start;
\r
3423 stack1[level1].limit=limit1;
\r
3424 stack1[level1].source=cSource1;
\r
3427 // set next level pointers to decomposition
\r
3428 cSource1 = decomp1;
\r
3432 // set empty intermediate level if skipped
\r
3434 stack1[level1++].start=-1;
\r
3436 // get ready to read from decomposition, continue with loop
\r
3441 if( level2<2 && ((options&COMPARE_EQUIV)!=0) &&
\r
3442 0!=(length=decompose(cp2, decomp2))
\r
3444 // cp2 decomposes into p[length]
\r
3445 if(UTF16.isSurrogate((char)c2)) {
\r
3446 if(UTF16.isLeadSurrogate((char)c2)) {
\r
3447 // advance beyond source surrogate pair if it
\r
3450 } else /* isTrail(c2) */ {
\r
3451 // we got a supplementary code point when hitting its
\r
3452 // trail surrogate, therefore the lead surrogate must
\r
3453 // have been the same as in the other string;
\r
3454 // compare this decomposition with the lead surrogate
\r
3455 // in the other string
\r
3457 c1=cSource1[(s1Start-1)];
\r
3461 // push current level pointers
\r
3462 stack2[level2].start=start2;
\r
3463 stack2[level2].s=s2Start;
\r
3464 stack2[level2].limit=limit2;
\r
3465 stack2[level2].source=cSource2;
\r
3468 // set next level pointers to decomposition
\r
3473 // set empty intermediate level if skipped
\r
3475 stack2[level2++].start=-1;
\r
3478 // get ready to read from decomposition, continue with loop
\r
3484 // no decomposition/case folding, max level for both sides:
\r
3485 // return difference result
\r
3487 // code point order comparison must not just return cp1-cp2
\r
3488 // because when single surrogates are present then the surrogate
\r
3489 // pairs that formed cp1 and cp2 may be from different string
\r
3492 // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at
\r
3493 // second code units
\r
3494 // c1=d800 cp1=10001 c2=dc00 cp2=10000
\r
3495 // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32
\r
3496 // it is { d800 10001 } < { 10000 }
\r
3497 // therefore fix-up
\r
3499 if( c1>=0xd800 && c2>=0xd800 &&
\r
3500 ((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0)
\r
3502 /* subtract 0x2800 from BMP code points to make them smaller
\r
3503 * than supplementary ones */
\r
3505 ( c1<=0xdbff && s1Start!=limit1
\r
3507 UTF16.isTrailSurrogate(cSource1[s1Start])
\r
3510 ( UTF16.isTrailSurrogate((char)c1) && start1!=(s1Start-1)
\r
3512 UTF16.isLeadSurrogate(cSource1[(s1Start-2)])
\r
3515 /* part of a surrogate pair, leave >=d800 */
\r
3517 /* BMP code point - may be surrogate code point -
\r
3523 ( c2<=0xdbff && s2Start!=limit2
\r
3525 UTF16.isTrailSurrogate(cSource2[s2Start])
\r
3528 ( UTF16.isTrailSurrogate((char)c2) && start2!=(s2Start-1)
\r
3530 UTF16.isLeadSurrogate(cSource2[(s2Start-2)])
\r
3533 /* part of a surrogate pair, leave >=d800 */
\r
3535 /* BMP code point - may be surrogate code point -
\r
3544 private static int strCompare(char[] s1, int s1Start, int s1Limit,
\r
3545 char[] s2, int s2Start, int s2Limit,
\r
3546 boolean codePointOrder) {
\r
3548 int start1, start2, limit1, limit2;
\r
3552 /* setup for fix-up */
\r
3556 int length1, length2;
\r
3558 length1 = s1Limit - s1Start;
\r
3559 length2 = s2Limit - s2Start;
\r
3563 if(length1<length2) {
\r
3565 limit1=start1+length1;
\r
3566 } else if(length1==length2) {
\r
3568 limit1=start1+length1;
\r
3569 } else /* length1>length2 */ {
\r
3571 limit1=start1+length2;
\r
3575 return lengthResult;
\r
3579 /* check pseudo-limit */
\r
3580 if(s1Start==limit1) {
\r
3581 return lengthResult;
\r
3593 /* setup for fix-up */
\r
3594 limit1=start1+length1;
\r
3595 limit2=start2+length2;
\r
3598 /* if both values are in or above the surrogate range, fix them up */
\r
3599 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
\r
3600 /* subtract 0x2800 from BMP code points to make them smaller than
\r
3601 * supplementary ones */
\r
3603 ( c1<=0xdbff && (s1Start+1)!=limit1 &&
\r
3604 UTF16.isTrailSurrogate(s1[(s1Start+1)])
\r
3606 ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
\r
3607 UTF16.isLeadSurrogate(s1[(s1Start-1)])
\r
3610 /* part of a surrogate pair, leave >=d800 */
\r
3612 /* BMP code point - may be surrogate code point - make <d800 */
\r
3617 ( c2<=0xdbff && (s2Start+1)!=limit2 &&
\r
3618 UTF16.isTrailSurrogate(s2[(s2Start+1)])
\r
3620 ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
\r
3621 UTF16.isLeadSurrogate(s2[(s2Start-1)])
\r
3624 /* part of a surrogate pair, leave >=d800 */
\r
3626 /* BMP code point - may be surrogate code point - make <d800 */
\r
3631 /* now c1 and c2 are in UTF-32-compatible order */
\r
3632 return (int)c1-(int)c2;
\r
3637 * Status of tailored normalization
\r
3639 * This was done initially for investigation on Unicode public review issue 7
\r
3640 * (http://www.unicode.org/review/). See Jitterbug 2481.
\r
3641 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
\r
3642 * a permanent feature in ICU 2.6 in support of IDNA which requires true
\r
3643 * Unicode 3.2 normalization.
\r
3644 * (NormalizationCorrections are rolled into IDNA mapping tables.)
\r
3646 * Tailored normalization as implemented here allows to "normalize less"
\r
3647 * than full Unicode normalization would.
\r
3648 * Based internally on a UnicodeSet of code points that are
\r
3649 * "excluded from normalization", the normalization functions leave those
\r
3650 * code points alone ("inert"). This means that tailored normalization
\r
3651 * still transforms text into a canonically equivalent form.
\r
3652 * It does not add decompositions to code points that do not have any or
\r
3653 * change decomposition results.
\r
3655 * Any function that searches for a safe boundary has not been touched,
\r
3656 * which means that these functions will be over-pessimistic when
\r
3657 * exclusions are applied.
\r
3658 * This should not matter because subsequent checks and normalizations
\r
3659 * do apply the exclusions; only a little more of the text may be processed
\r
3660 * than necessary under exclusions.
\r
3662 * Normalization exclusions have the following effect on excluded code points c:
\r
3663 * - c is not decomposed
\r
3664 * - c is not a composition target
\r
3665 * - c does not combine forward or backward for composition
\r
3666 * except that this is not implemented for Jamo
\r
3667 * - c is treated as having a combining class of 0
\r
3671 * Constants for the bit fields in the options bit set parameter.
\r
3672 * These need not be public.
\r
3673 * A user only needs to know the currently assigned values.
\r
3674 * The number and positions of reserved bits per field can remain private.
\r
3676 private static final int OPTIONS_NX_MASK=0x1f;
\r
3677 private static final int OPTIONS_UNICODE_MASK=0xe0;
\r
3678 public static final int OPTIONS_SETS_MASK=0xff;
\r
3679 //private static final int OPTIONS_UNICODE_SHIFT=5;
\r
3680 private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
\r
3682 /* Constants for options flags for normalization.*/
\r
3685 * Options bit 0, do not decompose Hangul syllables.
\r
3687 private static final int NX_HANGUL = 1;
\r
3689 * Options bit 1, do not decompose CJK compatibility characters.
\r
3691 private static final int NX_CJK_COMPAT=2;
\r
3693 * Options bit 8, use buggy recomposition described in
\r
3694 * Unicode Public Review Issue #29
\r
3695 * at http://www.unicode.org/review/resolved-pri.html#pri29
\r
3697 * Used in IDNA implementation according to strict interpretation
\r
3698 * of IDNA definition based on Unicode 3.2 which predates PRI #29.
\r
3700 * See ICU4C unormimp.h
\r
3702 public static final int BEFORE_PRI_29=0x100;
\r
3705 * The following options are used only in some composition functions.
\r
3706 * They use bits 12 and up to preserve lower bits for the available options
\r
3707 * space in unorm_compare() -
\r
3708 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
\r
3711 /** Options bit 12, for compatibility vs. canonical decomposition. */
\r
3712 public static final int OPTIONS_COMPAT=0x1000;
\r
3713 /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
\r
3714 public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;
\r
3716 /* normalization exclusion sets --------------------------------------------- */
\r
3719 * Normalization exclusion UnicodeSets are used for tailored normalization;
\r
3720 * see the comment near the beginning of this file.
\r
3722 * By specifying one or several sets of code points,
\r
3723 * those code points become inert for normalization.
\r
3725 private static final synchronized UnicodeSet internalGetNXHangul() {
\r
3726 /* internal function, does not check for incoming U_FAILURE */
\r
3728 if(nxCache[NX_HANGUL]==null) {
\r
3729 nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
\r
3731 return nxCache[NX_HANGUL];
\r
3734 private static final synchronized UnicodeSet internalGetNXCJKCompat() {
\r
3735 /* internal function, does not check for incoming U_FAILURE */
\r
3737 if(nxCache[NX_CJK_COMPAT]==null) {
\r
3739 /* build a set from [CJK Ideographs]&[has canonical decomposition] */
\r
3740 UnicodeSet set, hasDecomp;
\r
3742 set=new UnicodeSet("[:Ideographic:]");
\r
3744 /* start with an empty set for [has canonical decomposition] */
\r
3745 hasDecomp=new UnicodeSet();
\r
3747 /* iterate over all ideographs and remember which canonically decompose */
\r
3748 UnicodeSetIterator it = new UnicodeSetIterator(set);
\r
3752 while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
\r
3753 start=it.codepoint;
\r
3754 end=it.codepointEnd;
\r
3755 while(start<=end) {
\r
3756 norm32 = getNorm32(start);
\r
3757 if((norm32 & QC_NFD)>0) {
\r
3758 hasDecomp.add(start);
\r
3764 /* hasDecomp now contains all ideographs that decompose canonically */
\r
3765 nxCache[NX_CJK_COMPAT]=hasDecomp;
\r
3769 return nxCache[NX_CJK_COMPAT];
\r
3772 private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
\r
3773 options &= OPTIONS_UNICODE_MASK;
\r
3778 if(nxCache[options]==null) {
\r
3779 /* build a set with all code points that were not designated by the specified Unicode version */
\r
3780 UnicodeSet set = new UnicodeSet();
\r
3783 case Normalizer.UNICODE_3_2:
\r
3784 set.applyPattern("[:^Age=3.2:]");
\r
3790 nxCache[options]=set;
\r
3793 return nxCache[options];
\r
3796 /* Get a decomposition exclusion set. The data must be loaded. */
\r
3797 private static final synchronized UnicodeSet internalGetNX(int options) {
\r
3798 options&=OPTIONS_SETS_MASK;
\r
3800 if(nxCache[options]==null) {
\r
3801 /* return basic sets */
\r
3802 if(options==NX_HANGUL) {
\r
3803 return internalGetNXHangul();
\r
3805 if(options==NX_CJK_COMPAT) {
\r
3806 return internalGetNXCJKCompat();
\r
3808 if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
\r
3809 return internalGetNXUnicode(options);
\r
3812 /* build a set from multiple subsets */
\r
3816 set=new UnicodeSet();
\r
3819 if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
\r
3820 set.addAll(other);
\r
3822 if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
\r
3823 set.addAll(other);
\r
3825 if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
\r
3826 set.addAll(other);
\r
3829 nxCache[options]=set;
\r
3831 return nxCache[options];
\r
3834 public static final UnicodeSet getNX(int options) {
\r
3835 if((options&=OPTIONS_SETS_MASK)==0) {
\r
3836 /* incoming failure, or no decomposition exclusions requested */
\r
3839 return internalGetNX(options);
\r
3843 private static final boolean nx_contains(UnicodeSet nx, int c) {
\r
3844 return nx!=null && nx.contains(c);
\r
3847 private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
\r
3848 return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
\r