]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/impl/NormalizerImpl.java
icu4jsrc
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / impl / NormalizerImpl.java
1  /*\r
2  *******************************************************************************\r
3  * Copyright (C) 1996-2008, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7  \r
8 package com.ibm.icu.impl;\r
9 import java.io.ByteArrayInputStream;\r
10 import java.io.IOException;\r
11 import java.io.BufferedInputStream;\r
12 import java.io.InputStream;\r
13 import java.util.MissingResourceException;\r
14 \r
15 import com.ibm.icu.text.Normalizer;\r
16 import com.ibm.icu.text.UTF16;    \r
17 import com.ibm.icu.text.UnicodeSet;\r
18 import com.ibm.icu.text.UnicodeSetIterator;\r
19 import com.ibm.icu.util.RangeValueIterator;\r
20 import com.ibm.icu.util.VersionInfo;\r
21 import com.ibm.icu.lang.UCharacter;\r
22 \r
23 /**\r
24  * @version     1.0\r
25  * @author  Ram Viswanadha\r
26  */\r
27 public final class NormalizerImpl {\r
28     // Static block for the class to initialize its own self \r
29     static final NormalizerImpl IMPL;\r
30     \r
31     static\r
32     {\r
33         try\r
34         {\r
35             IMPL = new NormalizerImpl();\r
36         }\r
37         catch (Exception e)\r
38         {\r
39             throw new MissingResourceException(e.getMessage(), "", "");\r
40         }\r
41     }\r
42     \r
43     static final int UNSIGNED_BYTE_MASK =0xFF;\r
44     static final long UNSIGNED_INT_MASK = 0xffffffffL;\r
45     /*\r
46      * This new implementation of the normalization code loads its data from\r
47      * unorm.icu, which is generated with the gennorm tool.\r
48      * The format of that file is described at the end of this file.\r
49      */\r
50     private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/unorm.icu";\r
51     \r
52     // norm32 value constants \r
53     \r
54     // quick check flags 0..3 set mean "no" for their forms \r
55     public static final int QC_NFC=0x11;          /* no|maybe */\r
56     public static final int QC_NFKC=0x22;         /* no|maybe */\r
57     public static final int QC_NFD=4;             /* no */\r
58     public static final int QC_NFKD=8;            /* no */\r
59     \r
60     public static final int QC_ANY_NO=0xf;\r
61 \r
62     /* quick check flags 4..5 mean "maybe" for their forms; \r
63      * test flags>=QC_MAYBE \r
64      */\r
65     public static final int QC_MAYBE=0x10;\r
66     public static final int QC_ANY_MAYBE=0x30;\r
67 \r
68     public static final int QC_MASK=0x3f;\r
69 \r
70     private static final int COMBINES_FWD=0x40;\r
71     private static final int COMBINES_BACK=0x80;\r
72     public  static final int COMBINES_ANY=0xc0;\r
73     // UnicodeData.txt combining class in bits 15.\r
74     private static final int CC_SHIFT=8;                     \r
75     public  static final int CC_MASK=0xff00;\r
76     // 16 bits for the index to UChars and other extra data\r
77     private static final int EXTRA_SHIFT=16;\r
78     // start of surrogate specials after shift                \r
79     //private static final int EXTRA_INDEX_TOP=0xfc00;       \r
80 \r
81     //private static final int EXTRA_SURROGATE_MASK=0x3ff;\r
82     //private static final int EXTRA_SURROGATE_TOP=0x3f0;    /* hangul etc. */\r
83 \r
84     //private static final int EXTRA_HANGUL=EXTRA_SURROGATE_TOP;\r
85     //private static final int EXTRA_JAMO_L=EXTRA_SURROGATE_TOP+1;/* ### not used */\r
86     //private static final int EXTRA_JAMO_V=EXTRA_SURROGATE_TOP+2;\r
87     //private static final int EXTRA_JAMO_T=EXTRA_SURROGATE_TOP+3;\r
88     \r
89     /* norm32 value constants using >16 bits */\r
90     private static final long  MIN_SPECIAL    =  (long)(0xfc000000 & UNSIGNED_INT_MASK);\r
91     private static final long  SURROGATES_TOP =  (long)(0xfff00000 & UNSIGNED_INT_MASK);\r
92     private static final long  MIN_HANGUL     =  (long)(0xfff00000 & UNSIGNED_INT_MASK);\r
93     //private static final long  MIN_JAMO_V     =  (long)(0xfff20000 & UNSIGNED_INT_MASK);\r
94     private static final long  JAMO_V_TOP     =  (long)(0xfff30000 & UNSIGNED_INT_MASK);\r
95     \r
96     \r
97     /* indexes[] value names */\r
98     /* number of bytes in normalization trie */\r
99     static final int INDEX_TRIE_SIZE           = 0;\r
100      /* number of chars in extra data */     \r
101     static final int INDEX_CHAR_COUNT           = 1;    \r
102     /* number of uint16_t words for combining data */\r
103     static final int INDEX_COMBINE_DATA_COUNT = 2;\r
104     /* number of code points that combine forward */     \r
105     static final int INDEX_COMBINE_FWD_COUNT  = 3;\r
106     /* number of code points that combine forward and backward */     \r
107     static final int INDEX_COMBINE_BOTH_COUNT = 4;\r
108     /* number of code points that combine backward */     \r
109     static final int INDEX_COMBINE_BACK_COUNT = 5;     \r
110      /* first code point with quick check NFC NO/MAYBE */\r
111     public static final int INDEX_MIN_NFC_NO_MAYBE   = 6;\r
112     /* first code point with quick check NFKC NO/MAYBE */    \r
113     public static final int INDEX_MIN_NFKC_NO_MAYBE  = 7;\r
114      /* first code point with quick check NFD NO/MAYBE */     \r
115     public static final int INDEX_MIN_NFD_NO_MAYBE   = 8;\r
116     /* first code point with quick check NFKD NO/MAYBE */    \r
117     public static final int INDEX_MIN_NFKD_NO_MAYBE  = 9;     \r
118     /* number of bytes in FCD trie */\r
119     static final int INDEX_FCD_TRIE_SIZE      = 10;\r
120     /* number of bytes in the auxiliary trie */    \r
121     static final int INDEX_AUX_TRIE_SIZE      = 11;\r
122     /* number of uint16_t in the array of serialized USet */    \r
123     static final int INDEX_CANON_SET_COUNT    = 12;    \r
124     /* changing this requires a new formatVersion */\r
125     static final int INDEX_TOP                = 32;    \r
126     \r
127     \r
128     /* AUX constants */\r
129     /* value constants for auxTrie */    \r
130     private static final int AUX_UNSAFE_SHIFT           = 11;\r
131     private static final int AUX_COMP_EX_SHIFT           = 10;\r
132     private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;\r
133     \r
134     private static final int AUX_MAX_FNC          =   ((int)1<<AUX_COMP_EX_SHIFT);\r
135     private static final int AUX_UNSAFE_MASK      =   (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);\r
136     private static final int AUX_FNC_MASK         =   (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);\r
137     private static final int AUX_COMP_EX_MASK     =   (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);\r
138     private static final long AUX_NFC_SKIP_F_MASK =   ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);\r
139     \r
140     /* canonStartSets[0..31] contains indexes for what is in the array */\r
141     /* number of uint16_t in canonical starter sets */\r
142     static final int SET_INDEX_CANON_SETS_LENGTH        = 0;\r
143     /* number of uint16_t in the BMP search table (contains pairs) */ \r
144     static final int SET_INDEX_CANON_BMP_TABLE_LENGTH    = 1;\r
145     /* number of uint16_t in the supplementary search table(contains triplets)*/ \r
146     static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH  = 2;\r
147     /* changing this requires a new formatVersion */ \r
148     static final int SET_INDEX_TOP                        = 32;\r
149     \r
150     static final int CANON_SET_INDICIES_INDEX              = 0;\r
151     static final int CANON_SET_START_SETS_INDEX            = 1;\r
152     static final int CANON_SET_BMP_TABLE_INDEX            = 2;\r
153     static final int CANON_SET_SUPP_TABLE_INDEX            = 3;\r
154     /* 14 bit indexes to canonical USerializedSets */\r
155     static final int CANON_SET_MAX_CANON_SETS             = 0x4000; \r
156     /* single-code point BMP sets are encoded directly in the search table \r
157      * except if result=0x4000..0x7fff \r
158      */\r
159     static final int CANON_SET_BMP_MASK                    = 0xc000;\r
160     static final int CANON_SET_BMP_IS_INDEX                = 0x4000;\r
161     \r
162     private static final int MAX_BUFFER_SIZE                    = 20;\r
163     \r
164     /**\r
165      * Internal option for cmpEquivFold() for decomposing.\r
166      * If not set, just do strcasecmp().\r
167      * @internal\r
168      */\r
169      public static final int COMPARE_EQUIV = 0x80000;\r
170     \r
171     /*******************************/\r
172 \r
173     /* Wrappers for Trie implementations */ \r
174     static final class NormTrieImpl implements Trie.DataManipulate{\r
175         static IntTrie normTrie= null;\r
176        /**\r
177         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's \r
178         * data the index array offset of the indexes for that lead surrogate.\r
179         * @param property data value for a surrogate from the trie, including \r
180         *         the folding offset\r
181         * @return data offset or 0 if there is no data for the lead surrogate\r
182         */\r
183         /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */\r
184         public int getFoldingOffset(int value){\r
185             return  BMP_INDEX_LENGTH+\r
186                     ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&\r
187                     (0x3ff<<SURROGATE_BLOCK_BITS)); \r
188         }\r
189         \r
190     }\r
191     static final class FCDTrieImpl implements Trie.DataManipulate{\r
192         static CharTrie fcdTrie=null;\r
193        /**\r
194         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's \r
195         * data the index array offset of the indexes for that lead surrogate.\r
196         * @param property data value for a surrogate from the trie, including\r
197         *         the folding offset\r
198         * @return data offset or 0 if there is no data for the lead surrogate\r
199         */\r
200         /* fcdTrie: the folding offset is the lead FCD value itself */\r
201         public int getFoldingOffset(int value){\r
202             return value;\r
203         }\r
204     }\r
205     \r
206     static final class AuxTrieImpl implements Trie.DataManipulate{\r
207         static CharTrie auxTrie = null;\r
208        /**\r
209         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's \r
210         * data the index array offset of the indexes for that lead surrogate.\r
211         * @param property data value for a surrogate from the trie, including \r
212         *        the folding offset\r
213         * @return data offset or 0 if there is no data for the lead surrogate\r
214         */\r
215         /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */\r
216         public int getFoldingOffset(int value){\r
217             return (int)(value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;\r
218         }\r
219     }\r
220          \r
221     /****************************************************/\r
222     \r
223     \r
224     private static FCDTrieImpl fcdTrieImpl;\r
225     private static NormTrieImpl normTrieImpl;\r
226     private static AuxTrieImpl auxTrieImpl;\r
227     private static int[] indexes;\r
228     private static char[] combiningTable;\r
229     private static char[] extraData;\r
230     private static Object[] canonStartSets;\r
231     \r
232     private static boolean isDataLoaded;\r
233     private static boolean isFormatVersion_2_1;\r
234     private static boolean isFormatVersion_2_2;\r
235     private static byte[] unicodeVersion;\r
236     \r
237     /**\r
238      * Default buffer size of datafile\r
239      */\r
240     private static final int DATA_BUFFER_SIZE = 25000;\r
241     \r
242     /**\r
243      * FCD check: everything below this code point is known to have a 0 \r
244      * lead combining class \r
245      */\r
246     public static final int MIN_WITH_LEAD_CC=0x300;\r
247 \r
248 \r
249     /**\r
250      * Bit 7 of the length byte for a decomposition string in extra data is\r
251      * a flag indicating whether the decomposition string is\r
252      * preceded by a 16-bit word with the leading and trailing cc\r
253      * of the decomposition (like for A-umlaut);\r
254      * if not, then both cc's are zero (like for compatibility ideographs).\r
255      */\r
256     private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;\r
257     /**\r
258      * Bits 6..0 of the length byte contain the actual length.\r
259      */\r
260     private static final int DECOMP_LENGTH_MASK=0x7f;   \r
261     \r
262     /** Length of the BMP portion of the index (stage 1) array. */\r
263     private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;\r
264     /** Number of bits of a trail surrogate that are used in index table \r
265      * lookups. \r
266      */\r
267     private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;\r
268 \r
269 \r
270    // public utility\r
271    public static int getFromIndexesArr(int index){\r
272         return indexes[index];\r
273    }\r
274    \r
275    // protected constructor ---------------------------------------------\r
276     \r
277     /**\r
278     * Constructor\r
279     * @exception thrown when data reading fails or data corrupted\r
280     */\r
281     private NormalizerImpl() throws IOException {\r
282         //data should be loaded only once\r
283         if(!isDataLoaded){\r
284             \r
285             // jar access\r
286             InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);\r
287             BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);\r
288             NormalizerDataReader reader = new NormalizerDataReader(b);\r
289             \r
290             // read the indexes            \r
291             indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);\r
292             \r
293             byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];\r
294             \r
295             int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];\r
296             combiningTable = new char[combiningTableTop];\r
297             \r
298             int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];\r
299             extraData = new char[extraDataTop];\r
300 \r
301             byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];\r
302             byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];\r
303             canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];\r
304             \r
305             fcdTrieImpl = new FCDTrieImpl();\r
306             normTrieImpl = new NormTrieImpl();\r
307             auxTrieImpl = new AuxTrieImpl();\r
308                         \r
309             // load the rest of the data data and initialize the data members\r
310             reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable, \r
311                         canonStartSets);\r
312                                        \r
313             NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );\r
314             FCDTrieImpl.fcdTrie   = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl  );\r
315             AuxTrieImpl.auxTrie   = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl  );\r
316             \r
317             // we reached here without any exceptions so the data is fully \r
318             // loaded set the variable to true\r
319             isDataLoaded = true;\r
320             \r
321             // get the data format version                           \r
322             byte[] formatVersion = reader.getDataFormatVersion();\r
323             \r
324             isFormatVersion_2_1 =( formatVersion[0]>2 \r
325                                     ||\r
326                                    (formatVersion[0]==2 && formatVersion[1]>=1)\r
327                                  );\r
328             isFormatVersion_2_2 =( formatVersion[0]>2 \r
329                                     ||\r
330                                    (formatVersion[0]==2 && formatVersion[1]>=2)\r
331                                  );\r
332             unicodeVersion = reader.getUnicodeVersion();\r
333             b.close();\r
334         }\r
335     }\r
336         \r
337     /* ---------------------------------------------------------------------- */\r
338     \r
339     /* Korean Hangul and Jamo constants */\r
340     \r
341     public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */\r
342     public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */\r
343     public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */\r
344     \r
345     public static final int HANGUL_BASE=0xac00;\r
346     \r
347     public static final int JAMO_L_COUNT=19;\r
348     public static final int JAMO_V_COUNT=21;\r
349     public static final int JAMO_T_COUNT=28;\r
350     public  static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;\r
351     \r
352     private static boolean isHangulWithoutJamoT(char c) {\r
353         c-=HANGUL_BASE;\r
354         return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;\r
355     }\r
356     \r
357     /* norm32 helpers */\r
358     \r
359     /* is this a norm32 with a regular index? */\r
360     private static boolean isNorm32Regular(long norm32) {\r
361         return norm32<MIN_SPECIAL;\r
362     }\r
363     \r
364     /* is this a norm32 with a special index for a lead surrogate? */\r
365     private static boolean isNorm32LeadSurrogate(long norm32) {\r
366         return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;\r
367     }\r
368     \r
369     /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */\r
370     private static boolean isNorm32HangulOrJamo(long norm32) {\r
371         return norm32>=MIN_HANGUL;\r
372     }\r
373     \r
374     /*\r
375      * Given isNorm32HangulOrJamo(),\r
376      * is this a Hangul syllable or a Jamo?\r
377      */\r
378 //    private static  boolean isHangulJamoNorm32HangulOrJamoL(long norm32) {\r
379 //        return norm32<MIN_JAMO_V;\r
380 //    }\r
381     \r
382     /*\r
383      * Given norm32 for Jamo V or T,\r
384      * is this a Jamo V?\r
385      */\r
386     private static boolean isJamoVTNorm32JamoV(long norm32) {\r
387         return norm32<JAMO_V_TOP;\r
388     }\r
389     \r
390     /* data access primitives ----------------------------------------------- */\r
391     \r
392     public static long/*unsigned*/ getNorm32(char c) {\r
393         return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));\r
394     }\r
395     \r
396     public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32, \r
397                                                                char c2) {\r
398         /*\r
399          * the surrogate index in norm32 stores only the number of the surrogate\r
400          * index block see gennorm/store.c/getFoldedNormValue()\r
401          */\r
402         return ((UNSIGNED_INT_MASK) & \r
403                     NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));\r
404     }\r
405     private static long getNorm32(int c){\r
406         return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));\r
407     }\r
408     \r
409 //    private static long getNorm32(int c,int mask){\r
410 //        long/*unsigned*/ norm32= getNorm32(UTF16.getLeadSurrogate(c));\r
411 //        if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {\r
412 //            /* c is a lead surrogate, get the real norm32 */\r
413 //            norm32=getNorm32FromSurrogatePair(norm32,UTF16.getTrailSurrogate(c));\r
414 //        }\r
415 //        return norm32; \r
416 //    }\r
417     \r
418     /*\r
419      * get a norm32 from text with complete code points\r
420      * (like from decompositions)\r
421      */\r
422     private static long/*unsigned*/ getNorm32(char[] p,int start,\r
423                                               int/*unsigned*/ mask) {\r
424         long/*unsigned*/ norm32= getNorm32(p[start]);\r
425         if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {\r
426             /* *p is a lead surrogate, get the real norm32 */\r
427             norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);\r
428         }\r
429         return norm32;\r
430     }\r
431     public static VersionInfo getUnicodeVersion(){\r
432         return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],\r
433                                        unicodeVersion[2], unicodeVersion[3]);\r
434     }\r
435     public static char    getFCD16(char c) {\r
436         return  FCDTrieImpl.fcdTrie.getLeadValue(c);\r
437     }\r
438     \r
439     public static char getFCD16FromSurrogatePair(char fcd16, char c2) {\r
440         /* the surrogate index in fcd16 is an absolute offset over the \r
441          * start of stage 1 \r
442          * */\r
443         return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);\r
444     }\r
445     public static int getFCD16(int c) {\r
446         return  FCDTrieImpl.fcdTrie.getCodePointValue(c);\r
447     }\r
448         \r
449     private static int getExtraDataIndex(long norm32) {\r
450         return (int)(norm32>>EXTRA_SHIFT);\r
451     }\r
452     \r
453     private static final class DecomposeArgs{\r
454         int /*unsigned byte*/ cc;\r
455         int /*unsigned byte*/ trailCC;\r
456         int length;\r
457     }\r
458     /**\r
459      * \r
460      * get the canonical or compatibility decomposition for one character \r
461      * \r
462      * @return index into the extraData array\r
463      */\r
464     private static int/*index*/ decompose(long/*unsigned*/ norm32, \r
465                                           int/*unsigned*/ qcMask, \r
466                                           DecomposeArgs args) {\r
467         int p= getExtraDataIndex(norm32);\r
468         args.length=extraData[p++];\r
469     \r
470         if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {\r
471             /* use compatibility decomposition, skip canonical data */\r
472             p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);\r
473             args.length>>=8;\r
474         }\r
475     \r
476         if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {\r
477             /* get the lead and trail cc's */\r
478             char bothCCs=extraData[p++];\r
479             args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);\r
480             args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;\r
481         } else {\r
482             /* lead and trail cc's are both 0 */\r
483             args.cc=args.trailCC=0;\r
484         }\r
485     \r
486         args.length&=DECOMP_LENGTH_MASK;\r
487         return p;\r
488     }\r
489     \r
490        \r
491     /**\r
492      * get the canonical decomposition for one character \r
493      * @return index into the extraData array\r
494      */\r
495     private static int decompose(long/*unsigned*/ norm32, \r
496                                  DecomposeArgs args) {\r
497                              \r
498         int p= getExtraDataIndex(norm32);\r
499         args.length=extraData[p++];\r
500     \r
501         if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {\r
502             /* get the lead and trail cc's */\r
503             char bothCCs=extraData[p++];\r
504             args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);\r
505             args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;\r
506         } else {\r
507             /* lead and trail cc's are both 0 */\r
508             args.cc=args.trailCC=0;\r
509         }\r
510     \r
511         args.length&=DECOMP_LENGTH_MASK;\r
512         return p;\r
513     }\r
514     \r
515     \r
516     private static final class NextCCArgs{\r
517         char[] source;\r
518         int next;\r
519         int limit;\r
520         char c;\r
521         char c2;\r
522     }\r
523     \r
524     /*\r
525      * get the combining class of (c, c2)= args.source[args.next++]\r
526      * before: args.next<args.limit  after: args.next<=args.limit\r
527      * if only one code unit is used, then c2==0\r
528      */\r
529     private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {\r
530         long /*unsigned*/ norm32;\r
531     \r
532         args.c=args.source[args.next++];\r
533         \r
534         norm32= getNorm32(args.c);\r
535         if((norm32 & CC_MASK)==0) {\r
536             args.c2=0;\r
537             return 0;\r
538         } else {\r
539             if(!isNorm32LeadSurrogate(norm32)) {\r
540                 args.c2=0;\r
541             } else {\r
542                 /* c is a lead surrogate, get the real norm32 */\r
543                 if(args.next!=args.limit && \r
544                         UTF16.isTrailSurrogate(args.c2=args.source[args.next])){\r
545                     ++args.next;\r
546                     norm32=getNorm32FromSurrogatePair(norm32, args.c2);\r
547                 } else {\r
548                     args.c2=0;\r
549                     return 0;\r
550                 }\r
551             }\r
552     \r
553             return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));\r
554         }\r
555     }\r
556 \r
557     private static final class PrevArgs{\r
558         char[] src;\r
559         int start;\r
560         int current;\r
561         char c;\r
562         char c2;\r
563     }\r
564     \r
565     /*\r
566      * read backwards and get norm32\r
567      * return 0 if the character is <minC\r
568      * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first \r
569      * surrogate but read second!)\r
570      */\r
571     private static long /*unsigned*/ getPrevNorm32(PrevArgs args,\r
572                                                       int/*unsigned*/ minC, \r
573                                                       int/*unsigned*/ mask) {\r
574         long/*unsigned*/ norm32;\r
575     \r
576         args.c=args.src[--args.current];\r
577         args.c2=0;\r
578     \r
579         /* check for a surrogate before getting norm32 to see if we need to \r
580          * predecrement further \r
581          */\r
582         if(args.c<minC) {\r
583             return 0;\r
584         } else if(!UTF16.isSurrogate(args.c)) {\r
585             return getNorm32(args.c);\r
586         } else if(UTF16.isLeadSurrogate(args.c)) {\r
587             /* unpaired first surrogate */\r
588             return 0;\r
589         } else if(args.current!=args.start && \r
590                     UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {\r
591             --args.current;\r
592             norm32=getNorm32(args.c2);\r
593     \r
594             if((norm32&mask)==0) {\r
595                 /* all surrogate pairs with this lead surrogate have \r
596                  * only irrelevant data \r
597                  */\r
598                 return 0;\r
599             } else {\r
600                 /* norm32 must be a surrogate special */\r
601                 return getNorm32FromSurrogatePair(norm32, args.c);\r
602             }\r
603         } else {\r
604             /* unpaired second surrogate */\r
605             args.c2=0;\r
606             return 0;\r
607         }\r
608     }\r
609     \r
610     /*\r
611      * get the combining class of (c, c2)=*--p\r
612      * before: start<p  after: start<=p\r
613      */\r
614     private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {\r
615 \r
616         return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,\r
617                                                          CC_MASK)>>CC_SHIFT));\r
618     }\r
619 \r
620     /*\r
621      * is this a safe boundary character for NF*D?\r
622      * (lead cc==0)\r
623      */\r
624     public static boolean isNFDSafe(long/*unsigned*/ norm32, \r
625                                      int/*unsigned*/ccOrQCMask, \r
626                                      int/*unsigned*/ decompQCMask) {\r
627         if((norm32&ccOrQCMask)==0) {\r
628             return true; /* cc==0 and no decomposition: this is NF*D safe */\r
629         }\r
630     \r
631         /* inspect its decomposition - maybe a Hangul but not a surrogate here*/\r
632         if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {\r
633             DecomposeArgs args=new DecomposeArgs();\r
634             /* decomposes, get everything from the variable-length extra data */\r
635             decompose(norm32, decompQCMask, args);\r
636             return args.cc==0;\r
637         } else {\r
638             /* no decomposition (or Hangul), test the cc directly */\r
639             return (norm32&CC_MASK)==0;\r
640         }\r
641     }\r
642     \r
643     /*\r
644      * is this (or does its decomposition begin with) a "true starter"?\r
645      * (cc==0 and NF*C_YES)\r
646      */\r
647     public static boolean isTrueStarter(long/*unsigned*/ norm32, \r
648                                           int/*unsigned*/ ccOrQCMask, \r
649                                           int/*unsigned*/ decompQCMask) {\r
650         if((norm32&ccOrQCMask)==0) {\r
651             return true; /* this is a true starter (could be Hangul or Jamo L)*/\r
652         }\r
653     \r
654         /* inspect its decomposition - not a Hangul or a surrogate here */\r
655         if((norm32&decompQCMask)!=0) {\r
656             int p; /* index into extra data array */\r
657             DecomposeArgs args=new DecomposeArgs();\r
658             /* decomposes, get everything from the variable-length extra data */\r
659             p=decompose(norm32, decompQCMask, args);\r
660           \r
661             if(args.cc==0) {\r
662                 int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;\r
663     \r
664                 /* does it begin with NFC_YES? */\r
665                 if((getNorm32(extraData,p, qcMask)&qcMask)==0) {\r
666                     /* yes, the decomposition begins with a true starter */\r
667                     return true;\r
668                 }\r
669             }\r
670         }\r
671         return false;\r
672     }\r
673 \r
674     /* reorder UTF-16 in-place ---------------------------------------------- */\r
675     \r
676     /**\r
677      * simpler, single-character version of mergeOrdered() -\r
678      * bubble-insert one single code point into the preceding string\r
679      * which is already canonically ordered\r
680      * (c, c2) may or may not yet have been inserted at src[current]..src[p]\r
681      *\r
682      * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)\r
683      *\r
684      * before: src[start]..src[current] is already ordered, and\r
685      *         src[current]..src[p]     may or may not hold (c, c2) but\r
686      *                          must be exactly the same length as (c, c2)\r
687      * after: src[start]..src[p] is ordered\r
688      *\r
689      * @return the trailing combining class\r
690      */\r
691     private static int/*unsigned byte*/ insertOrdered(char[] source, \r
692                                                       int start, \r
693                                                       int current, int p,\r
694                                                          char c, char c2, \r
695                                                          int/*unsigned byte*/ cc) {\r
696         int back, preBack;\r
697         int r;\r
698         int prevCC, trailCC=cc;\r
699     \r
700         if(start<current && cc!=0) {\r
701             // search for the insertion point where cc>=prevCC \r
702             preBack=back=current;\r
703             PrevArgs prevArgs = new PrevArgs();\r
704             prevArgs.current  = current;\r
705             prevArgs.start    = start;\r
706             prevArgs.src      = source;\r
707             // get the prevCC \r
708             prevCC=getPrevCC(prevArgs);\r
709             preBack = prevArgs.current;\r
710             \r
711             if(cc<prevCC) {\r
712                 // this will be the last code point, so keep its cc \r
713                 trailCC=prevCC;\r
714                 back=preBack;\r
715                 while(start<preBack) {\r
716                     prevCC=getPrevCC(prevArgs);\r
717                     preBack=prevArgs.current;\r
718                     if(cc>=prevCC) {\r
719                         break;\r
720                     }\r
721                     back=preBack;\r
722                 }\r
723     \r
724                 \r
725                 // this is where we are right now with all these indicies:\r
726                 // [start]..[pPreBack] 0..? code points that we can ignore\r
727                 // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc\r
728                 // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)\r
729                 // [current]..[p]         1 code point (c, c2) with cc\r
730                  \r
731                 // move the code units in between up \r
732                 r=p;\r
733                 do {\r
734                     source[--r]=source[--current];\r
735                 } while(back!=current);\r
736             }\r
737         }\r
738     \r
739         // insert (c, c2) \r
740         source[current]=c;\r
741         if(c2!=0) {\r
742             source[(current+1)]=c2;\r
743         }\r
744     \r
745         // we know the cc of the last code point \r
746         return trailCC;\r
747     }\r
748     \r
749     /**\r
750      * merge two UTF-16 string parts together\r
751      * to canonically order (order by combining classes) their concatenation\r
752      *\r
753      * the two strings may already be adjacent, so that the merging is done \r
754      * in-place if the two strings are not adjacent, then the buffer holding the\r
755      * first one must be large enough\r
756      * the second string may or may not be ordered in itself\r
757      *\r
758      * before: [start]..[current] is already ordered, and\r
759      *         [next]..[limit]    may be ordered in itself, but\r
760      *                          is not in relation to [start..current[\r
761      * after: [start..current+(limit-next)[ is ordered\r
762      *\r
763      * the algorithm is a simple bubble-sort that takes the characters from \r
764      * src[next++] and inserts them in correct combining class order into the \r
765      * preceding part of the string\r
766      *\r
767      * since this function is called much less often than the single-code point\r
768      * insertOrdered(), it just uses that for easier maintenance\r
769      *\r
770      * @return the trailing combining class\r
771      */\r
772     private static int /*unsigned byte*/ mergeOrdered(char[] source,\r
773                                                       int start, \r
774                                                       int current,\r
775                                                       char[] data,\r
776                                                         int next, \r
777                                                         int limit, \r
778                                                         boolean isOrdered) {\r
779             int r;\r
780             int /*unsigned byte*/ cc, trailCC=0;\r
781             boolean adjacent;\r
782         \r
783             adjacent= current==next;\r
784             NextCCArgs ncArgs = new NextCCArgs();\r
785             ncArgs.source = data;\r
786             ncArgs.next   = next;\r
787             ncArgs.limit  = limit;\r
788             \r
789             if(start!=current || !isOrdered) {\r
790                     \r
791                 while(ncArgs.next<ncArgs.limit) {\r
792                     cc=getNextCC(ncArgs);\r
793                     if(cc==0) {\r
794                         // does not bubble back \r
795                         trailCC=0;\r
796                         if(adjacent) {\r
797                             current=ncArgs.next;\r
798                         } else {\r
799                             data[current++]=ncArgs.c;\r
800                             if(ncArgs.c2!=0) {\r
801                                 data[current++]=ncArgs.c2;\r
802                             }\r
803                         }\r
804                         if(isOrdered) {\r
805                             break;\r
806                         } else {\r
807                             start=current;\r
808                         }\r
809                     } else {\r
810                         r=current+(ncArgs.c2==0 ? 1 : 2);\r
811                         trailCC=insertOrdered(source,start, current, r, \r
812                                               ncArgs.c, ncArgs.c2, cc);\r
813                         current=r;\r
814                     }\r
815                 }\r
816             }\r
817         \r
818             if(ncArgs.next==ncArgs.limit) {\r
819                 // we know the cc of the last code point \r
820                 return trailCC;\r
821             } else {\r
822                 if(!adjacent) {\r
823                     // copy the second string part \r
824                     do {\r
825                         source[current++]=data[ncArgs.next++];\r
826                     } while(ncArgs.next!=ncArgs.limit);\r
827                     ncArgs.limit=current;\r
828                 }\r
829                 PrevArgs prevArgs = new PrevArgs();\r
830                 prevArgs.src   = data;\r
831                 prevArgs.start = start;\r
832                 prevArgs.current =  ncArgs.limit;\r
833                 return getPrevCC(prevArgs);\r
834             }\r
835 \r
836     }\r
837     private static int /*unsigned byte*/ mergeOrdered(char[] source,\r
838                                                       int start, \r
839                                                       int current,\r
840                                                       char[] data,\r
841                                                         final int next, \r
842                                                         final int limit) {\r
843         return mergeOrdered(source,start,current,data,next,limit,true);\r
844     } \r
845 \r
846     \r
847       \r
848     public static boolean checkFCD(char[] src,int srcStart, int srcLimit,\r
849                                    UnicodeSet nx) {\r
850 \r
851         char fcd16,c,c2;\r
852         int prevCC=0, cc;\r
853         int i =srcStart, length = srcLimit;\r
854     \r
855         for(;;) {\r
856             for(;;) {\r
857                 if(i==length) {\r
858                     return true;\r
859                 } else if((c=src[i++])<MIN_WITH_LEAD_CC) {\r
860                     prevCC=(int)-c;\r
861                 } else if((fcd16=getFCD16(c))==0) {\r
862                     prevCC=0;\r
863                 } else {\r
864                     break;\r
865                 }\r
866             }\r
867 \r
868             // check one above-minimum, relevant code unit \r
869             if(UTF16.isLeadSurrogate(c)) {\r
870                 // c is a lead surrogate, get the real fcd16 \r
871                 if(i!=length && UTF16.isTrailSurrogate(c2=src[i])) {\r
872                     ++i;\r
873                     fcd16=getFCD16FromSurrogatePair(fcd16, c2);\r
874                 } else {\r
875                     c2=0;\r
876                     fcd16=0;\r
877                 }\r
878             }else{\r
879                 c2=0;\r
880             }\r
881             \r
882             if(nx_contains(nx, c, c2)) {\r
883                 prevCC=0; /* excluded: fcd16==0 */\r
884                 continue;\r
885             }\r
886 \r
887             // prevCC has values from the following ranges:\r
888             // 0..0xff -the previous trail combining class\r
889             // <0      -the negative value of the previous code unit;\r
890             //          that code unit was <MIN_WITH_LEAD_CC and its getFCD16()\r
891             //          was deferred so that average text is checked faster\r
892             //\r
893     \r
894             // check the combining order \r
895             cc=(int)(fcd16>>8);\r
896             if(cc!=0) {\r
897                 if(prevCC<0) {\r
898                     // the previous character was <_NORM_MIN_WITH_LEAD_CC, \r
899                     // we need to get its trail cc \r
900                     //\r
901                     if(!nx_contains(nx, (int)-prevCC)) {\r
902                         prevCC=(int)(FCDTrieImpl.fcdTrie.getBMPValue(\r
903                                              (char)-prevCC)&0xff\r
904                                              ); \r
905                     } else {\r
906                         prevCC=0; /* excluded: fcd16==0 */\r
907                     }\r
908                                       \r
909                 }\r
910     \r
911                 if(cc<prevCC) {\r
912                     return false;\r
913                 }\r
914             }\r
915             prevCC=(int)(fcd16&0xff);\r
916         }\r
917     }\r
918     \r
919     public static Normalizer.QuickCheckResult quickCheck(char[] src,\r
920                                                             int srcStart, \r
921                                                             int srcLimit,\r
922                                                             int minNoMaybe,\r
923                                                             int qcMask,\r
924                                                             int options,\r
925                                                             boolean allowMaybe,\r
926                                                             UnicodeSet nx){\r
927 \r
928         int ccOrQCMask;\r
929         long norm32;\r
930         char c, c2;\r
931         char cc, prevCC;\r
932         long qcNorm32;\r
933         Normalizer.QuickCheckResult result;\r
934         ComposePartArgs args = new ComposePartArgs();\r
935         char[] buffer ;\r
936         int start = srcStart;\r
937         \r
938         if(!isDataLoaded) {\r
939             return Normalizer.MAYBE;\r
940         }\r
941         // initialize \r
942         ccOrQCMask=CC_MASK|qcMask;\r
943         result=Normalizer.YES;\r
944         prevCC=0;\r
945                 \r
946         for(;;) {\r
947             for(;;) {\r
948                 if(srcStart==srcLimit) {\r
949                     return result;\r
950                 } else if((c=src[srcStart++])>=minNoMaybe && \r
951                                   (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {\r
952                     break;\r
953                 }\r
954                 prevCC=0;\r
955             }\r
956             \r
957     \r
958             // check one above-minimum, relevant code unit \r
959             if(isNorm32LeadSurrogate(norm32)) {\r
960                 // c is a lead surrogate, get the real norm32 \r
961                 if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {\r
962                     ++srcStart;\r
963                     norm32=getNorm32FromSurrogatePair(norm32,c2);\r
964                 } else {\r
965                     norm32=0;\r
966                     c2=0;\r
967                 }\r
968             }else{\r
969                 c2=0;\r
970             }\r
971             if(nx_contains(nx, c, c2)) {\r
972                 /* excluded: norm32==0 */\r
973                 norm32=0;\r
974             }\r
975     \r
976             // check the combining order \r
977             cc=(char)((norm32>>CC_SHIFT)&0xFF);\r
978             if(cc!=0 && cc<prevCC) {\r
979                 return Normalizer.NO;\r
980             }\r
981             prevCC=cc;\r
982     \r
983             // check for "no" or "maybe" quick check flags \r
984             qcNorm32 = norm32 & qcMask;\r
985             if((qcNorm32& QC_ANY_NO)>=1) {\r
986                 result= Normalizer.NO;\r
987                 break;\r
988             } else if(qcNorm32!=0) {\r
989                 // "maybe" can only occur for NFC and NFKC \r
990                 if(allowMaybe){\r
991                     result=Normalizer.MAYBE;\r
992                 }else{\r
993                     // normalize a section around here to see if it is really \r
994                     // normalized or not \r
995                     int prevStarter;\r
996                     int/*unsigned*/ decompQCMask;\r
997     \r
998                     decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask \r
999     \r
1000                     // find the previous starter \r
1001 \r
1002                     // set prevStarter to the beginning of the current character \r
1003                     prevStarter=srcStart-1; \r
1004                     if(UTF16.isTrailSurrogate(src[prevStarter])) {\r
1005                         // safe because unpaired surrogates do not result \r
1006                         // in "maybe"\r
1007                         --prevStarter; \r
1008                     }\r
1009 \r
1010                     prevStarter=findPreviousStarter(src, start, prevStarter,\r
1011                                                     ccOrQCMask, decompQCMask,\r
1012                                                     (char)minNoMaybe);\r
1013     \r
1014                     // find the next true starter in [src..limit[ - modifies \r
1015                     // src to point to the next starter \r
1016                     srcStart=findNextStarter(src,srcStart, srcLimit, qcMask, \r
1017                                              decompQCMask,(char) minNoMaybe);\r
1018                     \r
1019                     //set the args for compose part\r
1020                     args.prevCC = prevCC;\r
1021                        \r
1022                     // decompose and recompose [prevStarter..src[ \r
1023                     buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);\r
1024     \r
1025                     // compare the normalized version with the original \r
1026                     if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {\r
1027                         result=Normalizer.NO; // normalization differs \r
1028                         break;\r
1029                     }\r
1030     \r
1031                     // continue after the next starter \r
1032                 }\r
1033             }\r
1034         }\r
1035         return result;\r
1036     } \r
1037  \r
1038        \r
1039     //------------------------------------------------------ \r
1040     // make NFD & NFKD \r
1041     //------------------------------------------------------\r
1042     public static int getDecomposition(int c /*UTF-32*/ , \r
1043                                         boolean compat,\r
1044                                            char[] dest,\r
1045                                            int destStart, \r
1046                                            int destCapacity) {\r
1047             \r
1048         if( (UNSIGNED_INT_MASK & c)<=0x10ffff) {\r
1049             long /*unsigned*/ norm32;\r
1050             int qcMask;\r
1051             int minNoMaybe;\r
1052             int length;\r
1053     \r
1054             // initialize \r
1055             if(!compat) {\r
1056                 minNoMaybe=(int)indexes[INDEX_MIN_NFD_NO_MAYBE];\r
1057                 qcMask=QC_NFD;\r
1058             } else {\r
1059                 minNoMaybe=(int)indexes[INDEX_MIN_NFKD_NO_MAYBE];\r
1060                 qcMask=QC_NFKD;\r
1061             }\r
1062     \r
1063             if(c<minNoMaybe) {\r
1064                 // trivial case \r
1065                 if(destCapacity>0) {\r
1066                     dest[0]=(char)c;\r
1067                 }\r
1068                 return -1;\r
1069             }\r
1070     \r
1071             /* data lookup */\r
1072             norm32=getNorm32(c);\r
1073             if((norm32&qcMask)==0) {\r
1074                 /* simple case: no decomposition */\r
1075                 if(c<=0xffff) {\r
1076                     if(destCapacity>0) {\r
1077                         dest[0]=(char)c;\r
1078                     }\r
1079                     return -1;\r
1080                 } else {\r
1081                     if(destCapacity>=2) {\r
1082                         dest[0]=UTF16.getLeadSurrogate(c);\r
1083                         dest[1]=UTF16.getTrailSurrogate(c);\r
1084                     }\r
1085                     return -2;\r
1086                 }\r
1087             } else if(isNorm32HangulOrJamo(norm32)) {\r
1088                 /* Hangul syllable: decompose algorithmically */\r
1089                 char c2;\r
1090     \r
1091                 c-=HANGUL_BASE;\r
1092     \r
1093                 c2=(char)(c%JAMO_T_COUNT);\r
1094                 c/=JAMO_T_COUNT;\r
1095                 if(c2>0) {\r
1096                     if(destCapacity>=3) {\r
1097                         dest[2]=(char)(JAMO_T_BASE+c2);\r
1098                     }\r
1099                     length=3;\r
1100                 } else {\r
1101                     length=2;\r
1102                 }\r
1103     \r
1104                 if(destCapacity>=2) {\r
1105                     dest[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);\r
1106                     dest[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);\r
1107                 }\r
1108                 return length;\r
1109             } else {\r
1110                 /* c decomposes, get everything from the variable-length extra \r
1111                  * data \r
1112                  */\r
1113                 int p, limit;\r
1114                 DecomposeArgs args = new DecomposeArgs();\r
1115                 /* the index into extra data array*/                 \r
1116                 p=decompose(norm32, qcMask, args);\r
1117                 if(args.length<=destCapacity) {\r
1118                     limit=p+args.length;\r
1119                     do {\r
1120                         dest[destStart++]=extraData[p++];\r
1121                     } while(p<limit);\r
1122                 }\r
1123                 return args.length;\r
1124             }\r
1125         } else {\r
1126             return 0;\r
1127         }\r
1128     }\r
1129 \r
1130     \r
1131     public static int decompose(char[] src,int srcStart,int srcLimit,\r
1132                                 char[] dest,int destStart,int destLimit,\r
1133                                  boolean compat,int[] outTrailCC,\r
1134                                  UnicodeSet nx) {\r
1135                                 \r
1136         char[] buffer = new char[3];\r
1137         int prevSrc;\r
1138         long norm32;\r
1139         int ccOrQCMask, qcMask;\r
1140         int reorderStartIndex, length;\r
1141         char c, c2, minNoMaybe;\r
1142         int/*unsigned byte*/ cc, prevCC, trailCC;\r
1143         char[] p;\r
1144         int pStart;\r
1145         int destIndex = destStart;\r
1146         int srcIndex = srcStart;\r
1147         if(!compat) {\r
1148             minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];\r
1149             qcMask=QC_NFD;\r
1150         } else {\r
1151             minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];\r
1152             qcMask=QC_NFKD;\r
1153         }\r
1154     \r
1155         /* initialize */\r
1156         ccOrQCMask=CC_MASK|qcMask;\r
1157         reorderStartIndex=0;\r
1158         prevCC=0;\r
1159         norm32=0;\r
1160         c=0;\r
1161         pStart=0;\r
1162         \r
1163         cc=trailCC=-1;//initialize to bogus value\r
1164         \r
1165         for(;;) {\r
1166             /* count code units below the minimum or with irrelevant data for \r
1167              * the quick check \r
1168              */\r
1169             prevSrc=srcIndex;\r
1170 \r
1171             while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe || \r
1172                                         ((norm32=getNorm32(c))&ccOrQCMask)==0)){\r
1173                 prevCC=0;\r
1174                 ++srcIndex;\r
1175             }\r
1176 \r
1177             /* copy these code units all at once */\r
1178             if(srcIndex!=prevSrc) {\r
1179                 length=(int)(srcIndex-prevSrc);\r
1180                 if((destIndex+length)<=destLimit) {\r
1181                     System.arraycopy(src,prevSrc,dest,destIndex,length);\r
1182                 }\r
1183               \r
1184                 destIndex+=length;\r
1185                 reorderStartIndex=destIndex;\r
1186             }\r
1187     \r
1188             /* end of source reached? */\r
1189             if(srcIndex==srcLimit) {\r
1190                 break;\r
1191             }\r
1192     \r
1193             /* c already contains *src and norm32 is set for it, increment src*/\r
1194             ++srcIndex;\r
1195     \r
1196             /* check one above-minimum, relevant code unit */\r
1197             /*\r
1198              * generally, set p and length to the decomposition string\r
1199              * in simple cases, p==NULL and (c, c2) will hold the length code \r
1200              * units to append in all cases, set cc to the lead and trailCC to \r
1201              * the trail combining class\r
1202              *\r
1203              * the following merge-sort of the current character into the \r
1204              * preceding, canonically ordered result text will use the \r
1205              * optimized insertOrdered()\r
1206              * if there is only one single code point to process;\r
1207              * this is indicated with p==NULL, and (c, c2) is the character to \r
1208              * insert\r
1209              * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)\r
1210              * for a supplementary character)\r
1211              * otherwise, p[length] is merged in with _mergeOrdered()\r
1212              */\r
1213             if(isNorm32HangulOrJamo(norm32)) {\r
1214                 if(nx_contains(nx, c)) {\r
1215                     c2=0;\r
1216                     p=null;\r
1217                     length=1;\r
1218                 } else {\r
1219                     // Hangul syllable: decompose algorithmically \r
1220                     p=buffer;\r
1221                     pStart=0;\r
1222                     cc=trailCC=0;\r
1223     \r
1224                     c-=HANGUL_BASE;\r
1225     \r
1226                     c2=(char)(c%JAMO_T_COUNT);\r
1227                     c/=JAMO_T_COUNT;\r
1228                     if(c2>0) {\r
1229                         buffer[2]=(char)(JAMO_T_BASE+c2);\r
1230                         length=3;\r
1231                     } else {\r
1232                         length=2;\r
1233                     }\r
1234     \r
1235                     buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);\r
1236                     buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);\r
1237                 }\r
1238             } else {\r
1239                 if(isNorm32Regular(norm32)) {\r
1240                     c2=0;\r
1241                     length=1;\r
1242                 } else {\r
1243                     // c is a lead surrogate, get the real norm32 \r
1244                     if(srcIndex!=srcLimit && \r
1245                                     UTF16.isTrailSurrogate(c2=src[srcIndex])) {\r
1246                         ++srcIndex;\r
1247                         length=2;\r
1248                         norm32=getNorm32FromSurrogatePair(norm32, c2);\r
1249                     } else {\r
1250                         c2=0;\r
1251                         length=1;\r
1252                         norm32=0;\r
1253                     }\r
1254                 }\r
1255     \r
1256                 /* get the decomposition and the lead and trail cc's */\r
1257                 if(nx_contains(nx, c, c2)) {\r
1258                     /* excluded: norm32==0 */\r
1259                     cc=trailCC=0;\r
1260                     p=null;\r
1261                 } else if((norm32&qcMask)==0) {\r
1262                     /* c does not decompose */\r
1263                     cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));\r
1264                     p=null;\r
1265                     pStart=-1;\r
1266                 } else {\r
1267                     DecomposeArgs arg = new DecomposeArgs();\r
1268                     /* c decomposes, get everything from the variable-length \r
1269                      * extra data \r
1270                      */\r
1271                     pStart=decompose(norm32, qcMask, arg);\r
1272                     p=extraData;\r
1273                     length=arg.length;\r
1274                     cc=arg.cc;\r
1275                     trailCC=arg.trailCC;\r
1276                     if(length==1) {\r
1277                         /* fastpath a single code unit from decomposition */\r
1278                         c=p[pStart];\r
1279                         c2=0;\r
1280                         p=null;\r
1281                         pStart=-1;\r
1282                     }\r
1283                 }\r
1284             }\r
1285     \r
1286             /* append the decomposition to the destination buffer, assume \r
1287              * length>0 \r
1288              */\r
1289             if((destIndex+length)<=destLimit) {\r
1290                 int reorderSplit=destIndex;\r
1291                 if(p==null) {\r
1292                     /* fastpath: single code point */\r
1293                     if(cc!=0 && cc<prevCC) {\r
1294                         /* (c, c2) is out of order with respect to the preceding\r
1295                          *  text \r
1296                          */\r
1297                         destIndex+=length;\r
1298                         trailCC=insertOrdered(dest,reorderStartIndex, \r
1299                                             reorderSplit, destIndex, c, c2, cc);\r
1300                     } else {\r
1301                         /* just append (c, c2) */\r
1302                         dest[destIndex++]=c;\r
1303                         if(c2!=0) {\r
1304                             dest[destIndex++]=c2;\r
1305                         }\r
1306                     }\r
1307                 } else {\r
1308                     /* general: multiple code points (ordered by themselves) \r
1309                      * from decomposition \r
1310                      */\r
1311                     if(cc!=0 && cc<prevCC) {\r
1312                         /* the decomposition is out of order with respect to the\r
1313                          *  preceding text \r
1314                          */\r
1315                         destIndex+=length;\r
1316                         trailCC=mergeOrdered(dest,reorderStartIndex, \r
1317                                           reorderSplit,p, pStart,pStart+length);\r
1318                     } else {\r
1319                         /* just append the decomposition */\r
1320                         do {\r
1321                             dest[destIndex++]=p[pStart++];\r
1322                         } while(--length>0);\r
1323                     }\r
1324                 }\r
1325             } else {\r
1326                 /* buffer overflow */\r
1327                 /* keep incrementing the destIndex for preflighting */\r
1328                 destIndex+=length;\r
1329             }\r
1330     \r
1331             prevCC=trailCC;\r
1332             if(prevCC==0) {\r
1333                 reorderStartIndex=destIndex;\r
1334             }\r
1335         }\r
1336     \r
1337         outTrailCC[0]=prevCC;\r
1338 \r
1339         return destIndex - destStart;\r
1340     }\r
1341     \r
1342     /* make NFC & NFKC ------------------------------------------------------ */\r
1343     private static final class NextCombiningArgs{\r
1344         char[] source;\r
1345         int start;\r
1346         //int limit;\r
1347         char c;\r
1348         char c2;\r
1349         int/*unsigned*/ combiningIndex;\r
1350         char /*unsigned byte*/ cc;\r
1351     }\r
1352     \r
1353     /* get the composition properties of the next character */\r
1354     private static int /*unsigned*/    getNextCombining(NextCombiningArgs args,\r
1355                                                     int limit,\r
1356                                                     UnicodeSet nx) {\r
1357         long/*unsigned*/ norm32; \r
1358         int combineFlags;\r
1359         /* get properties */\r
1360         args.c=args.source[args.start++];\r
1361         norm32=getNorm32(args.c);\r
1362         \r
1363         /* preset output values for most characters */\r
1364         args.c2=0;\r
1365         args.combiningIndex=0;\r
1366         args.cc=0;\r
1367         \r
1368         if((norm32&(CC_MASK|COMBINES_ANY))==0) {\r
1369             return 0;\r
1370         } else {\r
1371             if(isNorm32Regular(norm32)) {\r
1372                 /* set cc etc. below */\r
1373             } else if(isNorm32HangulOrJamo(norm32)) {\r
1374                 /* a compatibility decomposition contained Jamos */\r
1375                 args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|\r
1376                                                         (norm32>>EXTRA_SHIFT)));\r
1377                 return (int)(norm32&COMBINES_ANY);\r
1378             } else {\r
1379                 /* c is a lead surrogate, get the real norm32 */\r
1380                 if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=\r
1381                                                      args.source[args.start])) {\r
1382                     ++args.start;\r
1383                     norm32=getNorm32FromSurrogatePair(norm32, args.c2);\r
1384                 } else {\r
1385                     args.c2=0;\r
1386                     return 0;\r
1387                 }\r
1388             }\r
1389             \r
1390             if(nx_contains(nx, args.c, args.c2)) {\r
1391                 return 0; /* excluded: norm32==0 */\r
1392             }\r
1393     \r
1394             args.cc= (char)((norm32>>CC_SHIFT)&0xff);\r
1395         \r
1396             combineFlags=(int)(norm32&COMBINES_ANY);\r
1397             if(combineFlags!=0) {\r
1398                 int index = getExtraDataIndex(norm32);\r
1399                 args.combiningIndex=index>0 ? extraData[(index-1)] :0;\r
1400             }\r
1401     \r
1402             return combineFlags;\r
1403         }\r
1404     }\r
1405     \r
1406     /*\r
1407      * given a composition-result starter (c, c2) - which means its cc==0,\r
1408      * it combines forward, it has extra data, its norm32!=0,\r
1409      * it is not a Hangul or Jamo,\r
1410      * get just its combineFwdIndex\r
1411      *\r
1412      * norm32(c) is special if and only if c2!=0\r
1413      */\r
1414     private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){\r
1415         long/*unsigned*/ norm32;\r
1416     \r
1417         norm32=getNorm32(c);\r
1418         if(c2!=0) {\r
1419             norm32=getNorm32FromSurrogatePair(norm32, c2);\r
1420         }\r
1421         return extraData[(getExtraDataIndex(norm32)-1)];\r
1422     }\r
1423     \r
1424     /*\r
1425      * Find the recomposition result for\r
1426      * a forward-combining character\r
1427      * (specified with a pointer to its part of the combiningTable[])\r
1428      * and a backward-combining character\r
1429      * (specified with its combineBackIndex).\r
1430      *\r
1431      * If these two characters combine, then set (value, value2)\r
1432      * with the code unit(s) of the composition character.\r
1433      *\r
1434      * Return value:\r
1435      * 0    do not combine\r
1436      * 1    combine\r
1437      * >1   combine, and the composition is a forward-combining starter\r
1438      *\r
1439      * See unormimp.h for a description of the composition table format.\r
1440      */\r
1441     private static int/*unsigned*/ combine(char[]table,int tableStart, \r
1442                                    int/*unsinged*/ combineBackIndex,\r
1443                                     int[] outValues) {\r
1444         int/*unsigned*/ key;\r
1445         int value,value2;\r
1446         \r
1447         if(outValues.length<2){\r
1448             throw new IllegalArgumentException();\r
1449         }\r
1450         \r
1451         /* search in the starter's composition table */\r
1452         for(;;) {\r
1453             key=table[tableStart++];\r
1454             if(key>=combineBackIndex) {\r
1455                 break;\r
1456             }\r
1457             tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;\r
1458         }\r
1459     \r
1460         /* mask off bit 15, the last-entry-in-the-list flag */\r
1461         if((key&0x7fff)==combineBackIndex) {\r
1462             /* found! combine! */\r
1463             value=table[tableStart];\r
1464     \r
1465             /* is the composition a starter that combines forward? */\r
1466             key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));\r
1467     \r
1468             /* get the composition result code point from the variable-length \r
1469              * result value \r
1470              */\r
1471             if((value&0x8000) != 0) {\r
1472                 if((value&0x4000) != 0) {\r
1473                     /* surrogate pair composition result */\r
1474                     value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));\r
1475                     value2=table[tableStart+1];\r
1476                 } else {\r
1477                     /* BMP composition result U+2000..U+ffff */\r
1478                     value=table[tableStart+1];\r
1479                     value2=0;\r
1480                 }\r
1481             } else {\r
1482                 /* BMP composition result U+0000..U+1fff */\r
1483                 value&=0x1fff;\r
1484                 value2=0;\r
1485             }\r
1486             outValues[0]=value;\r
1487             outValues[1]=value2;    \r
1488             return key;\r
1489         } else {\r
1490             /* not found */\r
1491             return 0;\r
1492         }\r
1493     }\r
1494     \r
1495     \r
1496     private static final class RecomposeArgs{\r
1497         char[] source;\r
1498         int start;\r
1499         int limit;\r
1500     }\r
1501     /*\r
1502      * recompose the characters in [p..limit[\r
1503      * (which is in NFD - decomposed and canonically ordered),\r
1504      * adjust limit, and return the trailing cc\r
1505      *\r
1506      * since for NFKC we may get Jamos in decompositions, we need to\r
1507      * recompose those too\r
1508      *\r
1509      * note that recomposition never lengthens the text:\r
1510      * any character consists of either one or two code units;\r
1511      * a composition may contain at most one more code unit than the original \r
1512      * starter, while the combining mark that is removed has at least one code \r
1513      * unit\r
1514      */\r
1515     private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {\r
1516         int  remove, q, r;\r
1517         int /*unsigned*/ combineFlags;\r
1518         int /*unsigned*/ combineFwdIndex, combineBackIndex;\r
1519         int /*unsigned*/ result, value=0, value2=0;\r
1520         int /*unsigned byte*/  prevCC;\r
1521         boolean starterIsSupplementary;\r
1522         int starter;\r
1523         int[] outValues = new int[2];\r
1524         starter=-1;                   /* no starter */\r
1525         combineFwdIndex=0;            /* will not be used until starter!=NULL */\r
1526         starterIsSupplementary=false; /* will not be used until starter!=NULL */\r
1527         prevCC=0;\r
1528         \r
1529         NextCombiningArgs ncArg = new NextCombiningArgs();\r
1530         ncArg.source  = args.source;\r
1531         \r
1532         ncArg.cc      =0;\r
1533         ncArg.c2      =0;    \r
1534 \r
1535         for(;;) {\r
1536             ncArg.start = args.start;\r
1537             combineFlags=getNextCombining(ncArg,args.limit,nx);\r
1538             combineBackIndex=ncArg.combiningIndex;\r
1539             args.start = ncArg.start;\r
1540                         \r
1541             if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {\r
1542                 if((combineBackIndex&0x8000)!=0) {\r
1543                     /* c is a Jamo V/T, see if we can compose it with the \r
1544                      * previous character \r
1545                      */\r
1546                     /* for the PRI #29 fix, check that there is no intervening combining mark */\r
1547                     if((options&BEFORE_PRI_29)!=0 || prevCC==0) {\r
1548                         remove=-1; /* NULL while no Hangul composition */\r
1549                         combineFlags=0;\r
1550                         ncArg.c2=args.source[starter];\r
1551                         if(combineBackIndex==0xfff2) {\r
1552                             /* Jamo V, compose with previous Jamo L and following \r
1553                              * Jamo T \r
1554                              */\r
1555                             ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);\r
1556                             if(ncArg.c2<JAMO_L_COUNT) {\r
1557                                 remove=args.start-1;\r
1558                                 ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+\r
1559                                                (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);\r
1560                                 if(args.start!=args.limit && \r
1561                                             (ncArg.c2=(char)(args.source[args.start]\r
1562                                              -JAMO_T_BASE))<JAMO_T_COUNT) {\r
1563                                     ++args.start;\r
1564                                     ncArg.c+=ncArg.c2;\r
1565                                  } else {\r
1566                                      /* the result is an LV syllable, which is a starter (unlike LVT) */\r
1567                                      combineFlags=COMBINES_FWD;\r
1568                                 }\r
1569                                 if(!nx_contains(nx, ncArg.c)) {\r
1570                                     args.source[starter]=ncArg.c;\r
1571                                    } else {\r
1572                                     /* excluded */\r
1573                                     if(!isHangulWithoutJamoT(ncArg.c)) {\r
1574                                         --args.start; /* undo the ++args.start from reading the Jamo T */\r
1575                                     }\r
1576                                     /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */\r
1577                                     remove=args.start;\r
1578                                 }\r
1579                             }\r
1580 \r
1581                         /*\r
1582                          * Normally, the following can not occur:\r
1583                          * Since the input is in NFD, there are no Hangul LV syllables that\r
1584                          * a Jamo T could combine with.\r
1585                          * All Jamo Ts are combined above when handling Jamo Vs.\r
1586                          *\r
1587                          * However, before the PRI #29 fix, this can occur due to\r
1588                          * an intervening combining mark between the Hangul LV and the Jamo T.\r
1589                          */\r
1590                         } else {\r
1591                             /* Jamo T, compose with previous Hangul that does not have a Jamo T */\r
1592                             if(isHangulWithoutJamoT(ncArg.c2)) {\r
1593                                 ncArg.c2+=ncArg.c-JAMO_T_BASE;\r
1594                                 if(!nx_contains(nx, ncArg.c2)) {\r
1595                                     remove=args.start-1;\r
1596                                     args.source[starter]=ncArg.c2;\r
1597                                 }\r
1598                             }\r
1599                         }\r
1600         \r
1601                         if(remove!=-1) {\r
1602                             /* remove the Jamo(s) */\r
1603                             q=remove;\r
1604                             r=args.start;\r
1605                             while(r<args.limit) {\r
1606                                 args.source[q++]=args.source[r++];\r
1607                             }\r
1608                             args.start=remove;\r
1609                             args.limit=q;\r
1610                         }\r
1611         \r
1612                         ncArg.c2=0; /* c2 held *starter temporarily */\r
1613 \r
1614                         if(combineFlags!=0) {\r
1615                             /*\r
1616                              * not starter=NULL because the composition is a Hangul LV syllable\r
1617                              * and might combine once more (but only before the PRI #29 fix)\r
1618                              */\r
1619 \r
1620                             /* done? */\r
1621                             if(args.start==args.limit) {\r
1622                                 return (char)prevCC;\r
1623                             }\r
1624 \r
1625                             /* the composition is a Hangul LV syllable which is a starter that combines forward */\r
1626                             combineFwdIndex=0xfff0;\r
1627 \r
1628                             /* we combined; continue with looking for compositions */\r
1629                             continue;\r
1630                         }\r
1631                     }\r
1632 \r
1633                     /*\r
1634                      * now: cc==0 and the combining index does not include \r
1635                      * "forward" -> the rest of the loop body will reset starter\r
1636                      * to NULL; technically, a composed Hangul syllable is a \r
1637                      * starter, but it does not combine forward now that we have\r
1638                      * consumed all eligible Jamos; for Jamo V/T, combineFlags \r
1639                      * does not contain _NORM_COMBINES_FWD\r
1640                      */\r
1641     \r
1642                 } else if(\r
1643                     /* the starter is not a Hangul LV or Jamo V/T and */\r
1644                     !((combineFwdIndex&0x8000)!=0) &&\r
1645                     /* the combining mark is not blocked and */\r
1646                     ((options&BEFORE_PRI_29)!=0 ?\r
1647                         (prevCC!=ncArg.cc || prevCC==0) :\r
1648                         (prevCC<ncArg.cc || prevCC==0)) &&\r
1649                     /* the starter and the combining mark (c, c2) do combine */\r
1650                     0!=(result=combine(combiningTable,combineFwdIndex, \r
1651                                        combineBackIndex, outValues)) &&\r
1652                     /* the composition result is not excluded */\r
1653                     !nx_contains(nx, (char)value, (char)value2)\r
1654                 ) {\r
1655                     value=outValues[0];\r
1656                     value2=outValues[1];\r
1657                     /* replace the starter with the composition, remove the \r
1658                      * combining mark \r
1659                      */\r
1660                     remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */\r
1661     \r
1662                     /* replace the starter with the composition */\r
1663                     args.source[starter]=(char)value;\r
1664                     if(starterIsSupplementary) {\r
1665                         if(value2!=0) {\r
1666                             /* both are supplementary */\r
1667                             args.source[starter+1]=(char)value2;\r
1668                         } else {\r
1669                             /* the composition is shorter than the starter, \r
1670                              * move the intermediate characters forward one */\r
1671                             starterIsSupplementary=false;\r
1672                             q=starter+1;\r
1673                             r=q+1;\r
1674                             while(r<remove) {\r
1675                                 args.source[q++]=args.source[r++];\r
1676                             }\r
1677                             --remove;\r
1678                         }\r
1679                     } else if(value2!=0) {\r
1680                         /* the composition is longer than the starter, \r
1681                          * move the intermediate characters back one */\r
1682                         starterIsSupplementary=true;\r
1683                         /* temporarily increment for the loop boundary */\r
1684                         ++starter; \r
1685                         q=remove;\r
1686                         r=++remove;\r
1687                         while(starter<q) {\r
1688                             args.source[--r]=args.source[--q];\r
1689                         }\r
1690                         args.source[starter]=(char)value2;\r
1691                         --starter; /* undo the temporary increment */\r
1692                     /* } else { both are on the BMP, nothing more to do */\r
1693                     }\r
1694     \r
1695                     /* remove the combining mark by moving the following text \r
1696                      * over it */\r
1697                     if(remove<args.start) {\r
1698                         q=remove;\r
1699                         r=args.start;\r
1700                         while(r<args.limit) {\r
1701                             args.source[q++]=args.source[r++];\r
1702                         }\r
1703                         args.start=remove;\r
1704                         args.limit=q;\r
1705                     }\r
1706     \r
1707                     /* keep prevCC because we removed the combining mark */\r
1708     \r
1709                     /* done? */\r
1710                     if(args.start==args.limit) {\r
1711                         return (char)prevCC;\r
1712                     }\r
1713     \r
1714                     /* is the composition a starter that combines forward? */\r
1715                     if(result>1) {\r
1716                        combineFwdIndex=getCombiningIndexFromStarter((char)value,\r
1717                                                                   (char)value2);\r
1718                     } else {\r
1719                        starter=-1;\r
1720                     }\r
1721     \r
1722                     /* we combined; continue with looking for compositions */\r
1723                     continue;\r
1724                 }\r
1725             }\r
1726     \r
1727             /* no combination this time */\r
1728             prevCC=ncArg.cc;\r
1729             if(args.start==args.limit) {\r
1730                 return (char)prevCC;\r
1731             }\r
1732     \r
1733             /* if (c, c2) did not combine, then check if it is a starter */\r
1734             if(ncArg.cc==0) {\r
1735                 /* found a new starter; combineFlags==0 if (c, c2) is excluded */\r
1736                 if((combineFlags&COMBINES_FWD)!=0) {\r
1737                     /* it may combine with something, prepare for it */\r
1738                     if(ncArg.c2==0) {\r
1739                         starterIsSupplementary=false;\r
1740                         starter=args.start-1;\r
1741                     } else {\r
1742                         starterIsSupplementary=false;\r
1743                         starter=args.start-2;\r
1744                     }\r
1745                     combineFwdIndex=combineBackIndex;\r
1746                 } else {\r
1747                     /* it will not combine with anything */\r
1748                     starter=-1;\r
1749                 }\r
1750             } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {\r
1751                 /* FCC: no discontiguous compositions; any intervening character blocks */\r
1752                 starter=-1;\r
1753             }\r
1754         }\r
1755     }\r
1756    \r
1757     // find the last true starter between src[start]....src[current] going \r
1758     // backwards and return its index\r
1759     private static int findPreviousStarter(char[]src, int srcStart, int current, \r
1760                                           int/*unsigned*/ ccOrQCMask, \r
1761                                           int/*unsigned*/ decompQCMask,\r
1762                                           char minNoMaybe) { \r
1763        long norm32; \r
1764        PrevArgs args = new PrevArgs();\r
1765        args.src = src;\r
1766        args.start = srcStart;\r
1767        args.current = current;\r
1768        \r
1769        while(args.start<args.current) { \r
1770            norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask); \r
1771            if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { \r
1772                break; \r
1773            } \r
1774        } \r
1775        return args.current; \r
1776     }\r
1777     \r
1778     /* find the first true starter in [src..limit[ and return the \r
1779      * pointer to it \r
1780      */\r
1781     private static int/*index*/    findNextStarter(char[] src,int start,int limit,\r
1782                                                  int/*unsigned*/ qcMask, \r
1783                                                  int/*unsigned*/ decompQCMask, \r
1784                                                  char minNoMaybe) {\r
1785         int p;\r
1786         long/*unsigned*/ norm32; \r
1787         int ccOrQCMask;\r
1788         char c, c2;\r
1789     \r
1790         ccOrQCMask=CC_MASK|qcMask;\r
1791         \r
1792         DecomposeArgs decompArgs = new DecomposeArgs();\r
1793 \r
1794         for(;;) {\r
1795             if(start==limit) {\r
1796                 break; /* end of string */\r
1797             }\r
1798             c=src[start];\r
1799             if(c<minNoMaybe) {\r
1800                 break; /* catches NUL terminater, too */\r
1801             }\r
1802     \r
1803             norm32=getNorm32(c);\r
1804             if((norm32&ccOrQCMask)==0) {\r
1805                 break; /* true starter */\r
1806             }\r
1807     \r
1808             if(isNorm32LeadSurrogate(norm32)) {\r
1809                 /* c is a lead surrogate, get the real norm32 */\r
1810                 if((start+1)==limit || \r
1811                                    !UTF16.isTrailSurrogate(c2=(src[start+1]))){\r
1812                     /* unmatched first surrogate: counts as a true starter */                  \r
1813                     break; \r
1814                 }\r
1815                 norm32=getNorm32FromSurrogatePair(norm32, c2);\r
1816     \r
1817                 if((norm32&ccOrQCMask)==0) {\r
1818                     break; /* true starter */\r
1819                 }\r
1820             } else {\r
1821                 c2=0;\r
1822             }\r
1823     \r
1824             /* (c, c2) is not a true starter but its decomposition may be */\r
1825             if((norm32&decompQCMask)!=0) {\r
1826                 /* (c, c2) decomposes, get everything from the variable-length\r
1827                  *  extra data */\r
1828                 p=decompose(norm32, decompQCMask, decompArgs);\r
1829     \r
1830                 /* get the first character's norm32 to check if it is a true \r
1831                  * starter */\r
1832                 if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {\r
1833                     break; /* true starter */\r
1834                 }\r
1835             }\r
1836     \r
1837             start+= c2==0 ? 1 : 2; /* not a true starter, continue */\r
1838         }\r
1839     \r
1840         return start;\r
1841     }\r
1842     \r
1843     \r
1844     private static final class ComposePartArgs{\r
1845         int prevCC;\r
1846         int length;   /* length of decomposed part */\r
1847     }\r
1848         \r
1849      /* decompose and recompose [prevStarter..src[ */\r
1850     private static char[] composePart(ComposePartArgs args, \r
1851                                       int prevStarter, \r
1852                                          char[] src, int start, int limit,\r
1853                                        int options,\r
1854                                        UnicodeSet nx) {\r
1855         int recomposeLimit;\r
1856         boolean compat =((options&OPTIONS_COMPAT)!=0);\r
1857         \r
1858         /* decompose [prevStarter..src[ */\r
1859         int[] outTrailCC = new int[1];\r
1860         char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];\r
1861 \r
1862         for(;;){\r
1863             args.length=decompose(src,prevStarter,(start),\r
1864                                       buffer,0,buffer.length, \r
1865                                       compat,outTrailCC,nx);\r
1866             if(args.length<=buffer.length){\r
1867                 break;\r
1868             }else{\r
1869                 buffer = new char[args.length];\r
1870             }\r
1871         } \r
1872     \r
1873         /* recompose the decomposition */\r
1874         recomposeLimit=args.length;\r
1875           \r
1876         if(args.length>=2) {\r
1877             RecomposeArgs rcArgs = new RecomposeArgs();\r
1878             rcArgs.source    = buffer;\r
1879             rcArgs.start    = 0;\r
1880             rcArgs.limit    = recomposeLimit; \r
1881             args.prevCC=recompose(rcArgs, options, nx);\r
1882             recomposeLimit = rcArgs.limit;\r
1883         }\r
1884         \r
1885         /* return with a pointer to the recomposition and its length */\r
1886         args.length=recomposeLimit;\r
1887         return buffer;\r
1888     }\r
1889     \r
1890     private static boolean composeHangul(char prev, char c,\r
1891                                          long/*unsigned*/ norm32, \r
1892                                          char[] src,int[] srcIndex, int limit,\r
1893                                             boolean compat, \r
1894                                          char[] dest,int destIndex,\r
1895                                          UnicodeSet nx) {\r
1896         int start=srcIndex[0];\r
1897         if(isJamoVTNorm32JamoV(norm32)) {\r
1898             /* c is a Jamo V, compose with previous Jamo L and \r
1899              * following Jamo T */\r
1900             prev=(char)(prev-JAMO_L_BASE);\r
1901             if(prev<JAMO_L_COUNT) {\r
1902                 c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+\r
1903                                                  (c-JAMO_V_BASE))*JAMO_T_COUNT);\r
1904     \r
1905                 /* check if the next character is a Jamo T (normal or \r
1906                  * compatibility) */\r
1907                 if(start!=limit) {\r
1908                     char next, t;\r
1909     \r
1910                     next=src[start];\r
1911                     if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {\r
1912                         /* normal Jamo T */\r
1913                         ++start;\r
1914                         c+=t;\r
1915                     } else if(compat) {\r
1916                         /* if NFKC, then check for compatibility Jamo T \r
1917                          * (BMP only) */\r
1918                         norm32=getNorm32(next);\r
1919                         if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {\r
1920                             int p /*index into extra data array*/;\r
1921                             DecomposeArgs dcArgs = new DecomposeArgs();\r
1922                             p=decompose(norm32, QC_NFKD, dcArgs);\r
1923                             if(dcArgs.length==1 && \r
1924                                    (t=(char)(extraData[p]-JAMO_T_BASE))\r
1925                                                    <JAMO_T_COUNT) {\r
1926                                 /* compatibility Jamo T */\r
1927                                 ++start;\r
1928                                 c+=t;\r
1929                             }\r
1930                         }\r
1931                     }\r
1932                 }\r
1933                 if(nx_contains(nx, c)) {\r
1934                     if(!isHangulWithoutJamoT(c)) {\r
1935                         --start; /* undo ++start from reading the Jamo T */\r
1936                     }\r
1937                     return false;\r
1938                 }\r
1939                 dest[destIndex]=c;\r
1940                 srcIndex[0]=start;\r
1941                 return true;\r
1942             }\r
1943         } else if(isHangulWithoutJamoT(prev)) {\r
1944             /* c is a Jamo T, compose with previous Hangul LV that does not \r
1945              * contain a Jamo T */\r
1946             c=(char)(prev+(c-JAMO_T_BASE));\r
1947             if(nx_contains(nx, c)) {\r
1948                 return false;\r
1949             }\r
1950             dest[destIndex]=c;\r
1951             srcIndex[0]=start;\r
1952             return true;\r
1953         }\r
1954         return false;\r
1955     }\r
1956     /*\r
1957     public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){\r
1958         return compose(src,0,src.length,dest,0,dest.length,compat, nx);\r
1959     }\r
1960     */\r
1961     \r
1962     public static int compose(char[] src, int srcStart, int srcLimit,\r
1963                               char[] dest,int destStart,int destLimit,\r
1964                               int options,UnicodeSet nx) {\r
1965         \r
1966         int prevSrc, prevStarter;\r
1967         long/*unsigned*/ norm32; \r
1968         int ccOrQCMask, qcMask;\r
1969         int  reorderStartIndex, length;\r
1970         char c, c2, minNoMaybe;\r
1971         int/*unsigned byte*/ cc, prevCC;\r
1972         int[] ioIndex = new int[1];\r
1973         int destIndex = destStart;\r
1974         int srcIndex = srcStart;\r
1975         \r
1976         if((options&OPTIONS_COMPAT)!=0) {\r
1977             minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];\r
1978             qcMask=QC_NFKC;\r
1979         } else {\r
1980             minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];\r
1981             qcMask=QC_NFC;\r
1982         }\r
1983     \r
1984         /*\r
1985          * prevStarter points to the last character before the current one\r
1986          * that is a "true" starter with cc==0 and quick check "yes".\r
1987          *\r
1988          * prevStarter will be used instead of looking for a true starter\r
1989          * while incrementally decomposing [prevStarter..prevSrc[\r
1990          * in _composePart(). Having a good prevStarter allows to just decompose\r
1991          * the entire [prevStarter..prevSrc[.\r
1992          *\r
1993          * When _composePart() backs out from prevSrc back to prevStarter,\r
1994          * then it also backs out destIndex by the same amount.\r
1995          * Therefore, at all times, the (prevSrc-prevStarter) source units\r
1996          * must correspond 1:1 to destination units counted with destIndex,\r
1997          * except for reordering.\r
1998          * This is true for the qc "yes" characters copied in the fast loop,\r
1999          * and for pure reordering.\r
2000          * prevStarter must be set forward to src when this is not true:\r
2001          * In _composePart() and after composing a Hangul syllable.\r
2002          *\r
2003          * This mechanism relies on the assumption that the decomposition of a \r
2004          * true starter also begins with a true starter. gennorm/store.c checks \r
2005          * for this.\r
2006          */\r
2007         prevStarter=srcIndex;\r
2008     \r
2009         ccOrQCMask=CC_MASK|qcMask;\r
2010         /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/\r
2011         prevCC=0;\r
2012     \r
2013         /* avoid compiler warnings */\r
2014         norm32=0;\r
2015         c=0;\r
2016     \r
2017         for(;;) {\r
2018             /* count code units below the minimum or with irrelevant data for \r
2019              * the quick check */\r
2020             prevSrc=srcIndex;\r
2021 \r
2022             while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe || \r
2023                      ((norm32=getNorm32(c))&ccOrQCMask)==0)) {\r
2024                 prevCC=0;\r
2025                 ++srcIndex;\r
2026             }\r
2027 \r
2028     \r
2029             /* copy these code units all at once */\r
2030             if(srcIndex!=prevSrc) {\r
2031                 length=(int)(srcIndex-prevSrc);\r
2032                 if((destIndex+length)<=destLimit) {\r
2033                     System.arraycopy(src,prevSrc,dest,destIndex,length);\r
2034                 }\r
2035                 destIndex+=length;\r
2036                 reorderStartIndex=destIndex;\r
2037     \r
2038                 /* set prevStarter to the last character in the quick check \r
2039                  * loop */\r
2040                 prevStarter=srcIndex-1;\r
2041                 if(UTF16.isTrailSurrogate(src[prevStarter]) && \r
2042                     prevSrc<prevStarter && \r
2043                     UTF16.isLeadSurrogate(src[(prevStarter-1)])) {\r
2044                     --prevStarter;\r
2045                 }\r
2046     \r
2047                 prevSrc=srcIndex;\r
2048             }\r
2049     \r
2050             /* end of source reached? */\r
2051             if(srcIndex==srcLimit) {\r
2052                 break;\r
2053             }\r
2054     \r
2055             /* c already contains *src and norm32 is set for it, increment src*/\r
2056             ++srcIndex;\r
2057     \r
2058             /*\r
2059              * source buffer pointers:\r
2060              *\r
2061              *  all done      quick check   current char  not yet\r
2062              *                "yes" but     (c, c2)       processed\r
2063              *                may combine\r
2064              *                forward\r
2065              * [-------------[-------------[-------------[-------------[\r
2066              * |             |             |             |             |\r
2067              * start         prevStarter   prevSrc       src           limit\r
2068              *\r
2069              *\r
2070              * destination buffer pointers and indexes:\r
2071              *\r
2072              *  all done      might take    not filled yet\r
2073              *                characters for\r
2074              *                reordering\r
2075              * [-------------[-------------[-------------[\r
2076              * |             |             |             |\r
2077              * dest      reorderStartIndex destIndex     destCapacity\r
2078              */\r
2079     \r
2080             /* check one above-minimum, relevant code unit */\r
2081             /*\r
2082              * norm32 is for c=*(src-1), and the quick check flag is "no" or \r
2083              * "maybe", and/or cc!=0\r
2084              * check for Jamo V/T, then for surrogates and regular characters\r
2085              * c is not a Hangul syllable or Jamo L because\r
2086              * they are not marked with no/maybe for NFC & NFKC(and their cc==0)\r
2087              */\r
2088             if(isNorm32HangulOrJamo(norm32)) {\r
2089                 /*\r
2090                  * c is a Jamo V/T:\r
2091                  * try to compose with the previous character, Jamo V also with \r
2092                  * a following Jamo T, and set values here right now in case we \r
2093                  * just continue with the main loop\r
2094                  */\r
2095                 prevCC=cc=0;\r
2096                 reorderStartIndex=destIndex;\r
2097                 ioIndex[0]=srcIndex;\r
2098                 if( \r
2099                     destIndex>0 &&\r
2100                     composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,\r
2101                                   srcLimit, (options&OPTIONS_COMPAT)!=0, dest,\r
2102                                   destIndex<=destLimit ? destIndex-1: 0,\r
2103                                   nx)\r
2104                 ) {\r
2105                     srcIndex=ioIndex[0];\r
2106                     prevStarter=srcIndex;\r
2107                     continue;\r
2108                 }\r
2109                 \r
2110                 srcIndex = ioIndex[0];\r
2111     \r
2112                 /* the Jamo V/T did not compose into a Hangul syllable, just \r
2113                  * append to dest */\r
2114                 c2=0;\r
2115                 length=1;\r
2116                 prevStarter=prevSrc;\r
2117             } else {\r
2118                 if(isNorm32Regular(norm32)) {\r
2119                     c2=0;\r
2120                     length=1;\r
2121                 } else {\r
2122                     /* c is a lead surrogate, get the real norm32 */\r
2123                     if(srcIndex!=srcLimit &&\r
2124                                      UTF16.isTrailSurrogate(c2=src[srcIndex])) {\r
2125                         ++srcIndex;\r
2126                         length=2;\r
2127                         norm32=getNorm32FromSurrogatePair(norm32, c2);\r
2128                     } else {\r
2129                         /* c is an unpaired lead surrogate, nothing to do */\r
2130                         c2=0;\r
2131                         length=1;\r
2132                         norm32=0;\r
2133                     }\r
2134                 }\r
2135                 ComposePartArgs args =new ComposePartArgs();\r
2136                 \r
2137                 /* we are looking at the character (c, c2) at [prevSrc..src[ */\r
2138                 if(nx_contains(nx, c, c2)) {\r
2139                     /* excluded: norm32==0 */\r
2140                     cc=0;\r
2141                 } else if((norm32&qcMask)==0) {\r
2142                     cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));\r
2143                 } else {\r
2144                     char[] p;\r
2145     \r
2146                     /*\r
2147                      * find appropriate boundaries around this character,\r
2148                      * decompose the source text from between the boundaries,\r
2149                      * and recompose it\r
2150                      *\r
2151                      * this puts the intermediate text into the side buffer because\r
2152                      * it might be longer than the recomposition end result,\r
2153                      * or the destination buffer may be too short or missing\r
2154                      *\r
2155                      * note that destIndex may be adjusted backwards to account\r
2156                      * for source text that passed the quick check but needed to\r
2157                      * take part in the recomposition\r
2158                      */\r
2159                     int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */\r
2160                     /*\r
2161                      * find the last true starter in [prevStarter..src[\r
2162                      * it is either the decomposition of the current character (at prevSrc),\r
2163                      * or prevStarter\r
2164                      */\r
2165                     if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {\r
2166                         prevStarter=prevSrc;\r
2167                     } else {\r
2168                         /* adjust destIndex: back out what had been copied with qc "yes" */\r
2169                         destIndex-=prevSrc-prevStarter;\r
2170                     }\r
2171                 \r
2172                     /* find the next true starter in [src..limit[ */\r
2173                     srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask, \r
2174                                                decompQCMask, minNoMaybe);\r
2175                     //args.prevStarter = prevStarter;\r
2176                     args.prevCC    = prevCC;                    \r
2177                     //args.destIndex = destIndex;\r
2178                     args.length = length;\r
2179                     p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);\r
2180                         \r
2181                     if(p==null) {\r
2182                         /* an error occurred (out of memory) */\r
2183                         break;\r
2184                     }\r
2185                     \r
2186                     prevCC      = args.prevCC;\r
2187                     length      = args.length;\r
2188                     \r
2189                     /* append the recomposed buffer contents to the destination \r
2190                      * buffer */\r
2191                     if((destIndex+args.length)<=destLimit) {\r
2192                         int i=0;\r
2193                         while(i<args.length) {\r
2194                             dest[destIndex++]=p[i++];\r
2195                             --length;\r
2196                         }\r
2197                     } else {\r
2198                         /* buffer overflow */\r
2199                         /* keep incrementing the destIndex for preflighting */\r
2200                         destIndex+=length;\r
2201                     }\r
2202     \r
2203                     prevStarter=srcIndex;\r
2204                     continue;\r
2205                 }\r
2206             }\r
2207     \r
2208             /* append the single code point (c, c2) to the destination buffer */\r
2209             if((destIndex+length)<=destLimit) {\r
2210                 if(cc!=0 && cc<prevCC) {\r
2211                     /* (c, c2) is out of order with respect to the preceding \r
2212                      * text */\r
2213                     int reorderSplit= destIndex;\r
2214                     destIndex+=length;\r
2215                     prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit, \r
2216                                          destIndex, c, c2, cc);\r
2217                 } else {\r
2218                     /* just append (c, c2) */\r
2219                     dest[destIndex++]=c;\r
2220                     if(c2!=0) {\r
2221                         dest[destIndex++]=c2;\r
2222                     }\r
2223                     prevCC=cc;\r
2224                 }\r
2225             } else {\r
2226                 /* buffer overflow */\r
2227                 /* keep incrementing the destIndex for preflighting */\r
2228                 destIndex+=length;\r
2229                 prevCC=cc;\r
2230             }\r
2231         }\r
2232 \r
2233         return destIndex - destStart;\r
2234     }\r
2235     /* make FCD --------------------------------------------------------------*/\r
2236     \r
2237     private static int/*index*/ findSafeFCD(char[] src, int start, int limit, \r
2238                                             char fcd16) {\r
2239         char c, c2;\r
2240     \r
2241         /*\r
2242          * find the first position in [src..limit[ after some cc==0 according \r
2243          * to FCD data\r
2244          *\r
2245          * at the beginning of the loop, we have fcd16 from before src\r
2246          *\r
2247          * stop at positions:\r
2248          * - after trail cc==0\r
2249          * - at the end of the source\r
2250          * - before lead cc==0\r
2251          */\r
2252         for(;;) {\r
2253             /* stop if trail cc==0 for the previous character */\r
2254             if((fcd16&0xff)==0) {\r
2255                 break;\r
2256             }\r
2257     \r
2258             /* get c=*src - stop at end of string */\r
2259             if(start==limit) {\r
2260                 break;\r
2261             }\r
2262             c=src[start];\r
2263     \r
2264             /* stop if lead cc==0 for this character */\r
2265             if(c<MIN_WITH_LEAD_CC || (fcd16=getFCD16(c))==0) {\r
2266                 break; /* catches terminating NUL, too */\r
2267             }\r
2268     \r
2269             if(!UTF16.isLeadSurrogate(c)) {\r
2270                 if(fcd16<=0xff) {\r
2271                     break;\r
2272                 }\r
2273                 ++start;\r
2274             } else if(start+1!=limit && \r
2275                                     (UTF16.isTrailSurrogate(c2=src[start+1]))) {\r
2276                 /* c is a lead surrogate, get the real fcd16 */\r
2277                 fcd16=getFCD16FromSurrogatePair(fcd16, c2);\r
2278                 if(fcd16<=0xff) {\r
2279                     break;\r
2280                 }\r
2281                 start+=2;\r
2282             } else {\r
2283                 /* c is an unpaired first surrogate, lead cc==0 */\r
2284                 break;\r
2285             }\r
2286         }\r
2287     \r
2288         return start;\r
2289     }\r
2290     \r
2291     private static int/*unsigned byte*/ decomposeFCD(char[] src, \r
2292                                                      int start,int decompLimit,\r
2293                                                      char[] dest, \r
2294                                                      int[] destIndexArr,\r
2295                                                      UnicodeSet nx) {\r
2296         char[] p=null;\r
2297         int pStart=-1;\r
2298         \r
2299         long /*unsigned int*/ norm32;\r
2300         int reorderStartIndex;\r
2301         char c, c2;\r
2302         int/*unsigned byte*/ prevCC;\r
2303         DecomposeArgs args = new DecomposeArgs();\r
2304         int destIndex = destIndexArr[0];\r
2305         /*\r
2306          * canonically decompose [src..decompLimit[\r
2307          *\r
2308          * all characters in this range have some non-zero cc,\r
2309          * directly or in decomposition,\r
2310          * so that we do not need to check in the following for quick-check \r
2311          * limits etc.\r
2312          *\r
2313          * there _are_ _no_ Hangul syllables or Jamos in here because they are \r
2314          * FCD-safe (cc==0)!\r
2315          *\r
2316          * we also do not need to check for c==0 because we have an established \r
2317          * decompLimit\r
2318          */\r
2319         reorderStartIndex=destIndex;\r
2320         prevCC=0;\r
2321 \r
2322         while(start<decompLimit) {\r
2323             c=src[start++];\r
2324             norm32=getNorm32(c);\r
2325             if(isNorm32Regular(norm32)) {\r
2326                 c2=0;\r
2327                 args.length=1;\r
2328             } else {\r
2329                 /*\r
2330                  * reminder: this function is called with [src..decompLimit[\r
2331                  * not containing any Hangul/Jamo characters,\r
2332                  * therefore the only specials are lead surrogates\r
2333                  */\r
2334                 /* c is a lead surrogate, get the real norm32 */\r
2335                 if(start!=decompLimit && UTF16.isTrailSurrogate(c2=src[start])){\r
2336                     ++start;\r
2337                     args.length=2;\r
2338                     norm32=getNorm32FromSurrogatePair(norm32, c2);\r
2339                 } else {\r
2340                     c2=0;\r
2341                     args.length=1;\r
2342                     norm32=0;\r
2343                 }\r
2344             }\r
2345     \r
2346             /* get the decomposition and the lead and trail cc's */\r
2347             if(nx_contains(nx, c, c2)) {\r
2348                 /* excluded: norm32==0 */\r
2349                 args.cc=args.trailCC=0;\r
2350                 p=null;\r
2351             } else if((norm32&QC_NFD)==0) {\r
2352                 /* c does not decompose */\r
2353                 args.cc=args.trailCC=(int)((UNSIGNED_BYTE_MASK)&\r
2354                                                             (norm32>>CC_SHIFT));\r
2355                 p=null;\r
2356             } else {\r
2357                 /* c decomposes, get everything from the variable-length extra \r
2358                  * data */\r
2359                 pStart=decompose(norm32, args);\r
2360                 p=extraData;\r
2361                 if(args.length==1) {\r
2362                     /* fastpath a single code unit from decomposition */\r
2363                     c=p[pStart];\r
2364                     c2=0;\r
2365                     p=null;\r
2366                 }\r
2367             }\r
2368     \r
2369             /* append the decomposition to the destination buffer, assume \r
2370              * length>0 */\r
2371             if((destIndex+args.length)<=dest.length) {\r
2372                 int reorderSplit=destIndex;\r
2373                 if(p==null) {\r
2374                     /* fastpath: single code point */\r
2375                     if(args.cc!=0 && args.cc<prevCC) {\r
2376                         /* (c, c2) is out of order with respect to the preceding\r
2377                          *  text */\r
2378                         destIndex+=args.length;\r
2379                         args.trailCC=insertOrdered(dest,reorderStartIndex, \r
2380                                                    reorderSplit, destIndex, \r
2381                                                    c, c2, args.cc);\r
2382                     } else {\r
2383                         /* just append (c, c2) */\r
2384                         dest[destIndex++]=c;\r
2385                         if(c2!=0) {\r
2386                             dest[destIndex++]=c2;\r
2387                         }\r
2388                     }\r
2389                 } else {\r
2390                     /* general: multiple code points (ordered by themselves) \r
2391                      * from decomposition */\r
2392                     if(args.cc!=0 && args.cc<prevCC) {\r
2393                         /* the decomposition is out of order with respect to \r
2394                          * the preceding text */\r
2395                         destIndex+=args.length;\r
2396                         args.trailCC=mergeOrdered(dest,reorderStartIndex, \r
2397                                                   reorderSplit, p, pStart,\r
2398                                                   pStart+args.length);\r
2399                     } else {\r
2400                         /* just append the decomposition */\r
2401                         do {\r
2402                             dest[destIndex++]=p[pStart++];\r
2403                         } while(--args.length>0);\r
2404                     }\r
2405                 }\r
2406             } else {\r
2407                 /* buffer overflow */\r
2408                 /* keep incrementing the destIndex for preflighting */\r
2409                 destIndex+=args.length;\r
2410             }\r
2411     \r
2412             prevCC=args.trailCC;\r
2413             if(prevCC==0) {\r
2414                 reorderStartIndex=destIndex;\r
2415             }\r
2416         }\r
2417         destIndexArr[0]=destIndex;\r
2418         return prevCC;\r
2419     }\r
2420     \r
2421     public static int makeFCD(char[] src,  int srcStart,  int srcLimit,\r
2422                               char[] dest, int destStart, int destLimit,\r
2423                               UnicodeSet nx) {\r
2424                            \r
2425         int prevSrc, decompStart;\r
2426         int destIndex, length;\r
2427         char c, c2;\r
2428         int /* unsigned int*/ fcd16;\r
2429         int prevCC, cc;\r
2430     \r
2431         /* initialize */\r
2432         decompStart=srcStart;\r
2433         destIndex=destStart;\r
2434         prevCC=0;\r
2435         c=0;\r
2436         fcd16=0;\r
2437         int[] destIndexArr = new int[1];\r
2438         destIndexArr[0]=destIndex;\r
2439         \r
2440         for(;;) {\r
2441             /* skip a run of code units below the minimum or with irrelevant \r
2442              * data for the FCD check */\r
2443             prevSrc=srcStart;\r
2444 \r
2445             for(;;) {\r
2446                 if(srcStart==srcLimit) {\r
2447                     break;\r
2448                 } else if((c=src[srcStart])<MIN_WITH_LEAD_CC) {\r
2449                     prevCC=(int)-c;\r
2450                 } else if((fcd16=getFCD16(c))==0) {\r
2451                     prevCC=0;\r
2452                 } else {\r
2453                     break;\r
2454                 }\r
2455                 ++srcStart;\r
2456             }\r
2457 \r
2458     \r
2459             /*\r
2460              * prevCC has values from the following ranges:\r
2461              * 0..0xff - the previous trail combining class\r
2462              * <0      - the negative value of the previous code unit;\r
2463              *           that code unit was <_NORM_MIN_WITH_LEAD_CC and its \r
2464              *           getFCD16()\r
2465              *           was deferred so that average text is checked faster\r
2466              */\r
2467     \r
2468             /* copy these code units all at once */\r
2469             if(srcStart!=prevSrc) {\r
2470                 length=(int)(srcStart-prevSrc);\r
2471                 if((destIndex+length)<=destLimit) {\r
2472                     System.arraycopy(src,prevSrc,dest,destIndex,length);\r
2473                 }\r
2474                 destIndex+=length;\r
2475                 prevSrc=srcStart;\r
2476     \r
2477                 /* prevCC<0 is only possible from the above loop, i.e., only if\r
2478                  *  prevSrc<src */\r
2479                 if(prevCC<0) {\r
2480                     /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we \r
2481                      * need to get its trail cc */\r
2482                     if(!nx_contains(nx, (int)-prevCC)) {\r
2483                         prevCC=(int)(getFCD16((int)-prevCC)&0xff);\r
2484                     } else {\r
2485                         prevCC=0; /* excluded: fcd16==0 */\r
2486                     }\r
2487                     /*\r
2488                      * set a pointer to this below-U+0300 character;\r
2489                      * if prevCC==0 then it will moved to after this character \r
2490                      * below\r
2491                      */\r
2492                     decompStart=prevSrc-1;\r
2493                 }\r
2494             }\r
2495             /*\r
2496              * now:\r
2497              * prevSrc==src - used later to adjust destIndex before \r
2498              *          decomposition\r
2499              * prevCC>=0\r
2500              */\r
2501     \r
2502             /* end of source reached? */\r
2503             if(srcStart==srcLimit) {\r
2504                 break;\r
2505             }\r
2506     \r
2507             /* set a pointer to after the last source position where prevCC==0*/\r
2508             if(prevCC==0) {\r
2509                 decompStart=prevSrc;\r
2510             }\r
2511     \r
2512             /* c already contains *src and fcd16 is set for it, increment src */\r
2513             ++srcStart;\r
2514     \r
2515             /* check one above-minimum, relevant code unit */\r
2516             if(UTF16.isLeadSurrogate(c)) {\r
2517                 /* c is a lead surrogate, get the real fcd16 */\r
2518                 if(srcStart!=srcLimit && \r
2519                                      UTF16.isTrailSurrogate(c2=src[srcStart])) {\r
2520                     ++srcStart;\r
2521                     fcd16=getFCD16FromSurrogatePair((char)fcd16, c2);\r
2522                 } else {\r
2523                     c2=0;\r
2524                     fcd16=0;\r
2525                 }\r
2526             } else {\r
2527                 c2=0;\r
2528             }\r
2529     \r
2530             /* we are looking at the character (c, c2) at [prevSrc..src[ */\r
2531             if(nx_contains(nx, c, c2)) {\r
2532                 fcd16=0; /* excluded: fcd16==0 */\r
2533             }\r
2534             /* check the combining order, get the lead cc */\r
2535             cc=(int)(fcd16>>8);\r
2536             if(cc==0 || cc>=prevCC) {\r
2537                 /* the order is ok */\r
2538                 if(cc==0) {\r
2539                     decompStart=prevSrc;\r
2540                 }\r
2541                 prevCC=(int)(fcd16&0xff);\r
2542     \r
2543                 /* just append (c, c2) */\r
2544                 length= c2==0 ? 1 : 2;\r
2545                 if((destIndex+length)<=destLimit) {\r
2546                     dest[destIndex++]=c;\r
2547                     if(c2!=0) {\r
2548                         dest[destIndex++]=c2;\r
2549                     }\r
2550                 } else {\r
2551                     destIndex+=length;\r
2552                 }\r
2553             } else {\r
2554                 /*\r
2555                  * back out the part of the source that we copied already but\r
2556                  * is now going to be decomposed;\r
2557                  * prevSrc is set to after what was copied\r
2558                  */\r
2559                 destIndex-=(int)(prevSrc-decompStart);\r
2560     \r
2561                 /*\r
2562                  * find the part of the source that needs to be decomposed;\r
2563                  * to be safe and simple, decompose to before the next character\r
2564                  * with lead cc==0\r
2565                  */\r
2566                 srcStart=findSafeFCD(src,srcStart, srcLimit, (char)fcd16);\r
2567     \r
2568                 /*\r
2569                  * the source text does not fulfill the conditions for FCD;\r
2570                  * decompose and reorder a limited piece of the text\r
2571                  */\r
2572                 destIndexArr[0] = destIndex;\r
2573                 prevCC=decomposeFCD(src,decompStart, srcStart,dest, \r
2574                                     destIndexArr,nx);\r
2575                 decompStart=srcStart;\r
2576                 destIndex=destIndexArr[0];\r
2577             }\r
2578         }\r
2579         \r
2580         return destIndex - destStart;\r
2581     \r
2582     }\r
2583 \r
2584     public static int getCombiningClass(int c) {\r
2585         long norm32;\r
2586         norm32=getNorm32(c);\r
2587         return (char)((norm32>>CC_SHIFT)&0xFF);\r
2588     }\r
2589     \r
2590     public static boolean isFullCompositionExclusion(int c) {\r
2591         if(isFormatVersion_2_1) {\r
2592             int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);\r
2593             return (boolean)((aux & AUX_COMP_EX_MASK)!=0);\r
2594         } else {\r
2595             return false;\r
2596         }\r
2597     }\r
2598     \r
2599     public static boolean isCanonSafeStart(int c) {\r
2600         if(isFormatVersion_2_1) {\r
2601             int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);\r
2602             return (boolean)((aux & AUX_UNSAFE_MASK)==0);\r
2603         } else {\r
2604             return false;\r
2605         }\r
2606     }\r
2607     \r
2608     public static boolean getCanonStartSet(int c, USerializedSet fillSet) {\r
2609 \r
2610         if(fillSet!=null && canonStartSets!=null) {\r
2611              /*\r
2612              * binary search for c\r
2613              *\r
2614              * There are two search tables,\r
2615              * one for BMP code points and one for supplementary ones.\r
2616              * See unormimp.h for details.\r
2617              */\r
2618             char[] table;\r
2619             int i=0, start, limit;\r
2620             \r
2621             int[] idxs = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];\r
2622             char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];\r
2623             \r
2624             if(c<=0xffff) {\r
2625                 table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];\r
2626                 start=0;\r
2627                 limit=table.length;\r
2628     \r
2629                 /* each entry is a pair { c, result } */\r
2630                 while(start<limit-2) {\r
2631                     i=(char)(((start+limit)/4)*2); \r
2632                     if(c<table[i]) {\r
2633                         limit=i;\r
2634                     } else {\r
2635                         start=i;\r
2636                     }\r
2637                 }\r
2638                 //System.out.println(i);\r
2639                 /* found? */\r
2640                 if(c==table[start]) {\r
2641                     i=table[start+1];\r
2642                     if((i & CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {\r
2643                         /* result 01xxxxxx xxxxxx contains index x to a \r
2644                          * USerializedSet */\r
2645                         i&=(CANON_SET_MAX_CANON_SETS-1);\r
2646                         return fillSet.getSet(startSets,(i-idxs.length));\r
2647                     } else {\r
2648                         /* other result values are BMP code points for \r
2649                          * single-code point sets */\r
2650                         fillSet.setToOne(i);\r
2651                         return true;\r
2652                     }\r
2653                 }\r
2654             } else {\r
2655                 char high, low, h,j=0;\r
2656     \r
2657                 table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];\r
2658                 start=0;\r
2659                 limit=table.length;\r
2660     \r
2661                 high=(char)(c>>16);\r
2662                 low=(char)c;\r
2663     \r
2664                 /* each entry is a triplet { high(c), low(c), result } */\r
2665                 while(start<limit-3) {\r
2666                     /* (start+limit)/2 and address triplets */\r
2667                     i=(char)(((start+limit)/6)*3);\r
2668                     j=(char)(table[i]&0x1f); /* high word */\r
2669                     int tableVal = table[i+1];\r
2670                     int lowInt = low;\r
2671                     if(high<j || ((tableVal>lowInt) && (high==j))) {\r
2672                         limit=i;\r
2673                     } else {\r
2674                         start=i;\r
2675                     }\r
2676                     \r
2677                     //System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );\r
2678                     \r
2679                     // KLUDGE: IBM JIT in 1.4.0 is sooo broken\r
2680                     // The below lines make TestExhaustive pass\r
2681                     if(ICUDebug.enabled()){\r
2682                         System.err.println("\t\t j = " + Utility.hex(j,4) +\r
2683                                            "\t i = " + Utility.hex(i,4) +\r
2684                                            "\t high = "+ Utility.hex(high)  +\r
2685                                            "\t low = "  + Utility.hex(lowInt,4)   +\r
2686                                            "\t table[i+1]: "+ Utility.hex(tableVal,4) \r
2687                                            );\r
2688                     }\r
2689                    \r
2690                 }\r
2691 \r
2692                 /* found? */\r
2693                 h=table[start];\r
2694 \r
2695                 //System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));\r
2696                 int tableVal1 = table[start+1];\r
2697                 int lowInt = low;\r
2698 \r
2699                 if(high==(h&0x1f) && lowInt==tableVal1) {\r
2700                     int tableVal2 = table[start+2];\r
2701                     i=tableVal2;\r
2702                     if((h&0x8000)==0) {\r
2703                         /* the result is an index to a USerializedSet */\r
2704                         return fillSet.getSet(startSets,(i-idxs.length));\r
2705                     } else {\r
2706                         /*\r
2707                          * single-code point set {x} in\r
2708                          * triplet { 100xxxxx 000hhhhh  llllllll llllllll  xxxxxxxx xxxxxxxx }\r
2709                          */\r
2710                         //i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */\r
2711                         int temp = ((int)h & 0x1f00)<<8;\r
2712                         i|=temp; /* add high bits from high(c) */\r
2713                         fillSet.setToOne((int)i);\r
2714                         return true;\r
2715                     }\r
2716                 }\r
2717             }\r
2718         }\r
2719     \r
2720         return false; /* not found */\r
2721     }\r
2722     \r
2723     public static int getFC_NFKC_Closure(int c, char[] dest) {\r
2724         \r
2725         int destCapacity;\r
2726          \r
2727         if(dest==null ) {\r
2728             destCapacity=0;\r
2729         }else{\r
2730             destCapacity = dest.length;\r
2731         }\r
2732         \r
2733         int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);\r
2734 \r
2735         aux&= AUX_FNC_MASK;\r
2736         if(aux!=0) {\r
2737             int s;\r
2738             int index=aux; \r
2739             int length;\r
2740             \r
2741             s =extraData[index];\r
2742             if(s<0xff00) {\r
2743                 /* s points to the single-unit string */\r
2744                 length=1;\r
2745             } else {\r
2746                 length=s&0xff;\r
2747                 ++index;\r
2748             }\r
2749             if(0<length && length<=destCapacity) {\r
2750                 System.arraycopy(extraData,index,dest,0,length);\r
2751             }\r
2752             return length;\r
2753         } else {\r
2754             return 0;\r
2755         }\r
2756     }\r
2757 \r
2758 \r
2759     /* Is c an NF<mode>-skippable code point? See unormimp.h. */\r
2760     public static boolean isNFSkippable(int c, Normalizer.Mode mode, long mask) {\r
2761         long /*unsigned int*/ norm32;\r
2762         mask = mask & UNSIGNED_INT_MASK;\r
2763         char aux;\r
2764    \r
2765         /* check conditions (a)..(e), see unormimp.h */\r
2766         norm32 = getNorm32(c);\r
2767 \r
2768         if((norm32&mask)!=0) {\r
2769             return false; /* fails (a)..(e), not skippable */\r
2770         }\r
2771     \r
2772         if(mode == Normalizer.NFD || mode == Normalizer.NFKD || mode == Normalizer.NONE){\r
2773             return true; /* NF*D, passed (a)..(c), is skippable */\r
2774         }\r
2775         /* check conditions (a)..(e), see unormimp.h */\r
2776 \r
2777         /* NF*C/FCC, passed (a)..(e) */\r
2778         if((norm32& QC_NFD)==0) {\r
2779             return true; /* no canonical decomposition, is skippable */\r
2780         }\r
2781     \r
2782         /* check Hangul syllables algorithmically */\r
2783         if(isNorm32HangulOrJamo(norm32)) {\r
2784             /* Jamo passed (a)..(e) above, must be Hangul */\r
2785             return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */\r
2786         }\r
2787     \r
2788         /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */\r
2789         /* NF*C, test (f) flag */\r
2790         if(!isFormatVersion_2_2) {\r
2791             return false; /* no (f) data, say not skippable to be safe */\r
2792         }\r
2793     \r
2794 \r
2795         aux = AuxTrieImpl.auxTrie.getCodePointValue(c);\r
2796         return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */\r
2797     \r
2798         /* } else { FCC, test fcd<=1 instead of the above } */\r
2799     }\r
2800     \r
2801     /*\r
2802         private static final boolean\r
2803     _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {\r
2804         // add the start code point to the USet \r
2805         uset_add((USet *)context, start);\r
2806         return TRUE;\r
2807     }\r
2808     */\r
2809 \r
2810     public static UnicodeSet addPropertyStarts(UnicodeSet set) {\r
2811         int c;\r
2812        \r
2813         /* add the start code point of each same-value range of each trie */\r
2814         //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);\r
2815         TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);\r
2816         RangeValueIterator.Element normResult = new RangeValueIterator.Element();\r
2817         \r
2818         while(normIter.next(normResult)){\r
2819             set.add(normResult.start);\r
2820         }\r
2821         \r
2822         //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);\r
2823         TrieIterator fcdIter  = new TrieIterator(FCDTrieImpl.fcdTrie);\r
2824         RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();\r
2825 \r
2826         while(fcdIter.next(fcdResult)){\r
2827             set.add(fcdResult.start);\r
2828         }\r
2829         \r
2830         if(isFormatVersion_2_1){\r
2831             //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);\r
2832             TrieIterator auxIter  = new TrieIterator(AuxTrieImpl.auxTrie);\r
2833             RangeValueIterator.Element auxResult = new RangeValueIterator.Element();\r
2834             while(auxIter.next(auxResult)){\r
2835                 set.add(auxResult.start);\r
2836             }\r
2837         }\r
2838         /* add Hangul LV syllables and LV+1 because of skippables */\r
2839         for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {\r
2840             set.add(c);\r
2841             set.add(c+1);\r
2842         }\r
2843         set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */\r
2844         return set; // for chaining\r
2845     }\r
2846 \r
2847     /**\r
2848      * Internal API, used in UCharacter.getIntPropertyValue().\r
2849      * @internal\r
2850      * @param c code point\r
2851      * @param modeValue numeric value compatible with Mode\r
2852      * @return numeric value compatible with QuickCheck\r
2853      */\r
2854     public static final int quickCheck(int c, int modeValue) {\r
2855         final int qcMask[/*UNORM_MODE_COUNT*/]={\r
2856             0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC\r
2857         };\r
2858 \r
2859         int norm32=(int)getNorm32(c)&qcMask[modeValue];\r
2860 \r
2861         if(norm32==0) {\r
2862             return 1; // YES\r
2863         } else if((norm32&QC_ANY_NO)!=0) {\r
2864             return 0; // NO\r
2865         } else /* _NORM_QC_ANY_MAYBE */ {\r
2866             return 2; // MAYBE;\r
2867         }\r
2868     }\r
2869 \r
2870     /**\r
2871      * Internal API, used by collation code.\r
2872      * Get access to the internal FCD trie table to be able to perform\r
2873      * incremental, per-code unit, FCD checks in collation.\r
2874      * One pointer is sufficient because the trie index values are offset\r
2875      * by the index size, so that the same pointer is used to access the trie \r
2876      * data.\r
2877      * @internal\r
2878      */\r
2879     ///CLOVER:OFF\r
2880     public CharTrie getFCDTrie(){\r
2881         return FCDTrieImpl.fcdTrie;\r
2882     }\r
2883     ///CLOVER:ON\r
2884 \r
2885 \r
2886     \r
2887    /* compare canonically equivalent ---------------------------------------- */\r
2888 \r
2889     /*\r
2890      * Compare two strings for canonical equivalence.\r
2891      * Further options include case-insensitive comparison and\r
2892      * code point order (as opposed to code unit order).\r
2893      *\r
2894      * In this function, canonical equivalence is optional as well.\r
2895      * If canonical equivalence is tested, then both strings must fulfill\r
2896      * the FCD check.\r
2897      *\r
2898      * Semantically, this is equivalent to\r
2899      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))\r
2900      * where code point order, NFD and foldCase are all optional.\r
2901      *\r
2902      * String comparisons almost always yield results before processing both \r
2903      * strings completely.\r
2904      * They are generally more efficient working incrementally instead of\r
2905      * performing the sub-processing (strlen, normalization, case-folding)\r
2906      * on the entire strings first.\r
2907      *\r
2908      * It is also unnecessary to not normalize identical characters.\r
2909      *\r
2910      * This function works in principle as follows:\r
2911      *\r
2912      * loop {\r
2913      *   get one code unit c1 from s1 (-1 if end of source)\r
2914      *   get one code unit c2 from s2 (-1 if end of source)\r
2915      *\r
2916      *   if(either string finished) {\r
2917      *     return result;\r
2918      *   }\r
2919      *   if(c1==c2) {\r
2920      *     continue;\r
2921      *   }\r
2922      *\r
2923      *   // c1!=c2\r
2924      *   try to decompose/case-fold c1/c2, and continue if one does;\r
2925      *\r
2926      *   // still c1!=c2 and neither decomposes/case-folds, return result\r
2927      *   return c1-c2;\r
2928      * }\r
2929      *\r
2930      * When a character decomposes, then the pointer for that source changes to\r
2931      * the decomposition, pushing the previous pointer onto a stack.\r
2932      * When the end of the decomposition is reached, then the code unit reader\r
2933      * pops the previous source from the stack.\r
2934      * (Same for case-folding.)\r
2935      *\r
2936      * This is complicated further by operating on variable-width UTF-16.\r
2937      * The top part of the loop works on code units, while lookups for decomposition\r
2938      * and case-folding need code points.\r
2939      * Code points are assembled after the equality/end-of-source part.\r
2940      * The source pointer is only advanced beyond all code units when the code point\r
2941      * actually decomposes/case-folds.\r
2942      *\r
2943      * If we were on a trail surrogate unit when assembling a code point,\r
2944      * and the code point decomposes/case-folds, then the decomposition/folding\r
2945      * result must be compared with the part of the other string that corresponds to\r
2946      * this string's lead surrogate.\r
2947      * Since we only assemble a code point when hitting a trail unit when the\r
2948      * preceding lead units were identical, we back up the other string by one unit\r
2949      * in such a case.\r
2950      *\r
2951      * The optional code point order comparison at the end works with\r
2952      * the same fix-up as the other code point order comparison functions.\r
2953      * See ustring.c and the comment near the end of this function.\r
2954      *\r
2955      * Assumption: A decomposition or case-folding result string never contains\r
2956      * a single surrogate. This is a safe assumption in the Unicode Standard.\r
2957      * Therefore, we do not need to check for surrogate pairs across\r
2958      * decomposition/case-folding boundaries.\r
2959      * Further assumptions (see verifications tstnorm.cpp):\r
2960      * The API function checks for FCD first, while the core function\r
2961      * first case-folds and then decomposes. This requires that case-folding does not\r
2962      * un-FCD any strings.\r
2963      *\r
2964      * The API function may also NFD the input and turn off decomposition.\r
2965      * This requires that case-folding does not un-NFD strings either.\r
2966      *\r
2967      * TODO If any of the above two assumptions is violated,\r
2968      * then this entire code must be re-thought.\r
2969      * If this happens, then a simple solution is to case-fold both strings up front\r
2970      * and to turn off UNORM_INPUT_IS_FCD.\r
2971      * We already do this when not both strings are in FCD because makeFCD\r
2972      * would be a partial NFD before the case folding, which does not work.\r
2973      * Note that all of this is only a problem when case-folding _and_\r
2974      * canonical equivalence come together.\r
2975      * \r
2976      * This function could be moved to a different source file, at increased cost\r
2977      * for calling the decomposition access function.\r
2978      */\r
2979     \r
2980     // stack element for previous-level source/decomposition pointers\r
2981     private static class CmpEquivLevel {\r
2982         char[] source;\r
2983         int start;\r
2984         int s;\r
2985         int limit;\r
2986     }\r
2987     \r
2988     /**\r
2989      * Get the canonical decomposition for one code point.\r
2990      * @param c code point\r
2991      * @param buffer out-only buffer for algorithmic decompositions of Hangul\r
2992      * @param length out-only, takes the length of the decomposition, if any\r
2993      * @return index into the extraData array, or 0 if none\r
2994      * @internal\r
2995      */\r
2996      private static int decompose(int c, char[] buffer) {\r
2997         \r
2998         long norm32;\r
2999         int length=0;\r
3000         norm32 = (long) ((UNSIGNED_INT_MASK) & NormTrieImpl.normTrie.getCodePointValue(c));\r
3001         if((norm32 & QC_NFD)!=0) {\r
3002             if(isNorm32HangulOrJamo(norm32)) {\r
3003                 /* Hangul syllable: decompose algorithmically */\r
3004                 char c2;\r
3005     \r
3006                 c-=HANGUL_BASE;\r
3007     \r
3008                 c2=(char)(c%JAMO_T_COUNT);\r
3009                 c/=JAMO_T_COUNT;\r
3010                 if(c2>0) {\r
3011                     buffer[2]=(char)(JAMO_T_BASE+c2);\r
3012                     length=3;\r
3013                 } else {\r
3014                     length=2;\r
3015                 }\r
3016                 buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);\r
3017                 buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);\r
3018                 return length;\r
3019             } else {\r
3020                 /* normal decomposition */\r
3021                 DecomposeArgs  args = new DecomposeArgs();\r
3022                 int index = decompose(norm32, args);\r
3023                 System.arraycopy(extraData,index,buffer,0,args.length);\r
3024                 return args.length ;\r
3025             }\r
3026         } else {\r
3027             return 0;\r
3028         }\r
3029     }\r
3030 \r
3031     private static int foldCase(int c, char[] dest, int destStart, int destLimit,\r
3032                                  int options){\r
3033         String src = UTF16.valueOf(c);\r
3034         String foldedStr = UCharacter.foldCase(src,options);\r
3035         char[] foldedC = foldedStr.toCharArray();\r
3036         for(int i=0;i<foldedC.length;i++){\r
3037             if(destStart<destLimit){\r
3038                 dest[destStart]=foldedC[i];\r
3039             }\r
3040             // always increment destStart so that we can return \r
3041             // the required length\r
3042             destStart++;\r
3043         }\r
3044         return (c==UTF16.charAt(foldedStr,0)) ? -destStart : destStart;\r
3045     }\r
3046     \r
3047     /*\r
3048      private static int foldCase(char[] src,int srcStart,int srcLimit,\r
3049                                 char[] dest, int destStart, int destLimit,\r
3050                                 int options){\r
3051         String source =new String(src,srcStart,(srcLimit-srcStart));\r
3052         String foldedStr = UCharacter.foldCase(source,options);\r
3053         char[] foldedC = foldedStr.toCharArray();\r
3054         for(int i=0;i<foldedC.length;i++){\r
3055             if(destStart<destLimit){\r
3056                 dest[destStart]=foldedC[i];\r
3057             }\r
3058             // always increment destStart so that we can return \r
3059             // the required length\r
3060             destStart++;\r
3061             \r
3062         }\r
3063         return destStart;\r
3064     }\r
3065     */\r
3066     public static int cmpEquivFold(String s1, String s2,int options){\r
3067         if ((options & Normalizer.COMPARE_IGNORE_CASE) !=0) {\r
3068             return cmpSimpleEquivFold(s1, s2, options);\r
3069         }\r
3070         else {\r
3071             return cmpEquivFold(s1.toCharArray(),0,s1.length(),\r
3072                                 s2.toCharArray(),0,s2.length(),\r
3073                                 options);\r
3074         }\r
3075     }\r
3076 \r
3077 \r
3078     private static int cmpSimpleEquivFold(String s1, String s2, int options) {\r
3079         int cmp = 0;\r
3080         int i=0, j=0;\r
3081         String foldS1=null;\r
3082         String foldS2=null;\r
3083         int offset1=1;\r
3084         int offset2=1;\r
3085         while ((i+offset1<=s1.length() && j+offset2<=s2.length())) {\r
3086             if ((cmp!=0) || (s1.charAt(i) != s2.charAt(j))) {\r
3087                 if(i>0 && j>0 && \r
3088                    (UTF16.isLeadSurrogate((char)s1.charAt(i-1)) ||\r
3089                     UTF16.isLeadSurrogate((char)s2.charAt(j-1)))) {\r
3090                     // Current codepoint may be the low surrogate pair.\r
3091                     return cmpEquivFold(s1.toCharArray(),i-1,s1.length(),\r
3092                             s2.toCharArray(),j-1,s2.length(),\r
3093                             options);\r
3094                 }\r
3095                 else if (UTF16.isLeadSurrogate((char)s1.charAt(i))||\r
3096                          UTF16.isLeadSurrogate((char)s2.charAt(j))) {\r
3097                     return cmpEquivFold(s1.toCharArray(),i,s1.length(),\r
3098                             s2.toCharArray(),j,s2.length(),\r
3099                             options);\r
3100                 }\r
3101                 else {\r
3102                     if ( offset1 > 0 ) {\r
3103                         foldS1 = UCharacter.foldCase(s1.substring(i, i+offset1),options);\r
3104                     }\r
3105                     if ( offset2 > 0 ) {\r
3106                         foldS2 = UCharacter.foldCase(s2.substring(j, j+offset2),options);\r
3107                     }\r
3108                     cmp = foldS1.compareTo(foldS2);\r
3109                     if (cmp==0) {\r
3110                         i = moveToNext(i, offset1);\r
3111                         j = moveToNext(j, offset2);\r
3112                         offset1 = offset2 = 1;\r
3113                         continue;\r
3114                     }\r
3115                 }\r
3116                 if (foldS1.length()==foldS2.length()) {\r
3117                     return cmp;\r
3118                 }\r
3119                 if (foldS1.length()<foldS2.length()) {\r
3120                     offset1++;\r
3121                     offset2=0;\r
3122                 }\r
3123                 else {\r
3124                     offset1=0;\r
3125                     offset2++;\r
3126                 }\r
3127                 continue;\r
3128             }\r
3129             i++;\r
3130             j++;\r
3131         }\r
3132         if (cmp!=0) {\r
3133             return cmp;\r
3134         }\r
3135         if (i+offset1-1==s1.length()) {\r
3136             if (j+offset2-1==s2.length()) {\r
3137                 return 0;\r
3138             }\r
3139             else {\r
3140                 return -1;\r
3141             }\r
3142         }\r
3143         else {\r
3144             return 1;\r
3145         }\r
3146     }\r
3147     \r
3148     private static int moveToNext(int pos, int offset) {\r
3149         if (offset>0) {\r
3150             return pos+offset;\r
3151         }\r
3152         else {\r
3153             return pos+1;\r
3154         }\r
3155     }\r
3156     \r
3157     \r
3158     // internal function\r
3159     public static int cmpEquivFold(char[] s1, int s1Start,int s1Limit,\r
3160                                    char[] s2, int s2Start,int s2Limit,\r
3161                                    int options) {\r
3162         // current-level start/limit - s1/s2 as current\r
3163         int start1, start2, limit1, limit2;\r
3164         char[] cSource1, cSource2;\r
3165         \r
3166         cSource1 = s1;\r
3167         cSource2 = s2;\r
3168         // decomposition variables\r
3169         int length;\r
3170     \r
3171         // stacks of previous-level start/current/limit\r
3172         CmpEquivLevel[] stack1 = new CmpEquivLevel[]{ \r
3173                                                     new CmpEquivLevel(),\r
3174                                                     new CmpEquivLevel()\r
3175                                                   };\r
3176         CmpEquivLevel[] stack2 = new CmpEquivLevel[]{ \r
3177                                                     new CmpEquivLevel(),\r
3178                                                     new CmpEquivLevel()\r
3179                                                   };\r
3180     \r
3181         // decomposition buffers for Hangul\r
3182         char[] decomp1 = new char[8];\r
3183         char[] decomp2 = new char[8];\r
3184     \r
3185         // case folding buffers, only use current-level start/limit\r
3186         char[] fold1 = new char[32];\r
3187         char[] fold2 = new char[32];\r
3188     \r
3189         // track which is the current level per string\r
3190         int level1, level2;\r
3191     \r
3192         // current code units, and code points for lookups\r
3193         int c1, c2;\r
3194         int cp1, cp2;\r
3195     \r
3196         // no argument error checking because this itself is not an API\r
3197     \r
3198         // assume that at least one of the options COMPARE_EQUIV and \r
3199         // COMPARE_IGNORE_CASE is set\r
3200         // otherwise this function must behave exactly as uprv_strCompare()\r
3201         // not checking for that here makes testing this function easier\r
3202 \r
3203     \r
3204         // initialize\r
3205         start1=s1Start;\r
3206         limit1=s1Limit;\r
3207         \r
3208         start2=s2Start;\r
3209         limit2=s2Limit;\r
3210         \r
3211         level1=level2=0;\r
3212         c1=c2=-1;\r
3213         cp1=cp2=-1;\r
3214         // comparison loop\r
3215         for(;;) {\r
3216             // here a code unit value of -1 means "get another code unit"\r
3217             // below it will mean "this source is finished"\r
3218     \r
3219             if(c1<0) {\r
3220                 // get next code unit from string 1, post-increment\r
3221                 for(;;) {\r
3222                     if(s1Start>=limit1) {\r
3223                         if(level1==0) {\r
3224                             c1=-1;\r
3225                             break;\r
3226                         }\r
3227                     } else {\r
3228                         c1=cSource1[s1Start];\r
3229                         ++s1Start;\r
3230                         break;\r
3231                     }\r
3232     \r
3233                     // reached end of level buffer, pop one level\r
3234                     do {\r
3235                         --level1;\r
3236                         start1=stack1[level1].start;\r
3237                     } while(start1==-1); //###### check this\r
3238                     s1Start=stack1[level1].s;\r
3239                     limit1=stack1[level1].limit;\r
3240                     cSource1=stack1[level1].source;\r
3241                 }\r
3242             }\r
3243     \r
3244             if(c2<0) {\r
3245                 // get next code unit from string 2, post-increment\r
3246                 for(;;) {                    \r
3247                     if(s2Start>=limit2) {\r
3248                         if(level2==0) {\r
3249                             c2=-1;\r
3250                             break;\r
3251                         }\r
3252                     } else {\r
3253                         c2=cSource2[s2Start];\r
3254                         ++s2Start;\r
3255                         break;\r
3256                     }\r
3257     \r
3258                     // reached end of level buffer, pop one level\r
3259                     do {\r
3260                         --level2;\r
3261                         start2=stack2[level2].start;\r
3262                     } while(start2==-1);\r
3263                     s2Start=stack2[level2].s;\r
3264                     limit2=stack2[level2].limit;\r
3265                     cSource2=stack2[level2].source;\r
3266                 }\r
3267             }\r
3268     \r
3269             // compare c1 and c2\r
3270             // either variable c1, c2 is -1 only if the corresponding string \r
3271             // is finished\r
3272             if(c1==c2) {\r
3273                 if(c1<0) {\r
3274                     return 0;   // c1==c2==-1 indicating end of strings\r
3275                 }\r
3276                 c1=c2=-1;       // make us fetch new code units\r
3277                 continue;\r
3278             } else if(c1<0) {\r
3279                 return -1;      // string 1 ends before string 2\r
3280             } else if(c2<0) {\r
3281                 return 1;       // string 2 ends before string 1\r
3282             }\r
3283             // c1!=c2 && c1>=0 && c2>=0\r
3284     \r
3285             // get complete code points for c1, c2 for lookups if either is a \r
3286             // surrogate\r
3287             cp1=c1;\r
3288             if(UTF16.isSurrogate((char)c1)) { \r
3289                 char c;\r
3290     \r
3291                 if(UTF16.isLeadSurrogate((char)c1)) {\r
3292                     if(    s1Start!=limit1 && \r
3293                            UTF16.isTrailSurrogate(c=cSource1[s1Start])\r
3294                       ) {\r
3295                         // advance ++s1; only below if cp1 decomposes/case-folds\r
3296                         cp1=UCharacterProperty.getRawSupplementary((char)c1, c);\r
3297                     }\r
3298                 } else /* isTrail(c1) */ {\r
3299                     if(    start1<=(s1Start-2) && \r
3300                             UTF16.isLeadSurrogate(c=cSource1[(s1Start-2)])\r
3301                       ) {\r
3302                         cp1=UCharacterProperty.getRawSupplementary(c, (char)c1);\r
3303                     }\r
3304                 }\r
3305             }\r
3306             cp2=c2;\r
3307             if(UTF16.isSurrogate((char)c2)) {\r
3308                 char c;\r
3309     \r
3310                 if(UTF16.isLeadSurrogate((char)c2)) {\r
3311                     if(    s2Start!=limit2 && \r
3312                            UTF16.isTrailSurrogate(c=cSource2[s2Start])\r
3313                       ) {\r
3314                         // advance ++s2; only below if cp2 decomposes/case-folds\r
3315                         cp2=UCharacterProperty.getRawSupplementary((char)c2, c);\r
3316                     }\r
3317                 } else /* isTrail(c2) */ {\r
3318                     if(    start2<=(s2Start-2) &&  \r
3319                            UTF16.isLeadSurrogate(c=cSource2[s2Start-2])\r
3320                       ) {\r
3321                         cp2=UCharacterProperty.getRawSupplementary(c, (char)c2);\r
3322                     }\r
3323                 }\r
3324             }\r
3325     \r
3326             // go down one level for each string\r
3327             // continue with the main loop as soon as there is a real change\r
3328             if( level1<2 && ((options & Normalizer.COMPARE_IGNORE_CASE)!=0)&&\r
3329                 (length=foldCase(cp1, fold1, 0,32,options))>=0\r
3330             ) {\r
3331                 // cp1 case-folds to fold1[length]\r
3332                 if(UTF16.isSurrogate((char)c1)) {\r
3333                     if(UTF16.isLeadSurrogate((char)c1)) {\r
3334                         // advance beyond source surrogate pair if it \r
3335                         // case-folds\r
3336                         ++s1Start;\r
3337                     } else /* isTrail(c1) */ {\r
3338                         // we got a supplementary code point when hitting its \r
3339                         // trail surrogate, therefore the lead surrogate must \r
3340                         // have been the same as in the other string;\r
3341                         // compare this decomposition with the lead surrogate\r
3342                         // in the other string\r
3343                         --s2Start;\r
3344                         c2=cSource2[(s2Start-1)];\r
3345                     }\r
3346                 }\r
3347     \r
3348                 // push current level pointers\r
3349                 stack1[0].start=start1;\r
3350                 stack1[0].s=s1Start;\r
3351                 stack1[0].limit=limit1;\r
3352                 stack1[0].source=cSource1;\r
3353                 ++level1;\r
3354     \r
3355                 cSource1 = fold1;\r
3356                 start1=s1Start=0;\r
3357                 limit1=length;\r
3358     \r
3359                 // get ready to read from decomposition, continue with loop\r
3360                 c1=-1;\r
3361                 continue;\r
3362             }\r
3363     \r
3364             if( level2<2 && ((options& Normalizer.COMPARE_IGNORE_CASE)!=0) &&\r
3365                 (length=foldCase(cp2, fold2,0,32, options))>=0\r
3366             ) {\r
3367                 // cp2 case-folds to fold2[length]\r
3368                 if(UTF16.isSurrogate((char)c2)) {\r
3369                     if(UTF16.isLeadSurrogate((char)c2)) {\r
3370                         // advance beyond source surrogate pair if it \r
3371                         // case-folds\r
3372                         ++s2Start;\r
3373                     } else /* isTrail(c2) */ {\r
3374                         // we got a supplementary code point when hitting its \r
3375                         // trail surrogate, therefore the lead surrogate must \r
3376                         // have been the same as in the other string;\r
3377                         // compare this decomposition with the lead surrogate \r
3378                         // in the other string\r
3379                         --s1Start;\r
3380                         c1=cSource1[(s1Start-1)];\r
3381                     }\r
3382                 }\r
3383     \r
3384                 // push current level pointers\r
3385                 stack2[0].start=start2;\r
3386                 stack2[0].s=s2Start;\r
3387                 stack2[0].limit=limit2;\r
3388                 stack2[0].source=cSource2;\r
3389                 ++level2;\r
3390                 \r
3391                 cSource2 = fold2;\r
3392                 start2=s2Start=0;\r
3393                 limit2=length;\r
3394     \r
3395                 // get ready to read from decomposition, continue with loop\r
3396                 c2=-1;\r
3397                 continue;\r
3398             }\r
3399             \r
3400             if( level1<2 && ((options&COMPARE_EQUIV)!=0) &&\r
3401                 0!=(length=decompose(cp1,decomp1))\r
3402             ) {\r
3403                 // cp1 decomposes into p[length]\r
3404                 if(UTF16.isSurrogate((char)c1)) {\r
3405                     if(UTF16.isLeadSurrogate((char)c1)) {\r
3406                         // advance beyond source surrogate pair if it \r
3407                         //decomposes\r
3408                         ++s1Start;\r
3409                     } else /* isTrail(c1) */ {\r
3410                         // we got a supplementary code point when hitting \r
3411                         // its trail surrogate, therefore the lead surrogate \r
3412                         // must have been the same as in the other string;\r
3413                         // compare this decomposition with the lead surrogate \r
3414                         // in the other string\r
3415                         --s2Start;\r
3416                         c2=cSource2[(s2Start-1)];\r
3417                     }\r
3418                 }\r
3419     \r
3420                 // push current level pointers\r
3421                 stack1[level1].start=start1;\r
3422                 stack1[level1].s=s1Start;\r
3423                 stack1[level1].limit=limit1;\r
3424                 stack1[level1].source=cSource1;\r
3425                 ++level1;\r
3426     \r
3427                 // set next level pointers to decomposition\r
3428                 cSource1 = decomp1;\r
3429                 start1=s1Start=0;\r
3430                 limit1=length;\r
3431                 \r
3432                 // set empty intermediate level if skipped\r
3433                 if(level1<2) {\r
3434                     stack1[level1++].start=-1;\r
3435                 }\r
3436                 // get ready to read from decomposition, continue with loop\r
3437                 c1=-1;\r
3438                 continue;\r
3439             }\r
3440     \r
3441             if( level2<2 && ((options&COMPARE_EQUIV)!=0) &&\r
3442                 0!=(length=decompose(cp2, decomp2))\r
3443             ) {\r
3444                 // cp2 decomposes into p[length]\r
3445                 if(UTF16.isSurrogate((char)c2)) {\r
3446                     if(UTF16.isLeadSurrogate((char)c2)) {\r
3447                         // advance beyond source surrogate pair if it \r
3448                         // decomposes\r
3449                         ++s2Start;\r
3450                     } else /* isTrail(c2) */ {\r
3451                         // we got a supplementary code point when hitting its \r
3452                         // trail surrogate, therefore the lead surrogate must \r
3453                         // have been the same as in the other string;\r
3454                         // compare this decomposition with the lead surrogate \r
3455                         // in the other string\r
3456                         --s1Start;\r
3457                         c1=cSource1[(s1Start-1)];\r
3458                     }\r
3459                 }\r
3460     \r
3461                 // push current level pointers\r
3462                 stack2[level2].start=start2;\r
3463                 stack2[level2].s=s2Start;\r
3464                 stack2[level2].limit=limit2;\r
3465                 stack2[level2].source=cSource2;\r
3466                 ++level2;\r
3467     \r
3468                 // set next level pointers to decomposition\r
3469                 cSource2=decomp2;\r
3470                 start2=s2Start=0;\r
3471                 limit2=length;\r
3472                 \r
3473                 // set empty intermediate level if skipped\r
3474                 if(level2<2) {\r
3475                     stack2[level2++].start=-1;\r
3476                 }\r
3477                 \r
3478                 // get ready to read from decomposition, continue with loop\r
3479                 c2=-1;\r
3480                 continue;\r
3481             }\r
3482     \r
3483     \r
3484             // no decomposition/case folding, max level for both sides:\r
3485             // return difference result\r
3486     \r
3487             // code point order comparison must not just return cp1-cp2\r
3488             // because when single surrogates are present then the surrogate \r
3489             // pairs that formed cp1 and cp2 may be from different string \r
3490             // indexes\r
3491     \r
3492             // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at \r
3493             // second code units\r
3494             // c1=d800 cp1=10001 c2=dc00 cp2=10000\r
3495             // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 \r
3496             // it is { d800 10001 } < { 10000 }\r
3497             // therefore fix-up \r
3498     \r
3499             if(     c1>=0xd800 && c2>=0xd800 && \r
3500                     ((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0)\r
3501               ) {\r
3502                 /* subtract 0x2800 from BMP code points to make them smaller \r
3503                  * than supplementary ones */\r
3504                 if(\r
3505                     (    c1<=0xdbff && s1Start!=limit1 \r
3506                          && \r
3507                          UTF16.isTrailSurrogate(cSource1[s1Start])\r
3508                     ) \r
3509                      ||\r
3510                     (    UTF16.isTrailSurrogate((char)c1) && start1!=(s1Start-1) \r
3511                          && \r
3512                          UTF16.isLeadSurrogate(cSource1[(s1Start-2)])\r
3513                     )\r
3514                 ) {\r
3515                     /* part of a surrogate pair, leave >=d800 */\r
3516                 } else {\r
3517                     /* BMP code point - may be surrogate code point - \r
3518                      * make <d800 */\r
3519                     c1-=0x2800;\r
3520                 }\r
3521     \r
3522                 if(\r
3523                     (    c2<=0xdbff && s2Start!=limit2 \r
3524                          && \r
3525                          UTF16.isTrailSurrogate(cSource2[s2Start])\r
3526                     ) \r
3527                      ||\r
3528                     (    UTF16.isTrailSurrogate((char)c2) && start2!=(s2Start-1) \r
3529                          && \r
3530                          UTF16.isLeadSurrogate(cSource2[(s2Start-2)])\r
3531                     )\r
3532                 ) {\r
3533                     /* part of a surrogate pair, leave >=d800 */\r
3534                 } else {\r
3535                     /* BMP code point - may be surrogate code point - \r
3536                      * make <d800 */\r
3537                     c2-=0x2800;\r
3538                 }\r
3539             }\r
3540     \r
3541             return c1-c2;\r
3542         }\r
3543     }\r
3544     private static int strCompare(char[] s1, int s1Start, int s1Limit,\r
3545                                   char[] s2, int s2Start, int s2Limit,\r
3546                                   boolean codePointOrder) {\r
3547                         \r
3548         int start1, start2, limit1, limit2;\r
3549  \r
3550         char c1, c2;\r
3551     \r
3552         /* setup for fix-up */\r
3553         start1=s1Start;\r
3554         start2=s2Start;\r
3555         \r
3556         int length1, length2;\r
3557         \r
3558         length1 = s1Limit - s1Start;\r
3559         length2 = s2Limit - s2Start;\r
3560             \r
3561         int lengthResult;\r
3562 \r
3563         if(length1<length2) {\r
3564             lengthResult=-1;\r
3565             limit1=start1+length1;\r
3566         } else if(length1==length2) {\r
3567             lengthResult=0;\r
3568             limit1=start1+length1;\r
3569         } else /* length1>length2 */ {\r
3570             lengthResult=1;\r
3571             limit1=start1+length2;\r
3572         }\r
3573 \r
3574         if(s1==s2) {\r
3575             return lengthResult;\r
3576         }\r
3577 \r
3578         for(;;) {\r
3579             /* check pseudo-limit */\r
3580             if(s1Start==limit1) {\r
3581                 return lengthResult;\r
3582             }\r
3583 \r
3584             c1=s1[s1Start];\r
3585             c2=s2[s2Start];\r
3586             if(c1!=c2) {\r
3587                 break;\r
3588             }\r
3589             ++s1Start;\r
3590             ++s2Start;\r
3591         }\r
3592 \r
3593         /* setup for fix-up */\r
3594         limit1=start1+length1;\r
3595         limit2=start2+length2;\r
3596 \r
3597     \r
3598         /* if both values are in or above the surrogate range, fix them up */\r
3599         if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {\r
3600             /* subtract 0x2800 from BMP code points to make them smaller than\r
3601              *  supplementary ones */\r
3602             if(\r
3603                 ( c1<=0xdbff && (s1Start+1)!=limit1 && \r
3604                   UTF16.isTrailSurrogate(s1[(s1Start+1)])\r
3605                 ) ||\r
3606                 ( UTF16.isTrailSurrogate(c1) && start1!=s1Start && \r
3607                   UTF16.isLeadSurrogate(s1[(s1Start-1)])\r
3608                 )\r
3609             ) {\r
3610                 /* part of a surrogate pair, leave >=d800 */\r
3611             } else {\r
3612                 /* BMP code point - may be surrogate code point - make <d800 */\r
3613                 c1-=0x2800;\r
3614             }\r
3615     \r
3616             if(\r
3617                 ( c2<=0xdbff && (s2Start+1)!=limit2 && \r
3618                   UTF16.isTrailSurrogate(s2[(s2Start+1)])\r
3619                 ) ||\r
3620                 ( UTF16.isTrailSurrogate(c2) && start2!=s2Start && \r
3621                   UTF16.isLeadSurrogate(s2[(s2Start-1)])\r
3622                 )\r
3623             ) {\r
3624                 /* part of a surrogate pair, leave >=d800 */\r
3625             } else {\r
3626                 /* BMP code point - may be surrogate code point - make <d800 */\r
3627                 c2-=0x2800;\r
3628             }\r
3629         }\r
3630     \r
3631         /* now c1 and c2 are in UTF-32-compatible order */\r
3632         return (int)c1-(int)c2;\r
3633     }\r
3634 \r
3635 \r
3636     /*\r
3637      * Status of tailored normalization\r
3638      *\r
3639      * This was done initially for investigation on Unicode public review issue 7\r
3640      * (http://www.unicode.org/review/). See Jitterbug 2481.\r
3641      * While the UTC at meeting #94 (2003mar) did not take up the issue, this is\r
3642      * a permanent feature in ICU 2.6 in support of IDNA which requires true\r
3643      * Unicode 3.2 normalization.\r
3644      * (NormalizationCorrections are rolled into IDNA mapping tables.)\r
3645      *\r
3646      * Tailored normalization as implemented here allows to "normalize less"\r
3647      * than full Unicode normalization would.\r
3648      * Based internally on a UnicodeSet of code points that are\r
3649      * "excluded from normalization", the normalization functions leave those\r
3650      * code points alone ("inert"). This means that tailored normalization\r
3651      * still transforms text into a canonically equivalent form.\r
3652      * It does not add decompositions to code points that do not have any or\r
3653      * change decomposition results.\r
3654      *\r
3655      * Any function that searches for a safe boundary has not been touched,\r
3656      * which means that these functions will be over-pessimistic when\r
3657      * exclusions are applied.\r
3658      * This should not matter because subsequent checks and normalizations\r
3659      * do apply the exclusions; only a little more of the text may be processed\r
3660      * than necessary under exclusions.\r
3661      *\r
3662      * Normalization exclusions have the following effect on excluded code points c:\r
3663      * - c is not decomposed\r
3664      * - c is not a composition target\r
3665      * - c does not combine forward or backward for composition\r
3666      *   except that this is not implemented for Jamo\r
3667      * - c is treated as having a combining class of 0\r
3668      */\r
3669      \r
3670     /* \r
3671      * Constants for the bit fields in the options bit set parameter. \r
3672      * These need not be public. \r
3673      * A user only needs to know the currently assigned values. \r
3674      * The number and positions of reserved bits per field can remain private. \r
3675      */ \r
3676     private static final int OPTIONS_NX_MASK=0x1f;\r
3677     private static final int OPTIONS_UNICODE_MASK=0xe0; \r
3678     public  static final int OPTIONS_SETS_MASK=0xff;\r
3679     //private static final int OPTIONS_UNICODE_SHIFT=5;\r
3680     private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];\r
3681      \r
3682     /* Constants for options flags for normalization.*/\r
3683 \r
3684     /** \r
3685      * Options bit 0, do not decompose Hangul syllables. \r
3686      */\r
3687     private static final int NX_HANGUL = 1;\r
3688     /** \r
3689      * Options bit 1, do not decompose CJK compatibility characters.\r
3690      */\r
3691     private static final int NX_CJK_COMPAT=2;\r
3692     /**\r
3693      * Options bit 8, use buggy recomposition described in\r
3694      * Unicode Public Review Issue #29\r
3695      * at http://www.unicode.org/review/resolved-pri.html#pri29\r
3696      *\r
3697      * Used in IDNA implementation according to strict interpretation\r
3698      * of IDNA definition based on Unicode 3.2 which predates PRI #29.\r
3699      *\r
3700      * See ICU4C unormimp.h\r
3701      */\r
3702     public static final int BEFORE_PRI_29=0x100;\r
3703 \r
3704     /*\r
3705      * The following options are used only in some composition functions.\r
3706      * They use bits 12 and up to preserve lower bits for the available options\r
3707      * space in unorm_compare() -\r
3708      * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.\r
3709      */\r
3710 \r
3711     /** Options bit 12, for compatibility vs. canonical decomposition. */\r
3712     public static final int OPTIONS_COMPAT=0x1000;\r
3713     /** Options bit 13, no discontiguous composition (FCC vs. NFC). */\r
3714     public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;\r
3715 \r
3716     /* normalization exclusion sets --------------------------------------------- */\r
3717     \r
3718     /*\r
3719      * Normalization exclusion UnicodeSets are used for tailored normalization;\r
3720      * see the comment near the beginning of this file.\r
3721      *\r
3722      * By specifying one or several sets of code points,\r
3723      * those code points become inert for normalization.\r
3724      */\r
3725     private static final synchronized UnicodeSet internalGetNXHangul() {\r
3726         /* internal function, does not check for incoming U_FAILURE */\r
3727     \r
3728         if(nxCache[NX_HANGUL]==null) {\r
3729              nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);\r
3730         }\r
3731         return nxCache[NX_HANGUL];\r
3732     }\r
3733     \r
3734     private static final synchronized UnicodeSet internalGetNXCJKCompat() {\r
3735         /* internal function, does not check for incoming U_FAILURE */\r
3736     \r
3737         if(nxCache[NX_CJK_COMPAT]==null) {\r
3738 \r
3739             /* build a set from [CJK Ideographs]&[has canonical decomposition] */\r
3740             UnicodeSet set, hasDecomp;\r
3741     \r
3742             set=new UnicodeSet("[:Ideographic:]");\r
3743     \r
3744             /* start with an empty set for [has canonical decomposition] */\r
3745             hasDecomp=new UnicodeSet();\r
3746     \r
3747             /* iterate over all ideographs and remember which canonically decompose */\r
3748             UnicodeSetIterator it = new UnicodeSetIterator(set);\r
3749             int start, end;\r
3750             long norm32;\r
3751     \r
3752             while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {\r
3753                 start=it.codepoint;\r
3754                 end=it.codepointEnd;\r
3755                 while(start<=end) {\r
3756                     norm32 = getNorm32(start);\r
3757                     if((norm32 & QC_NFD)>0) {\r
3758                         hasDecomp.add(start);\r
3759                     }\r
3760                     ++start;\r
3761                 }\r
3762             }\r
3763     \r
3764             /* hasDecomp now contains all ideographs that decompose canonically */\r
3765              nxCache[NX_CJK_COMPAT]=hasDecomp;\r
3766          \r
3767         }\r
3768     \r
3769         return nxCache[NX_CJK_COMPAT];\r
3770     }\r
3771     \r
3772     private static final synchronized UnicodeSet internalGetNXUnicode(int options) {\r
3773         options &= OPTIONS_UNICODE_MASK;\r
3774         if(options==0) {\r
3775             return null;\r
3776         }\r
3777     \r
3778         if(nxCache[options]==null) {\r
3779             /* build a set with all code points that were not designated by the specified Unicode version */\r
3780             UnicodeSet set = new UnicodeSet();\r
3781 \r
3782             switch(options) {\r
3783             case Normalizer.UNICODE_3_2:\r
3784                 set.applyPattern("[:^Age=3.2:]");\r
3785                 break;\r
3786             default:\r
3787                 return null;\r
3788             }\r
3789             \r
3790             nxCache[options]=set;\r
3791         }\r
3792     \r
3793         return nxCache[options];\r
3794     }\r
3795     \r
3796     /* Get a decomposition exclusion set. The data must be loaded. */\r
3797     private static final synchronized UnicodeSet internalGetNX(int options) {\r
3798         options&=OPTIONS_SETS_MASK;\r
3799     \r
3800         if(nxCache[options]==null) {\r
3801             /* return basic sets */            \r
3802             if(options==NX_HANGUL) {\r
3803                 return internalGetNXHangul();\r
3804             }\r
3805             if(options==NX_CJK_COMPAT) {\r
3806                 return internalGetNXCJKCompat();\r
3807             }\r
3808             if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {\r
3809                 return internalGetNXUnicode(options);\r
3810             }\r
3811     \r
3812             /* build a set from multiple subsets */\r
3813             UnicodeSet set;\r
3814             UnicodeSet other;\r
3815     \r
3816             set=new UnicodeSet();\r
3817 \r
3818     \r
3819             if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {\r
3820                 set.addAll(other);\r
3821             }\r
3822             if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {\r
3823                 set.addAll(other);\r
3824             }\r
3825             if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {\r
3826                 set.addAll(other);\r
3827             }\r
3828 \r
3829                nxCache[options]=set;\r
3830         }\r
3831         return nxCache[options];\r
3832     }\r
3833     \r
3834     public static final UnicodeSet getNX(int options) {\r
3835         if((options&=OPTIONS_SETS_MASK)==0) {\r
3836             /* incoming failure, or no decomposition exclusions requested */\r
3837             return null;\r
3838         } else {\r
3839             return internalGetNX(options);\r
3840         }\r
3841     }\r
3842     \r
3843     private static final boolean nx_contains(UnicodeSet nx, int c) {\r
3844         return nx!=null && nx.contains(c);\r
3845     }\r
3846     \r
3847     private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {\r
3848         return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));\r
3849     }\r
3850 \r
3851 \r
3852 }\r