]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / core / src / com / ibm / icu / impl / Normalizer2Impl.java
1 /*\r
2 *******************************************************************************\r
3 *   Copyright (C) 2009-2010, International Business Machines\r
4 *   Corporation and others.  All Rights Reserved.\r
5 *******************************************************************************\r
6 */\r
7 package com.ibm.icu.impl;\r
8 \r
9 import java.io.BufferedInputStream;\r
10 import java.io.DataInputStream;\r
11 import java.io.IOException;\r
12 import java.io.InputStream;\r
13 import java.util.ArrayList;\r
14 import java.util.Iterator;\r
15 \r
16 import com.ibm.icu.text.UTF16;\r
17 import com.ibm.icu.text.UnicodeSet;\r
18 import com.ibm.icu.util.VersionInfo;\r
19 \r
20 public final class Normalizer2Impl {\r
21     public static final class Hangul {\r
22         /* Korean Hangul and Jamo constants */\r
23         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */\r
24         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */\r
25         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */\r
26 \r
27         public static final int HANGUL_BASE=0xac00;\r
28 \r
29         public static final int JAMO_L_COUNT=19;\r
30         public static final int JAMO_V_COUNT=21;\r
31         public static final int JAMO_T_COUNT=28;\r
32 \r
33         public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;\r
34         public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;\r
35 \r
36         public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;\r
37 \r
38         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;\r
39         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;\r
40 \r
41         public static boolean isHangul(int c) {\r
42             return HANGUL_BASE<=c && c<HANGUL_LIMIT;\r
43         }\r
44         public static boolean isHangulWithoutJamoT(char c) {\r
45             c-=HANGUL_BASE;\r
46             return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;\r
47         }\r
48         public static boolean isJamoL(int c) {\r
49             return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;\r
50         }\r
51         public static boolean isJamoV(int c) {\r
52             return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;\r
53         }\r
54 \r
55         /**\r
56          * Decomposes c, which must be a Hangul syllable, into buffer\r
57          * and returns the length of the decomposition (2 or 3).\r
58          */\r
59         public static int decompose(int c, Appendable buffer) {\r
60             try {\r
61                 c-=HANGUL_BASE;\r
62                 int c2=c%JAMO_T_COUNT;\r
63                 c/=JAMO_T_COUNT;\r
64                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));\r
65                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));\r
66                 if(c2==0) {\r
67                     return 2;\r
68                 } else {\r
69                     buffer.append((char)(JAMO_T_BASE+c2));\r
70                     return 3;\r
71                 }\r
72             } catch(IOException e) {\r
73                 // Will not occur because we do not write to I/O.\r
74                 throw new RuntimeException(e);\r
75             }\r
76         }\r
77     }\r
78 \r
79     /**\r
80      * Writable buffer that takes care of canonical ordering.\r
81      * Its Appendable methods behave like the C++ implementation's\r
82      * appendZeroCC() methods.\r
83      * <p>\r
84      * If dest is a StringBuilder, then the buffer writes directly to it.\r
85      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments\r
86      * until no further changes are necessary and whole segments are appended.\r
87      * append() methods that take combining-class values always write to the StringBuilder.\r
88      * Other append() methods flush and append to the Appendable.\r
89      */\r
90     public static final class ReorderingBuffer implements Appendable {\r
91         public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {\r
92             impl=ni;\r
93             app=dest;\r
94             if(app instanceof StringBuilder) {\r
95                 appIsStringBuilder=true;\r
96                 str=(StringBuilder)dest;\r
97                 // In Java, the constructor subsumes public void init(int destCapacity) {\r
98                 str.ensureCapacity(destCapacity);\r
99                 reorderStart=0;\r
100                 if(str.length()==0) {\r
101                     lastCC=0;\r
102                 } else {\r
103                     setIterator();\r
104                     lastCC=previousCC();\r
105                     // Set reorderStart after the last code point with cc<=1 if there is one.\r
106                     if(lastCC>1) {\r
107                         while(previousCC()>1) {}\r
108                     }\r
109                     reorderStart=codePointLimit;\r
110                 }\r
111             } else {\r
112                 appIsStringBuilder=false;\r
113                 str=new StringBuilder();\r
114                 reorderStart=0;\r
115                 lastCC=0;\r
116             }\r
117         }\r
118 \r
119         public boolean isEmpty() { return str.length()==0; }\r
120         public int length() { return str.length(); }\r
121         public int getLastCC() { return lastCC; }\r
122 \r
123         public StringBuilder getStringBuilder() { return str; }\r
124 \r
125         public boolean equals(CharSequence s, int start, int limit) {\r
126             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);\r
127         }\r
128 \r
129         // For Hangul composition, replacing the Leading consonant Jamo with the syllable.\r
130         public void setLastChar(char c) {\r
131             str.setCharAt(str.length()-1, c);\r
132         }\r
133 \r
134         public void append(int c, int cc) {\r
135             if(lastCC<=cc || cc==0) {\r
136                 str.appendCodePoint(c);\r
137                 lastCC=cc;\r
138                 if(cc<=1) {\r
139                     reorderStart=str.length();\r
140                 }\r
141             } else {\r
142                 insert(c, cc);\r
143             }\r
144         }\r
145         // s must be in NFD, otherwise change the implementation.\r
146         public void append(CharSequence s, int start, int limit,\r
147                            int leadCC, int trailCC) {\r
148             if(start==limit) {\r
149                 return;\r
150             }\r
151             if(lastCC<=leadCC || leadCC==0) {\r
152                 if(trailCC<=1) {\r
153                     reorderStart=str.length()+(limit-start);\r
154                 } else if(leadCC<=1) {\r
155                     reorderStart=str.length()+1;  // Ok if not a code point boundary.\r
156                 }\r
157                 str.append(s, start, limit);\r
158                 lastCC=trailCC;\r
159             } else {\r
160                 int c=Character.codePointAt(s, start);\r
161                 start+=Character.charCount(c);\r
162                 insert(c, leadCC);  // insert first code point\r
163                 while(start<limit) {\r
164                     c=Character.codePointAt(s, start);\r
165                     start+=Character.charCount(c);\r
166                     if(start<limit) {\r
167                         // s must be in NFD, otherwise we need to use getCC().\r
168                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));\r
169                     } else {\r
170                         leadCC=trailCC;\r
171                     }\r
172                     append(c, leadCC);\r
173                 }\r
174             }\r
175         }\r
176         // The following append() methods work like C++ appendZeroCC().\r
177         // They assume that the cc or trailCC of their input is 0.\r
178         // Most of them implement Appendable interface methods.\r
179         // @Override when we switch to Java 6\r
180         public ReorderingBuffer append(char c) {\r
181             str.append(c);\r
182             lastCC=0;\r
183             reorderStart=str.length();\r
184             return this;\r
185         }\r
186         public void appendZeroCC(int c) {\r
187             str.appendCodePoint(c);\r
188             lastCC=0;\r
189             reorderStart=str.length();\r
190         }\r
191         // @Override when we switch to Java 6\r
192         public ReorderingBuffer append(CharSequence s) {\r
193             if(s.length()!=0) {\r
194                 str.append(s);\r
195                 lastCC=0;\r
196                 reorderStart=str.length();\r
197             }\r
198             return this;\r
199         }\r
200         // @Override when we switch to Java 6\r
201         public ReorderingBuffer append(CharSequence s, int start, int limit) {\r
202             if(start!=limit) {\r
203                 str.append(s, start, limit);\r
204                 lastCC=0;\r
205                 reorderStart=str.length();\r
206             }\r
207             return this;\r
208         }\r
209         /**\r
210          * Flushes from the intermediate StringBuilder to the Appendable,\r
211          * if they are different objects.\r
212          * Used after recomposition.\r
213          * Must be called at the end when writing to a non-StringBuilder Appendable.\r
214          */\r
215         public void flush() {\r
216             if(appIsStringBuilder) {\r
217                 reorderStart=str.length();\r
218             } else {\r
219                 try {\r
220                     app.append(str);\r
221                     str.setLength(0);\r
222                     reorderStart=0;\r
223                 } catch(IOException e) {\r
224                     throw new RuntimeException(e);  // Avoid declaring "throws IOException".\r
225                 }\r
226             }\r
227             lastCC=0;\r
228         }\r
229         /**\r
230          * Flushes from the intermediate StringBuilder to the Appendable,\r
231          * if they are different objects.\r
232          * Then appends the new text to the Appendable or StringBuilder.\r
233          * Normally used after quick check loops find a non-empty sequence.\r
234          */\r
235         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {\r
236             if(appIsStringBuilder) {\r
237                 str.append(s, start, limit);\r
238                 reorderStart=str.length();\r
239             } else {\r
240                 try {\r
241                     app.append(str).append(s, start, limit);\r
242                     str.setLength(0);\r
243                     reorderStart=0;\r
244                 } catch(IOException e) {\r
245                     throw new RuntimeException(e);  // Avoid declaring "throws IOException".\r
246                 }\r
247             }\r
248             lastCC=0;\r
249             return this;\r
250         }\r
251         public void remove() {\r
252             str.setLength(0);\r
253             lastCC=0;\r
254             reorderStart=0;\r
255         }\r
256         public void removeSuffix(int suffixLength) {\r
257             int oldLength=str.length();\r
258             str.delete(oldLength-suffixLength, oldLength);\r
259             lastCC=0;\r
260             reorderStart=str.length();\r
261         }\r
262 \r
263         /*\r
264          * TODO: Revisit whether it makes sense to track reorderStart.\r
265          * It is set to after the last known character with cc<=1,\r
266          * which stops previousCC() before it reads that character and looks up its cc.\r
267          * previousCC() is normally only called from insert().\r
268          * In other words, reorderStart speeds up the insertion of a combining mark\r
269          * into a multi-combining mark sequence where it does not belong at the end.\r
270          * This might not be worth the trouble.\r
271          * On the other hand, it's not a huge amount of trouble.\r
272          *\r
273          * We probably need it for UNORM_SIMPLE_APPEND.\r
274          */\r
275 \r
276         // Inserts c somewhere before the last character.\r
277         // Requires 0<cc<lastCC which implies reorderStart<limit.\r
278         private void insert(int c, int cc) {\r
279             for(setIterator(), skipPrevious(); previousCC()>cc;) {}\r
280             // insert c at codePointLimit, after the character with prevCC<=cc\r
281             if(c<=0xffff) {\r
282                 str.insert(codePointLimit, (char)c);\r
283                 if(cc<=1) {\r
284                     reorderStart=codePointLimit+1;\r
285                 }\r
286             } else {\r
287                 str.insert(codePointLimit, Character.toChars(c));\r
288                 if(cc<=1) {\r
289                     reorderStart=codePointLimit+2;\r
290                 }\r
291             }\r
292         }\r
293 \r
294         private final Normalizer2Impl impl;\r
295         private final Appendable app;\r
296         private final StringBuilder str;\r
297         private final boolean appIsStringBuilder;\r
298         private int reorderStart;\r
299         private int lastCC;\r
300 \r
301         // private backward iterator\r
302         private void setIterator() { codePointStart=str.length(); }\r
303         private void skipPrevious() {  // Requires 0<codePointStart.\r
304             codePointLimit=codePointStart;\r
305             codePointStart=str.offsetByCodePoints(codePointStart, -1);\r
306         }\r
307         private int previousCC() {  // Returns 0 if there is no previous character.\r
308             codePointLimit=codePointStart;\r
309             if(reorderStart>=codePointStart) {\r
310                 return 0;\r
311             }\r
312             int c=str.codePointBefore(codePointStart);\r
313             codePointStart-=Character.charCount(c);\r
314             if(c<MIN_CCC_LCCC_CP) {\r
315                 return 0;\r
316             }\r
317             return getCCFromYesOrMaybe(impl.getNorm16(c));\r
318         }\r
319 \r
320         private int codePointStart, codePointLimit;\r
321     }\r
322 \r
323     // TODO: Propose as public API on the UTF16 class.\r
324     // TODO: Propose widening UTF16 methods that take char to take int.\r
325     // TODO: Propose widening UTF16 methods that take String to take CharSequence.\r
326     public static final class UTF16Plus {\r
327         /**\r
328          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),\r
329          * is it a lead surrogate?\r
330          * @param c code unit or code point\r
331          * @return true or false\r
332          * @draft ICU 4.6\r
333          */\r
334         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }\r
335         /**\r
336          * Compares two CharSequence objects for binary equality.\r
337          * @param s1 first sequence\r
338          * @param s2 second sequence\r
339          * @return true if s1 contains the same text as s2\r
340          * @draft ICU 4.6\r
341          */\r
342         public static boolean equal(CharSequence s1,  CharSequence s2) {\r
343             int length=s1.length();\r
344             if(length!=s2.length()) {\r
345                 return false;\r
346             }\r
347             for(int i=0; i<length; ++i) {\r
348                 if(s1.charAt(i)!=s2.charAt(i)) {\r
349                     return false;\r
350                 }\r
351             }\r
352             return true;\r
353         }\r
354         /**\r
355          * Compares two CharSequence subsequences for binary equality.\r
356          * @param s1 first sequence\r
357          * @param start1 start offset in first sequence\r
358          * @param limit1 limit offset in first sequence\r
359          * @param s2 second sequence\r
360          * @param start2 start offset in second sequence\r
361          * @param limit2 limit offset in second sequence\r
362          * @return true if s1.subSequence(start1, limit1) contains the same text\r
363          *              as s2.subSequence(start2, limit2)\r
364          * @draft ICU 4.6\r
365          */\r
366         public static boolean equal(CharSequence s1, int start1, int limit1,\r
367                                     CharSequence s2, int start2, int limit2) {\r
368             if((limit1-start1)!=(limit2-start2)) {\r
369                 return false;\r
370             }\r
371             while(start1<limit1) {\r
372                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {\r
373                     return false;\r
374                 }\r
375             }\r
376             return true;\r
377         }\r
378     }\r
379 \r
380     public Normalizer2Impl() {}\r
381 \r
382     private static final class Reader implements ICUBinary.Authenticate {\r
383         // @Override when we switch to Java 6\r
384         public boolean isDataVersionAcceptable(byte version[]) {\r
385             return version[0]==1;\r
386         }\r
387         public VersionInfo readHeader(InputStream data) throws IOException {\r
388             byte[] dataVersion=ICUBinary.readHeader(data, DATA_FORMAT, this);\r
389             return VersionInfo.getInstance(dataVersion[0], dataVersion[1],\r
390                                            dataVersion[2], dataVersion[3]);\r
391         }\r
392         private static final byte DATA_FORMAT[] = { 0x4e, 0x72, 0x6d, 0x32  };  // "Nrm2"\r
393     }\r
394     private static final Reader READER=new Reader();\r
395     public Normalizer2Impl load(InputStream data) {\r
396         try {\r
397             BufferedInputStream bis=new BufferedInputStream(data);\r
398             dataVersion=READER.readHeader(bis);\r
399             DataInputStream ds=new DataInputStream(bis);\r
400             int indexesLength=ds.readInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4\r
401             if(indexesLength<=IX_MIN_MAYBE_YES) {\r
402                 throw new IOException("Normalizer2 data: not enough indexes");\r
403             }\r
404             int[] inIndexes=new int[indexesLength];\r
405             inIndexes[0]=indexesLength*4;\r
406             for(int i=1; i<indexesLength; ++i) {\r
407                 inIndexes[i]=ds.readInt();\r
408             }\r
409     \r
410             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];\r
411             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];\r
412     \r
413             minYesNo=inIndexes[IX_MIN_YES_NO];\r
414             minNoNo=inIndexes[IX_MIN_NO_NO];\r
415             limitNoNo=inIndexes[IX_LIMIT_NO_NO];\r
416             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];\r
417     \r
418             // Read the normTrie.\r
419             int offset=inIndexes[IX_NORM_TRIE_OFFSET];\r
420             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];\r
421             normTrie=Trie2_16.createFromSerialized(ds);\r
422             int trieLength=normTrie.getSerializedLength();\r
423             if(trieLength>(nextOffset-offset)) {\r
424                 throw new IOException("Normalizer2 data: not enough bytes for normTrie");\r
425             }\r
426             ds.skipBytes((nextOffset-offset)-trieLength);  // skip padding after trie bytes\r
427     \r
428             // Read the composition and mapping data.\r
429             offset=nextOffset;\r
430             nextOffset=inIndexes[IX_RESERVED2_OFFSET];\r
431             int numChars=(nextOffset-offset)/2;\r
432             char[] chars;\r
433             if(numChars!=0) {\r
434                 chars=new char[numChars];\r
435                 for(int i=0; i<numChars; ++i) {\r
436                     chars[i]=ds.readChar();\r
437                 }\r
438                 maybeYesCompositions=new String(chars);\r
439                 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);\r
440             }\r
441             data.close();\r
442             return this;\r
443         } catch(IOException e) {\r
444             throw new RuntimeException(e);\r
445         }\r
446     }\r
447     public Normalizer2Impl load(String name) {\r
448         return load(ICUData.getRequiredStream(name));\r
449     }\r
450 \r
451     public void addPropertyStarts(UnicodeSet set) {\r
452         /* add the start code point of each same-value range of each trie */\r
453         Iterator<Trie2.Range> trieIterator=normTrie.iterator();\r
454         Trie2.Range range;\r
455         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {\r
456             /* add the start code point to the USet */\r
457             set.add(range.startCodePoint);\r
458         }\r
459 \r
460         /* add Hangul LV syllables and LV+1 because of skippables */\r
461         for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {\r
462             set.add(c);\r
463             set.add(c+1);\r
464         }\r
465         set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */\r
466     }\r
467 \r
468     public void addCanonIterPropertyStarts(UnicodeSet set) {\r
469         /* add the start code point of each same-value range of the canonical iterator data trie */\r
470         ensureCanonIterData();\r
471         // currently only used for the SEGMENT_STARTER property\r
472         Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);\r
473         Trie2.Range range;\r
474         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {\r
475             /* add the start code point to the USet */\r
476             set.add(range.startCodePoint);\r
477         }\r
478     }\r
479     private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {\r
480         public int map(int in) {\r
481             return in&CANON_NOT_SEGMENT_STARTER;\r
482         }\r
483     };\r
484 \r
485     // low-level properties ------------------------------------------------ ***\r
486 \r
487     public Trie2_16 getNormTrie() { return normTrie; }\r
488     public synchronized Trie2_16 getFCDTrie() {\r
489         if(fcdTrie!=null) {\r
490             return fcdTrie;\r
491         }\r
492         Trie2Writable newFCDTrie=new Trie2Writable(0, 0);\r
493         Iterator<Trie2.Range> trieIterator=normTrie.iterator();\r
494         Trie2.Range range;\r
495         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {\r
496             // Set the FCD value for a range of same-norm16 characters.\r
497             if(range.value!=0) {\r
498                 setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie);\r
499             }\r
500         }\r
501         for(char lead=0xd800; lead<0xdc00; ++lead) {\r
502             // Collect (OR together) the FCD values for a range of supplementary characters,\r
503             // for their lead surrogate code unit.\r
504             int oredValue=newFCDTrie.get(lead);\r
505             trieIterator=normTrie.iteratorForLeadSurrogate(lead);\r
506             while(trieIterator.hasNext()) {\r
507                 oredValue|=trieIterator.next().value;\r
508             }\r
509             if(oredValue!=0) {\r
510                 // Set a "bad" value for makeFCD() to break the quick check loop\r
511                 // and look up the value for the supplementary code point.\r
512                 // If there is any lccc, then set the worst-case lccc of 1.\r
513                 // The ORed-together value's tccc is already the worst case.\r
514                 if(oredValue>0xff) {\r
515                     oredValue=0x100|(oredValue&0xff);\r
516                 }\r
517                 newFCDTrie.setForLeadSurrogateCodeUnit(lead, oredValue);\r
518             }\r
519         }\r
520         return fcdTrie=newFCDTrie.toTrie2_16();\r
521     }\r
522 \r
523     public synchronized Normalizer2Impl ensureCanonIterData() {\r
524         if(canonIterData==null) {\r
525             Trie2Writable newData=new Trie2Writable(0, 0);\r
526             canonStartSets=new ArrayList<UnicodeSet>();\r
527             Iterator<Trie2.Range> trieIterator=normTrie.iterator();\r
528             Trie2.Range range;\r
529             while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {\r
530                 final int norm16=range.value;\r
531                 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {\r
532                     // Inert, or 2-way mapping (including Hangul syllable).\r
533                     // We do not write a canonStartSet for any yesNo character.\r
534                     // Composites from 2-way mappings are added at runtime from the\r
535                     // starter's compositions list, and the other characters in\r
536                     // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are\r
537                     // "maybe" characters.\r
538                     continue;\r
539                 }\r
540                 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {\r
541                     final int oldValue=newData.get(c);\r
542                     int newValue=oldValue;\r
543                     if(norm16>=minMaybeYes) {\r
544                         // not a segment starter if it occurs in a decomposition or has cc!=0\r
545                         newValue|=CANON_NOT_SEGMENT_STARTER;\r
546                         if(norm16<MIN_NORMAL_MAYBE_YES) {\r
547                             newValue|=CANON_HAS_COMPOSITIONS;\r
548                         }\r
549                     } else if(norm16<minYesNo) {\r
550                         newValue|=CANON_HAS_COMPOSITIONS;\r
551                     } else {\r
552                         // c has a one-way decomposition\r
553                         int c2=c;\r
554                         int norm16_2=norm16;\r
555                         while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {\r
556                             c2=this.mapAlgorithmic(c2, norm16_2);\r
557                             norm16_2=getNorm16(c2);\r
558                         }\r
559                         if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {\r
560                             // c decomposes, get everything from the variable-length extra data\r
561                             int firstUnit=extraData.charAt(norm16_2++);\r
562                             int length=firstUnit&MAPPING_LENGTH_MASK;\r
563                             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {\r
564                                 if(c==c2 && (extraData.charAt(norm16_2)&0xff)!=0) {\r
565                                     newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0\r
566                                 }\r
567                                 ++norm16_2;\r
568                             }\r
569                             // Skip empty mappings (no characters in the decomposition).\r
570                             if(length!=0) {\r
571                                 // add c to first code point's start set\r
572                                 int limit=norm16_2+length;\r
573                                 c2=extraData.codePointAt(norm16_2);\r
574                                 addToStartSet(newData, c, c2);\r
575                                 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a\r
576                                 // one-way mapping. A 2-way mapping is possible here after\r
577                                 // intermediate algorithmic mapping.\r
578                                 if(norm16_2>=minNoNo) {\r
579                                     while((norm16_2+=Character.charCount(c2))<limit) {\r
580                                         c2=extraData.codePointAt(norm16_2);\r
581                                         int c2Value=newData.get(c2);\r
582                                         if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {\r
583                                             newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);\r
584                                         }\r
585                                     }\r
586                                 }\r
587                             }\r
588                         } else {\r
589                             // c decomposed to c2 algorithmically; c has cc==0\r
590                             addToStartSet(newData, c, c2);\r
591                         }\r
592                     }\r
593                     if(newValue!=oldValue) {\r
594                         newData.set(c, newValue);\r
595                     }\r
596                 }\r
597             }\r
598             canonIterData=newData.toTrie2_32();\r
599         }\r
600         return this;\r
601     }\r
602 \r
603     public int getNorm16(int c) { return normTrie.get(c); }\r
604 \r
605     public int getCompQuickCheck(int norm16) {\r
606         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {\r
607             return 1;  // yes\r
608         } else if(minMaybeYes<=norm16) {\r
609             return 2;  // maybe\r
610         } else {\r
611             return 0;  // no\r
612         }\r
613     }\r
614     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }\r
615     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }\r
616 \r
617     public int getCC(int norm16) {\r
618         if(norm16>=MIN_NORMAL_MAYBE_YES) {\r
619             return norm16&0xff;\r
620         }\r
621         if(norm16<minNoNo || limitNoNo<=norm16) {\r
622             return 0;\r
623         }\r
624         return getCCFromNoNo(norm16);\r
625     }\r
626     public static int getCCFromYesOrMaybe(int norm16) {\r
627         return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;\r
628     }\r
629 \r
630     public int getFCD16(int c) { return fcdTrie.get(c); }\r
631     public int getFCD16FromSingleLead(char c) { return fcdTrie.getFromU16SingleLead(c); }\r
632 \r
633     void setFCD16FromNorm16(int start, int end, int norm16, Trie2Writable newFCDTrie) {\r
634         // Only loops for 1:1 algorithmic mappings.\r
635         for(;;) {\r
636             if(norm16>=MIN_NORMAL_MAYBE_YES) {\r
637                 norm16&=0xff;\r
638                 norm16|=norm16<<8;\r
639             } else if(norm16<=minYesNo || minMaybeYes<=norm16) {\r
640                 // no decomposition or Hangul syllable, all zeros\r
641                 break;\r
642             } else if(limitNoNo<=norm16) {\r
643                 int delta=norm16-(minMaybeYes-MAX_DELTA-1);\r
644                 if(start==end) {\r
645                     start+=delta;\r
646                     norm16=getNorm16(start);\r
647                 } else {\r
648                     // the same delta leads from different original characters to different mappings\r
649                     do {\r
650                         int c=start+delta;\r
651                         setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie);\r
652                     } while(++start<=end);\r
653                     break;\r
654                 }\r
655             } else {\r
656                 // c decomposes, get everything from the variable-length extra data\r
657                 int firstUnit=extraData.charAt(norm16);\r
658                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {\r
659                     // A character that is deleted (maps to an empty string) must\r
660                     // get the worst-case lccc and tccc values because arbitrary\r
661                     // characters on both sides will become adjacent.\r
662                     norm16=0x1ff;\r
663                 } else {\r
664                     if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {\r
665                         norm16=extraData.charAt(norm16+1)&0xff00;  // lccc\r
666                     } else {\r
667                         norm16=0;\r
668                     }\r
669                     norm16|=firstUnit>>8;  // tccc\r
670                 }\r
671             }\r
672             newFCDTrie.setRange(start, end, norm16, true);\r
673             break;\r
674         }\r
675     }\r
676 \r
677     /**\r
678      * Get the decomposition for one code point.\r
679      * @param c code point\r
680      * @return c's decomposition, if it has one; returns null if it does not have a decomposition\r
681      */\r
682     public String getDecomposition(int c) {\r
683         int decomp=-1;\r
684         int norm16;\r
685         for(;;) {\r
686             if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {\r
687                 // c does not decompose\r
688             } else if(isHangul(norm16)) {\r
689                 // Hangul syllable: decompose algorithmically\r
690                 StringBuilder buffer=new StringBuilder();\r
691                 Hangul.decompose(c, buffer);\r
692                 return buffer.toString();\r
693             } else if(isDecompNoAlgorithmic(norm16)) {\r
694                 decomp=c=mapAlgorithmic(c, norm16);\r
695                 continue;\r
696             } else {\r
697                 // c decomposes, get everything from the variable-length extra data\r
698                 int firstUnit=extraData.charAt(norm16++);\r
699                 int length=firstUnit&MAPPING_LENGTH_MASK;\r
700                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {\r
701                     ++norm16;\r
702                 }\r
703                 return extraData.substring(norm16, norm16+length);\r
704             }\r
705             if(decomp<0) {\r
706                 return null;\r
707             } else {\r
708                 return UTF16.valueOf(decomp);\r
709             }\r
710         }\r
711     }\r
712 \r
713     public boolean isCanonSegmentStarter(int c) {\r
714         return canonIterData.get(c)>=0;\r
715     }\r
716     public boolean getCanonStartSet(int c, UnicodeSet set) {\r
717         int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;\r
718         if(canonValue==0) {\r
719             return false;\r
720         }\r
721         set.clear();\r
722         int value=canonValue&CANON_VALUE_MASK;\r
723         if((canonValue&CANON_HAS_SET)!=0) {\r
724             set.addAll(canonStartSets.get(value));\r
725         } else if(value!=0) {\r
726             set.add(value);\r
727         }\r
728         if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {\r
729             int norm16=getNorm16(c);\r
730             if(norm16==JAMO_L) {\r
731                 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;\r
732                 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);\r
733             } else {\r
734                 addComposites(getCompositionsList(norm16), set);\r
735             }\r
736         }\r
737         return true;\r
738     }\r
739 \r
740     public static final int MIN_CCC_LCCC_CP=0x300;\r
741 \r
742     public static final int MIN_YES_YES_WITH_CC=0xff01;\r
743     public static final int JAMO_VT=0xff00;\r
744     public static final int MIN_NORMAL_MAYBE_YES=0xfe00;\r
745     public static final int JAMO_L=1;\r
746     public static final int MAX_DELTA=0x40;\r
747 \r
748     // Byte offsets from the start of the data, after the generic header.\r
749     public static final int IX_NORM_TRIE_OFFSET=0;\r
750     public static final int IX_EXTRA_DATA_OFFSET=1;\r
751     public static final int IX_RESERVED2_OFFSET=2;\r
752     public static final int IX_TOTAL_SIZE=7;\r
753 \r
754     // Code point thresholds for quick check codes.\r
755     public static final int IX_MIN_DECOMP_NO_CP=8;\r
756     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;\r
757 \r
758     // Norm16 value thresholds for quick check combinations and types of extra data.\r
759     public static final int IX_MIN_YES_NO=10;\r
760     public static final int IX_MIN_NO_NO=11;\r
761     public static final int IX_LIMIT_NO_NO=12;\r
762     public static final int IX_MIN_MAYBE_YES=13;\r
763 \r
764     public static final int IX_COUNT=16;\r
765 \r
766     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;\r
767     public static final int MAPPING_PLUS_COMPOSITION_LIST=0x40;\r
768     public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;\r
769     public static final int MAPPING_LENGTH_MASK=0x1f;\r
770 \r
771     public static final int COMP_1_LAST_TUPLE=0x8000;\r
772     public static final int COMP_1_TRIPLE=1;\r
773     public static final int COMP_1_TRAIL_LIMIT=0x3400;\r
774     public static final int COMP_1_TRAIL_MASK=0x7ffe;\r
775     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit\r
776     public static final int COMP_2_TRAIL_SHIFT=6;\r
777     public static final int COMP_2_TRAIL_MASK=0xffc0;\r
778 \r
779     // higher-level functionality ------------------------------------------ ***\r
780 \r
781     // Dual functionality:\r
782     // buffer!=NULL: normalize\r
783     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes\r
784     public int decompose(CharSequence s, int src, int limit,\r
785                          ReorderingBuffer buffer) {\r
786         int minNoCP=minDecompNoCP;\r
787 \r
788         int prevSrc;\r
789         int c=0;\r
790         int norm16=0;\r
791 \r
792         // only for quick check\r
793         int prevBoundary=src;\r
794         int prevCC=0;\r
795 \r
796         for(;;) {\r
797             // count code units below the minimum or with irrelevant data for the quick check\r
798             for(prevSrc=src; src!=limit;) {\r
799                 if( (c=s.charAt(src))<minNoCP ||\r
800                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))\r
801                 ) {\r
802                     ++src;\r
803                 } else if(!UTF16.isSurrogate((char)c)) {\r
804                     break;\r
805                 } else {\r
806                     char c2;\r
807                     if(UTF16Plus.isSurrogateLead(c)) {\r
808                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {\r
809                             c=Character.toCodePoint((char)c, c2);\r
810                         }\r
811                     } else /* trail surrogate */ {\r
812                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {\r
813                             --src;\r
814                             c=Character.toCodePoint(c2, (char)c);\r
815                         }\r
816                     }\r
817                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {\r
818                         src+=Character.charCount(c);\r
819                     } else {\r
820                         break;\r
821                     }\r
822                 }\r
823             }\r
824             // copy these code units all at once\r
825             if(src!=prevSrc) {\r
826                 if(buffer!=null) {\r
827                     buffer.flushAndAppendZeroCC(s, prevSrc, src);\r
828                 } else {\r
829                     prevCC=0;\r
830                     prevBoundary=src;\r
831                 }\r
832             }\r
833             if(src==limit) {\r
834                 break;\r
835             }\r
836 \r
837             // Check one above-minimum, relevant code point.\r
838             src+=Character.charCount(c);\r
839             if(buffer!=null) {\r
840                 decompose(c, norm16, buffer);\r
841             } else {\r
842                 if(isDecompYes(norm16)) {\r
843                     int cc=getCCFromYesOrMaybe(norm16);\r
844                     if(prevCC<=cc || cc==0) {\r
845                         prevCC=cc;\r
846                         if(cc<=1) {\r
847                             prevBoundary=src;\r
848                         }\r
849                         continue;\r
850                     }\r
851                 }\r
852                 return prevBoundary;  // "no" or cc out of order\r
853             }\r
854         }\r
855         return src;\r
856     }\r
857     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {\r
858         int limit=s.length();\r
859         if(limit==0) {\r
860             return;\r
861         }\r
862         if(doDecompose) {\r
863             decompose(s, 0, limit, buffer);\r
864             return;\r
865         }\r
866         // Just merge the strings at the boundary.\r
867         int c=Character.codePointAt(s, 0);\r
868         int src=0;\r
869         int firstCC, prevCC, cc;\r
870         firstCC=prevCC=cc=getCC(getNorm16(c));\r
871         while(cc!=0) {\r
872             prevCC=cc;\r
873             src+=Character.charCount(c);\r
874             if(src>=limit) {\r
875                 break;\r
876             }\r
877             c=Character.codePointAt(s, src);\r
878             cc=getCC(getNorm16(c));\r
879         };\r
880         buffer.append(s, 0, src, firstCC, prevCC);\r
881         buffer.append(s, src, limit);\r
882     }\r
883     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.\r
884     // doCompose: normalize\r
885     // !doCompose: isNormalized (buffer must be empty and initialized)\r
886     public boolean compose(CharSequence s, int src, int limit,\r
887                            boolean onlyContiguous,\r
888                            boolean doCompose,\r
889                            ReorderingBuffer buffer) {\r
890         int minNoMaybeCP=minCompNoMaybeCP;\r
891 \r
892         /*\r
893          * prevBoundary points to the last character before the current one\r
894          * that has a composition boundary before it with ccc==0 and quick check "yes".\r
895          * Keeping track of prevBoundary saves us looking for a composition boundary\r
896          * when we find a "no" or "maybe".\r
897          *\r
898          * When we back out from prevSrc back to prevBoundary,\r
899          * then we also remove those same characters (which had been simply copied\r
900          * or canonically-order-inserted) from the ReorderingBuffer.\r
901          * Therefore, at all times, the [prevBoundary..prevSrc[ source units\r
902          * must correspond 1:1 to destination units at the end of the destination buffer.\r
903          */\r
904         int prevBoundary=src;\r
905         int prevSrc;\r
906         int c=0;\r
907         int norm16=0;\r
908 \r
909         // only for isNormalized\r
910         int prevCC=0;\r
911 \r
912         for(;;) {\r
913             // count code units below the minimum or with irrelevant data for the quick check\r
914             for(prevSrc=src; src!=limit;) {\r
915                 if( (c=s.charAt(src))<minNoMaybeCP ||\r
916                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))\r
917                 ) {\r
918                     ++src;\r
919                 } else if(!UTF16.isSurrogate((char)c)) {\r
920                     break;\r
921                 } else {\r
922                     char c2;\r
923                     if(UTF16Plus.isSurrogateLead(c)) {\r
924                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {\r
925                             c=Character.toCodePoint((char)c, c2);\r
926                         }\r
927                     } else /* trail surrogate */ {\r
928                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {\r
929                             --src;\r
930                             c=Character.toCodePoint(c2, (char)c);\r
931                         }\r
932                     }\r
933                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {\r
934                         src+=Character.charCount(c);\r
935                     } else {\r
936                         break;\r
937                     }\r
938                 }\r
939             }\r
940             // copy these code units all at once\r
941             if(src!=prevSrc) {\r
942                 if(src==limit) {\r
943                     if(doCompose) {\r
944                         buffer.flushAndAppendZeroCC(s, prevSrc, src);\r
945                     }\r
946                     break;\r
947                 }\r
948                 // Set prevBoundary to the last character in the quick check loop.\r
949                 prevBoundary=src-1;\r
950                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&\r
951                     Character.isHighSurrogate(s.charAt(prevBoundary-1))\r
952                 ) {\r
953                     --prevBoundary;\r
954                 }\r
955                 if(doCompose) {\r
956                     // The last "quick check yes" character is excluded from the\r
957                     // flush-and-append call in case it needs to be modified.\r
958                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);\r
959                     buffer.append(s, prevBoundary, src);\r
960                 } else {\r
961                     prevCC=0;\r
962                 }\r
963                 // The start of the current character (c).\r
964                 prevSrc=src;\r
965             } else if(src==limit) {\r
966                 break;\r
967             }\r
968 \r
969             src+=Character.charCount(c);\r
970             /*\r
971              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.\r
972              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)\r
973              * or has ccc!=0.\r
974              * Check for Jamo V/T, then for regular characters.\r
975              * c is not a Hangul syllable or Jamo L because those have "yes" properties.\r
976              */\r
977             if(isJamoVT(norm16) && prevBoundary!=prevSrc) {\r
978                 char prev=s.charAt(prevSrc-1);\r
979                 boolean needToDecompose=false;\r
980                 if(c<Hangul.JAMO_T_BASE) {\r
981                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.\r
982                     prev-=Hangul.JAMO_L_BASE;\r
983                     if(prev<Hangul.JAMO_L_COUNT) {\r
984                         if(!doCompose) {\r
985                             return false;\r
986                         }\r
987                         char syllable=(char)\r
988                             (Hangul.HANGUL_BASE+\r
989                              (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*\r
990                              Hangul.JAMO_T_COUNT);\r
991                         char t;\r
992                         if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {\r
993                             ++src;\r
994                             syllable+=t;  // The next character was a Jamo T.\r
995                             prevBoundary=src;\r
996                             buffer.setLastChar(syllable);\r
997                             continue;\r
998                         }\r
999                         // If we see L+V+x where x!=T then we drop to the slow path,\r
1000                         // decompose and recompose.\r
1001                         // This is to deal with NFKC finding normal L and V but a\r
1002                         // compatibility variant of a T. We need to either fully compose that\r
1003                         // combination here (which would complicate the code and may not work\r
1004                         // with strange custom data) or use the slow path -- or else our replacing\r
1005                         // two input characters (L+V) with one output character (LV syllable)\r
1006                         // would violate the invariant that [prevBoundary..prevSrc[ has the same\r
1007                         // length as what we appended to the buffer since prevBoundary.\r
1008                         needToDecompose=true;\r
1009                     }\r
1010                 } else if(Hangul.isHangulWithoutJamoT(prev)) {\r
1011                     // c is a Jamo Trailing consonant,\r
1012                     // compose with previous Hangul LV that does not contain a Jamo T.\r
1013                     if(!doCompose) {\r
1014                         return false;\r
1015                     }\r
1016                     buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));\r
1017                     prevBoundary=src;\r
1018                     continue;\r
1019                 }\r
1020                 if(!needToDecompose) {\r
1021                     // The Jamo V/T did not compose into a Hangul syllable.\r
1022                     if(doCompose) {\r
1023                         buffer.append((char)c);\r
1024                     } else {\r
1025                         prevCC=0;\r
1026                     }\r
1027                     continue;\r
1028                 }\r
1029             }\r
1030             /*\r
1031              * Source buffer pointers:\r
1032              *\r
1033              *  all done      quick check   current char  not yet\r
1034              *                "yes" but     (c)           processed\r
1035              *                may combine\r
1036              *                forward\r
1037              * [-------------[-------------[-------------[-------------[\r
1038              * |             |             |             |             |\r
1039              * orig. src     prevBoundary  prevSrc       src           limit\r
1040              *\r
1041              *\r
1042              * Destination buffer pointers inside the ReorderingBuffer:\r
1043              *\r
1044              *  all done      might take    not filled yet\r
1045              *                characters for\r
1046              *                reordering\r
1047              * [-------------[-------------[-------------[\r
1048              * |             |             |             |\r
1049              * start         reorderStart  limit         |\r
1050              *                             +remainingCap.+\r
1051              */\r
1052             if(norm16>=MIN_YES_YES_WITH_CC) {\r
1053                 int cc=norm16&0xff;  // cc!=0\r
1054                 if( onlyContiguous &&  // FCC\r
1055                     (doCompose ? buffer.getLastCC() : prevCC)==0 &&\r
1056                     prevBoundary<prevSrc &&\r
1057                     // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that\r
1058                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)\r
1059                     // passed the quick check "yes && ccc==0" test.\r
1060                     // Check whether the last character was a "yesYes" or a "yesNo".\r
1061                     // If a "yesNo", then we get its trailing ccc from its\r
1062                     // mapping and check for canonical order.\r
1063                     // All other cases are ok.\r
1064                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc\r
1065                 ) {\r
1066                     // Fails FCD test, need to decompose and contiguously recompose.\r
1067                     if(!doCompose) {\r
1068                         return false;\r
1069                     }\r
1070                 } else if(doCompose) {\r
1071                     buffer.append(c, cc);\r
1072                     continue;\r
1073                 } else if(prevCC<=cc) {\r
1074                     prevCC=cc;\r
1075                     continue;\r
1076                 } else {\r
1077                     return false;\r
1078                 }\r
1079             } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {\r
1080                 return false;\r
1081             }\r
1082 \r
1083             /*\r
1084              * Find appropriate boundaries around this character,\r
1085              * decompose the source text from between the boundaries,\r
1086              * and recompose it.\r
1087              *\r
1088              * We may need to remove the last few characters from the ReorderingBuffer\r
1089              * to account for source text that was copied or appended\r
1090              * but needs to take part in the recomposition.\r
1091              */\r
1092 \r
1093             /*\r
1094              * Find the last composition boundary in [prevBoundary..src[.\r
1095              * It is either the decomposition of the current character (at prevSrc),\r
1096              * or prevBoundary.\r
1097              */\r
1098             if(hasCompBoundaryBefore(c, norm16)) {\r
1099                 prevBoundary=prevSrc;\r
1100             } else if(doCompose) {\r
1101                 buffer.removeSuffix(prevSrc-prevBoundary);\r
1102             }\r
1103 \r
1104             // Find the next composition boundary in [src..limit[ -\r
1105             // modifies src to point to the next starter.\r
1106             src=findNextCompBoundary(s, src, limit);\r
1107 \r
1108             // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.\r
1109             int recomposeStartIndex=buffer.length();\r
1110             decomposeShort(s, prevBoundary, src, buffer);\r
1111             recompose(buffer, recomposeStartIndex, onlyContiguous);\r
1112             if(!doCompose) {\r
1113                 if(!buffer.equals(s, prevBoundary, src)) {\r
1114                     return false;\r
1115                 }\r
1116                 buffer.remove();\r
1117                 prevCC=0;\r
1118             }\r
1119 \r
1120             // Move to the next starter. We never need to look back before this point again.\r
1121             prevBoundary=src;\r
1122         }\r
1123         return true;\r
1124     }\r
1125     /**\r
1126      * Very similar to compose(): Make the same changes in both places if relevant.\r
1127      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)\r
1128      * !doSpan: quickCheck\r
1129      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and\r
1130      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()\r
1131      *         then the quick check result is "no"\r
1132      */\r
1133     public int composeQuickCheck(CharSequence s, int src, int limit,\r
1134                                  boolean onlyContiguous, boolean doSpan) {\r
1135         int qcResult=0;\r
1136         int minNoMaybeCP=minCompNoMaybeCP;\r
1137 \r
1138         /*\r
1139          * prevBoundary points to the last character before the current one\r
1140          * that has a composition boundary before it with ccc==0 and quick check "yes".\r
1141          */\r
1142         int prevBoundary=src;\r
1143         int prevSrc;\r
1144         int c=0;\r
1145         int norm16=0;\r
1146         int prevCC=0;\r
1147 \r
1148         for(;;) {\r
1149             // count code units below the minimum or with irrelevant data for the quick check\r
1150             for(prevSrc=src;;) {\r
1151                 if(src==limit) {\r
1152                     return (src<<1)|qcResult;  // "yes" or "maybe"\r
1153                 }\r
1154                 if( (c=s.charAt(src))<minNoMaybeCP ||\r
1155                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))\r
1156                 ) {\r
1157                     ++src;\r
1158                 } else if(!UTF16.isSurrogate((char)c)) {\r
1159                     break;\r
1160                 } else {\r
1161                     char c2;\r
1162                     if(UTF16Plus.isSurrogateLead(c)) {\r
1163                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {\r
1164                             c=Character.toCodePoint((char)c, c2);\r
1165                         }\r
1166                     } else /* trail surrogate */ {\r
1167                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {\r
1168                             --src;\r
1169                             c=Character.toCodePoint(c2, (char)c);\r
1170                         }\r
1171                     }\r
1172                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {\r
1173                         src+=Character.charCount(c);\r
1174                     } else {\r
1175                         break;\r
1176                     }\r
1177                 }\r
1178             }\r
1179             if(src!=prevSrc) {\r
1180                 // Set prevBoundary to the last character in the quick check loop.\r
1181                 prevBoundary=src-1;\r
1182                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&\r
1183                         Character.isHighSurrogate(s.charAt(prevBoundary-1))\r
1184                 ) {\r
1185                     --prevBoundary;\r
1186                 }\r
1187                 prevCC=0;\r
1188                 // The start of the current character (c).\r
1189                 prevSrc=src;\r
1190             }\r
1191 \r
1192             src+=Character.charCount(c);\r
1193             /*\r
1194              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.\r
1195              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)\r
1196              * or has ccc!=0.\r
1197              */\r
1198             if(isMaybeOrNonZeroCC(norm16)) {\r
1199                 int cc=getCCFromYesOrMaybe(norm16);\r
1200                 if( onlyContiguous &&  // FCC\r
1201                     cc!=0 &&\r
1202                     prevCC==0 &&\r
1203                     prevBoundary<prevSrc &&\r
1204                     // prevCC==0 && prevBoundary<prevSrc tell us that\r
1205                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)\r
1206                     // passed the quick check "yes && ccc==0" test.\r
1207                     // Check whether the last character was a "yesYes" or a "yesNo".\r
1208                     // If a "yesNo", then we get its trailing ccc from its\r
1209                     // mapping and check for canonical order.\r
1210                     // All other cases are ok.\r
1211                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc\r
1212                 ) {\r
1213                     // Fails FCD test.\r
1214                 } else if(prevCC<=cc || cc==0) {\r
1215                     prevCC=cc;\r
1216                     if(norm16<MIN_YES_YES_WITH_CC) {\r
1217                         if(!doSpan) {\r
1218                             qcResult=1;\r
1219                         } else {\r
1220                             return prevBoundary<<1;  // spanYes does not care to know it's "maybe"\r
1221                         }\r
1222                     }\r
1223                     continue;\r
1224                 }\r
1225             }\r
1226             return prevBoundary<<1;  // "no"\r
1227         }\r
1228     }\r
1229     public void composeAndAppend(CharSequence s,\r
1230                                  boolean doCompose,\r
1231                                  boolean onlyContiguous,\r
1232                                  ReorderingBuffer buffer) {\r
1233         int src=0, limit=s.length();\r
1234         if(!buffer.isEmpty()) {\r
1235             int firstStarterInSrc=findNextCompBoundary(s, 0, limit);\r
1236             if(0!=firstStarterInSrc) {\r
1237                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),\r
1238                                                                buffer.length());\r
1239                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+\r
1240                                                        firstStarterInSrc+16);\r
1241                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());\r
1242                 buffer.removeSuffix(buffer.length()-lastStarterInDest);\r
1243                 middle.append(s, 0, firstStarterInSrc);\r
1244                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);\r
1245                 src=firstStarterInSrc;\r
1246             }\r
1247         }\r
1248         if(doCompose) {\r
1249             compose(s, src, limit, onlyContiguous, true, buffer);\r
1250         } else {\r
1251             buffer.append(s, src, limit);\r
1252         }\r
1253     }\r
1254     // Dual functionality:\r
1255     // buffer!=NULL: normalize\r
1256     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes\r
1257     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {\r
1258         // Note: In this function we use buffer->appendZeroCC() because we track\r
1259         // the lead and trail combining classes here, rather than leaving it to\r
1260         // the ReorderingBuffer.\r
1261         // The exception is the call to decomposeShort() which uses the buffer\r
1262         // in the normal way.\r
1263 \r
1264         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.\r
1265         // Similar to the prevBoundary in the compose() implementation.\r
1266         int prevBoundary=src;\r
1267         int prevSrc;\r
1268         int c=0;\r
1269         int prevFCD16=0;\r
1270         int fcd16=0;\r
1271 \r
1272         for(;;) {\r
1273             // count code units with lccc==0\r
1274             for(prevSrc=src; src!=limit;) {\r
1275                 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {\r
1276                     prevFCD16=~c;\r
1277                     ++src;\r
1278                 } else if((fcd16=fcdTrie.getFromU16SingleLead((char)c))<=0xff) {\r
1279                     prevFCD16=fcd16;\r
1280                     ++src;\r
1281                 } else if(!UTF16.isSurrogate((char)c)) {\r
1282                     break;\r
1283                 } else {\r
1284                     char c2;\r
1285                     if(UTF16Plus.isSurrogateLead(c)) {\r
1286                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {\r
1287                             c=Character.toCodePoint((char)c, c2);\r
1288                         }\r
1289                     } else /* trail surrogate */ {\r
1290                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {\r
1291                             --src;\r
1292                             c=Character.toCodePoint(c2, (char)c);\r
1293                         }\r
1294                     }\r
1295                     if((fcd16=getFCD16(c))<=0xff) {\r
1296                         prevFCD16=fcd16;\r
1297                         src+=Character.charCount(c);\r
1298                     } else {\r
1299                         break;\r
1300                     }\r
1301                 }\r
1302             }\r
1303             // copy these code units all at once\r
1304             if(src!=prevSrc) {\r
1305                 if(src==limit) {\r
1306                     if(buffer!=null) {\r
1307                         buffer.flushAndAppendZeroCC(s, prevSrc, src);\r
1308                     }\r
1309                     break;\r
1310                 }\r
1311                 prevBoundary=src;\r
1312                 // We know that the previous character's lccc==0.\r
1313                 if(prevFCD16<0) {\r
1314                     // Fetching the fcd16 value was deferred for this below-U+0300 code point.\r
1315                     prevFCD16=getFCD16FromSingleLead((char)~prevFCD16);\r
1316                     if(prevFCD16>1) {\r
1317                         --prevBoundary;\r
1318                     }\r
1319                 } else {\r
1320                     int p=src-1;\r
1321                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&\r
1322                         Character.isHighSurrogate(s.charAt(p-1))\r
1323                     ) {\r
1324                         --p;\r
1325                         // Need to fetch the previous character's FCD value because\r
1326                         // prevFCD16 was just for the trail surrogate code point.\r
1327                         prevFCD16=getFCD16(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));\r
1328                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.\r
1329                     }\r
1330                     if(prevFCD16>1) {\r
1331                         prevBoundary=p;\r
1332                     }\r
1333                 }\r
1334                 if(buffer!=null) {\r
1335                     // The last lccc==0 character is excluded from the\r
1336                     // flush-and-append call in case it needs to be modified.\r
1337                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);\r
1338                     buffer.append(s, prevBoundary, src);\r
1339                 }\r
1340                 // The start of the current character (c).\r
1341                 prevSrc=src;\r
1342             } else if(src==limit) {\r
1343                 break;\r
1344             }\r
1345 \r
1346             src+=Character.charCount(c);\r
1347             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.\r
1348             // Check for proper order, and decompose locally if necessary.\r
1349             if((prevFCD16&0xff)<=(fcd16>>8)) {\r
1350                 // proper order: prev tccc <= current lccc\r
1351                 if((fcd16&0xff)<=1) {\r
1352                     prevBoundary=src;\r
1353                 }\r
1354                 if(buffer!=null) {\r
1355                     buffer.appendZeroCC(c);\r
1356                 }\r
1357                 prevFCD16=fcd16;\r
1358                 continue;\r
1359             } else if(buffer==null) {\r
1360                 return prevBoundary;  // quick check "no"\r
1361             } else {\r
1362                 /*\r
1363                  * Back out the part of the source that we copied or appended\r
1364                  * already but is now going to be decomposed.\r
1365                  * prevSrc is set to after what was copied/appended.\r
1366                  */\r
1367                 buffer.removeSuffix(prevSrc-prevBoundary);\r
1368                 /*\r
1369                  * Find the part of the source that needs to be decomposed,\r
1370                  * up to the next safe boundary.\r
1371                  */\r
1372                 src=findNextFCDBoundary(s, src, limit);\r
1373                 /*\r
1374                  * The source text does not fulfill the conditions for FCD.\r
1375                  * Decompose and reorder a limited piece of the text.\r
1376                  */\r
1377                 decomposeShort(s, prevBoundary, src, buffer);\r
1378                 prevBoundary=src;\r
1379                 prevFCD16=0;\r
1380             }\r
1381         }\r
1382         return src;\r
1383     }\r
1384     public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {\r
1385         int src=0, limit=s.length();\r
1386         if(!buffer.isEmpty()) {\r
1387             int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);\r
1388             if(0!=firstBoundaryInSrc) {\r
1389                 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),\r
1390                                                                buffer.length());\r
1391                 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+\r
1392                                                        firstBoundaryInSrc+16);\r
1393                 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());\r
1394                 buffer.removeSuffix(buffer.length()-lastBoundaryInDest);\r
1395                 middle.append(s, 0, firstBoundaryInSrc);\r
1396                 makeFCD(middle, 0, middle.length(), buffer);\r
1397                 src=firstBoundaryInSrc;\r
1398             }\r
1399         }\r
1400         if(doMakeFCD) {\r
1401             makeFCD(s, src, limit, buffer);\r
1402         } else {\r
1403             buffer.append(s, src, limit);\r
1404         }\r
1405     }\r
1406 \r
1407     // Note: hasDecompBoundary() could be implemented as aliases to\r
1408     // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()\r
1409     // at the cost of building the FCD trie for a decomposition normalizer.\r
1410     public boolean hasDecompBoundary(int c, boolean before) {\r
1411         for(;;) {\r
1412             if(c<minDecompNoCP) {\r
1413                 return true;\r
1414             }\r
1415             int norm16=getNorm16(c);\r
1416             if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {\r
1417                 return true;\r
1418             } else if(norm16>MIN_NORMAL_MAYBE_YES) {\r
1419                 return false;  // ccc!=0\r
1420             } else if(isDecompNoAlgorithmic(norm16)) {\r
1421                 c=mapAlgorithmic(c, norm16);\r
1422             } else {\r
1423                 // c decomposes, get everything from the variable-length extra data\r
1424                 int firstUnit=extraData.charAt(norm16++);\r
1425                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {\r
1426                     return false;\r
1427                 }\r
1428                 if(!before) {\r
1429                     // decomp after-boundary: same as hasFCDBoundaryAfter(),\r
1430                     // fcd16<=1 || trailCC==0\r
1431                     if(firstUnit>0x1ff) {\r
1432                         return false;  // trailCC>1\r
1433                     }\r
1434                     if(firstUnit<=0xff) {\r
1435                         return true;  // trailCC==0\r
1436                     }\r
1437                     // if(trailCC==1) test leadCC==0, same as checking for before-boundary\r
1438                 }\r
1439                 // true if leadCC==0 (hasFCDBoundaryBefore())\r
1440                 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16)&0xff00)==0;\r
1441             }\r
1442         }\r
1443     }\r
1444     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }\r
1445 \r
1446     public boolean hasCompBoundaryBefore(int c) {\r
1447         return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));\r
1448     }\r
1449     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {\r
1450         for(;;) {\r
1451             int norm16=getNorm16(c);\r
1452             if(isInert(norm16)) {\r
1453                 return true;\r
1454             } else if(norm16<=minYesNo) {\r
1455                 // Hangul LVT (==minYesNo) has a boundary after it.\r
1456                 // Hangul LV and non-inert yesYes characters combine forward.\r
1457                 return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);\r
1458             } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {\r
1459                 return false;\r
1460             } else if(isDecompNoAlgorithmic(norm16)) {\r
1461                 c=mapAlgorithmic(c, norm16);\r
1462             } else {\r
1463                 // c decomposes, get everything from the variable-length extra data.\r
1464                 // If testInert, then c must be a yesNo character which has lccc=0,\r
1465                 // otherwise it could be a noNo.\r
1466                 int firstUnit=extraData.charAt(norm16);\r
1467                 // true if\r
1468                 //      c is not deleted, and\r
1469                 //      it and its decomposition do not combine forward, and it has a starter, and\r
1470                 //      if FCC then trailCC<=1\r
1471                 return\r
1472                     (firstUnit&MAPPING_LENGTH_MASK)!=0 &&\r
1473                     (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&\r
1474                     (!onlyContiguous || firstUnit<=0x1ff);\r
1475             }\r
1476         }\r
1477     }\r
1478 \r
1479     public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }\r
1480     public boolean hasFCDBoundaryAfter(int c) {\r
1481         int fcd16=getFCD16(c);\r
1482         return fcd16<=1 || (fcd16&0xff)==0;\r
1483     }\r
1484     public boolean isFCDInert(int c) { return getFCD16(c)<=1; }\r
1485 \r
1486     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }\r
1487     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }\r
1488     private static boolean isInert(int norm16) { return norm16==0; }\r
1489     // static UBool isJamoL(uint16_t norm16) const { return norm16==1; }\r
1490     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }\r
1491     private boolean isHangul(int norm16) { return norm16==minYesNo; }\r
1492     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }\r
1493     // UBool isCompYes(uint16_t norm16) const {\r
1494     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;\r
1495     // }\r
1496     // UBool isCompYesOrMaybe(uint16_t norm16) const {\r
1497     //     return norm16<minNoNo || minMaybeYes<=norm16;\r
1498     // }\r
1499     // private boolean hasZeroCCFromDecompYes(int norm16) {\r
1500     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;\r
1501     // }\r
1502     private boolean isDecompYesAndZeroCC(int norm16) {\r
1503         return norm16<minYesNo ||\r
1504                norm16==JAMO_VT ||\r
1505                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);\r
1506     }\r
1507     /**\r
1508      * A little faster and simpler than isDecompYesAndZeroCC() but does not include\r
1509      * the MaybeYes which combine-forward and have ccc=0.\r
1510      * (Standard Unicode 5.2 normalization does not have such characters.)\r
1511      */\r
1512     private boolean isMostDecompYesAndZeroCC(int norm16) {\r
1513         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;\r
1514     }\r
1515     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }\r
1516 \r
1517     // For use with isCompYes().\r
1518     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.\r
1519     // static uint8_t getCCFromYes(uint16_t norm16) {\r
1520     //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;\r
1521     // }\r
1522     private int getCCFromNoNo(int norm16) {\r
1523         if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {\r
1524             return extraData.charAt(norm16+1)&0xff;\r
1525         } else {\r
1526             return 0;\r
1527         }\r
1528     }\r
1529     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()\r
1530     int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {\r
1531         int c;\r
1532         if(cpStart==(cpLimit-1)) {\r
1533             c=s.charAt(cpStart);\r
1534         } else {\r
1535             c=Character.codePointAt(s, cpStart);\r
1536         }\r
1537         int prevNorm16=getNorm16(c);\r
1538         if(prevNorm16<=minYesNo) {\r
1539             return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0\r
1540         } else {\r
1541             return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo\r
1542         }\r
1543     }\r
1544 \r
1545     // Requires algorithmic-NoNo.\r
1546     private int mapAlgorithmic(int c, int norm16) {\r
1547         return c+norm16-(minMaybeYes-MAX_DELTA-1);\r
1548     }\r
1549 \r
1550     // Requires minYesNo<norm16<limitNoNo.\r
1551     // private int getMapping(int norm16) { return /*extraData+*/norm16; }\r
1552 \r
1553     /**\r
1554      * @return index into maybeYesCompositions, or -1\r
1555      */\r
1556     private int getCompositionsListForDecompYes(int norm16) {\r
1557         if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {\r
1558             return -1;\r
1559         } else {\r
1560             if((norm16-=minMaybeYes)<0) {\r
1561                 // norm16<minMaybeYes: index into extraData which is a substring at\r
1562                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]\r
1563                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16\r
1564                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list\r
1565             }\r
1566             return norm16;\r
1567         }\r
1568     }\r
1569     /**\r
1570      * @return index into maybeYesCompositions\r
1571      */\r
1572     private int getCompositionsListForComposite(int norm16) {\r
1573         // composite has both mapping & compositions list\r
1574         int firstUnit=extraData.charAt(norm16);\r
1575         return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions\r
1576             1+  // +1 to skip the first unit with the mapping lenth\r
1577             (firstUnit&MAPPING_LENGTH_MASK)+  // + mapping length\r
1578             ((firstUnit>>7)&1);  // +1 if MAPPING_HAS_CCC_LCCC_WORD\r
1579     }\r
1580     /**\r
1581      * @param c code point must have compositions\r
1582      * @return index into maybeYesCompositions\r
1583      */\r
1584     private int getCompositionsList(int norm16) {\r
1585         return isDecompYes(norm16) ?\r
1586                 getCompositionsListForDecompYes(norm16) :\r
1587                 getCompositionsListForComposite(norm16);\r
1588     }\r
1589 \r
1590     // Decompose a short piece of text which is likely to contain characters that\r
1591     // fail the quick check loop and/or where the quick check loop's overhead\r
1592     // is unlikely to be amortized.\r
1593     // Called by the compose() and makeFCD() implementations.\r
1594     // Public in Java for collation implementation code.\r
1595     public void decomposeShort(CharSequence s, int src, int limit,\r
1596                                ReorderingBuffer buffer) {\r
1597         while(src<limit) {\r
1598             int c=Character.codePointAt(s, src);\r
1599             src+=Character.charCount(c);\r
1600             decompose(c, getNorm16(c), buffer);\r
1601         }\r
1602     }\r
1603     private void decompose(int c, int norm16,\r
1604                            ReorderingBuffer buffer) {\r
1605         // Only loops for 1:1 algorithmic mappings.\r
1606         for(;;) {\r
1607             // get the decomposition and the lead and trail cc's\r
1608             if(isDecompYes(norm16)) {\r
1609                 // c does not decompose\r
1610                 buffer.append(c, getCCFromYesOrMaybe(norm16));\r
1611             } else if(isHangul(norm16)) {\r
1612                 // Hangul syllable: decompose algorithmically\r
1613                 Hangul.decompose(c, buffer);\r
1614             } else if(isDecompNoAlgorithmic(norm16)) {\r
1615                 c=mapAlgorithmic(c, norm16);\r
1616                 norm16=getNorm16(c);\r
1617                 continue;\r
1618             } else {\r
1619                 // c decomposes, get everything from the variable-length extra data\r
1620                 int firstUnit=extraData.charAt(norm16++);\r
1621                 int length=firstUnit&MAPPING_LENGTH_MASK;\r
1622                 int leadCC, trailCC;\r
1623                 trailCC=firstUnit>>8;\r
1624                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {\r
1625                     leadCC=extraData.charAt(norm16++)>>8;\r
1626                 } else {\r
1627                     leadCC=0;\r
1628                 }\r
1629                 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);\r
1630             }\r
1631             return;\r
1632         }\r
1633     }\r
1634 \r
1635     /*\r
1636      * Finds the recomposition result for\r
1637      * a forward-combining "lead" character,\r
1638      * specified with a pointer to its compositions list,\r
1639      * and a backward-combining "trail" character.\r
1640      *\r
1641      * If the lead and trail characters combine, then this function returns\r
1642      * the following "compositeAndFwd" value:\r
1643      * Bits 21..1  composite character\r
1644      * Bit      0  set if the composite is a forward-combining starter\r
1645      * otherwise it returns -1.\r
1646      *\r
1647      * The compositions list has (trail, compositeAndFwd) pair entries,\r
1648      * encoded as either pairs or triples of 16-bit units.\r
1649      * The last entry has the high bit of its first unit set.\r
1650      *\r
1651      * The list is sorted by ascending trail characters (there are no duplicates).\r
1652      * A linear search is used.\r
1653      *\r
1654      * See normalizer2impl.h for a more detailed description\r
1655      * of the compositions list format.\r
1656      */\r
1657     private static int combine(String compositions, int list, int trail) {\r
1658         int key1, firstUnit;\r
1659         if(trail<COMP_1_TRAIL_LIMIT) {\r
1660             // trail character is 0..33FF\r
1661             // result entry may have 2 or 3 units\r
1662             key1=(trail<<1);\r
1663             while(key1>(firstUnit=compositions.charAt(list))) {\r
1664                 list+=2+(firstUnit&COMP_1_TRIPLE);\r
1665             }\r
1666             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {\r
1667                 if((firstUnit&COMP_1_TRIPLE)!=0) {\r
1668                     return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);\r
1669                 } else {\r
1670                     return compositions.charAt(list+1);\r
1671                 }\r
1672             }\r
1673         } else {\r
1674             // trail character is 3400..10FFFF\r
1675             // result entry has 3 units\r
1676             key1=COMP_1_TRAIL_LIMIT+((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE;\r
1677             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;\r
1678             int secondUnit;\r
1679             for(;;) {\r
1680                 if(key1>(firstUnit=compositions.charAt(list))) {\r
1681                     list+=2+(firstUnit&COMP_1_TRIPLE);\r
1682                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {\r
1683                     if(key2>(secondUnit=compositions.charAt(list+1))) {\r
1684                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {\r
1685                             break;\r
1686                         } else {\r
1687                             list+=3;\r
1688                         }\r
1689                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {\r
1690                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);\r
1691                     } else {\r
1692                         break;\r
1693                     }\r
1694                 } else {\r
1695                     break;\r
1696                 }\r
1697             }\r
1698         }\r
1699         return -1;\r
1700     }\r
1701     /**\r
1702      * @param c Character which has compositions\r
1703      * @param set recursively receives the composites from c's compositions\r
1704      */\r
1705     private void addComposites(int list, UnicodeSet set) {\r
1706         int firstUnit, compositeAndFwd;\r
1707         do {\r
1708             firstUnit=maybeYesCompositions.charAt(list);\r
1709             if((firstUnit&COMP_1_TRIPLE)==0) {\r
1710                 compositeAndFwd=maybeYesCompositions.charAt(list+1);\r
1711                 list+=2;\r
1712             } else {\r
1713                 compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|\r
1714                                 maybeYesCompositions.charAt(list+2);\r
1715                 list+=3;\r
1716             }\r
1717             int composite=compositeAndFwd>>1;\r
1718             if((compositeAndFwd&1)!=0) {\r
1719                 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);\r
1720             }\r
1721             set.add(composite);\r
1722         } while((firstUnit&COMP_1_LAST_TUPLE)==0);\r
1723     }\r
1724     /*\r
1725      * Recomposes the buffer text starting at recomposeStartIndex\r
1726      * (which is in NFD - decomposed and canonically ordered),\r
1727      * and truncates the buffer contents.\r
1728      *\r
1729      * Note that recomposition never lengthens the text:\r
1730      * Any character consists of either one or two code units;\r
1731      * a composition may contain at most one more code unit than the original starter,\r
1732      * while the combining mark that is removed has at least one code unit.\r
1733      */\r
1734     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,\r
1735                            boolean onlyContiguous) {\r
1736         StringBuilder sb=buffer.getStringBuilder();\r
1737         int p=recomposeStartIndex;\r
1738         if(p==sb.length()) {\r
1739             return;\r
1740         }\r
1741 \r
1742         int starter, pRemove;\r
1743         int compositionsList;\r
1744         int c, compositeAndFwd;\r
1745         int norm16;\r
1746         int cc, prevCC;\r
1747         boolean starterIsSupplementary;\r
1748 \r
1749         // Some of the following variables are not used until we have a forward-combining starter\r
1750         // and are only initialized now to avoid compiler warnings.\r
1751         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter\r
1752         starter=-1;\r
1753         starterIsSupplementary=false;\r
1754         prevCC=0;\r
1755 \r
1756         for(;;) {\r
1757             c=sb.codePointAt(p);\r
1758             p+=Character.charCount(c);\r
1759             norm16=getNorm16(c);\r
1760             cc=getCCFromYesOrMaybe(norm16);\r
1761             if( // this character combines backward and\r
1762                 isMaybe(norm16) &&\r
1763                 // we have seen a starter that combines forward and\r
1764                 compositionsList>=0 &&\r
1765                 // the backward-combining character is not blocked\r
1766                 (prevCC<cc || prevCC==0)\r
1767             ) {\r
1768                 if(isJamoVT(norm16)) {\r
1769                     // c is a Jamo V/T, see if we can compose it with the previous character.\r
1770                     if(c<Hangul.JAMO_T_BASE) {\r
1771                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.\r
1772                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);\r
1773                         if(prev<Hangul.JAMO_L_COUNT) {\r
1774                             pRemove=p-1;\r
1775                             char syllable=(char)\r
1776                                 (Hangul.HANGUL_BASE+\r
1777                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*\r
1778                                  Hangul.JAMO_T_COUNT);\r
1779                             char t;\r
1780                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {\r
1781                                 ++p;\r
1782                                 syllable+=t;  // The next character was a Jamo T.\r
1783                             }\r
1784                             sb.setCharAt(starter, syllable);\r
1785                             // remove the Jamo V/T\r
1786                             sb.delete(pRemove, p);\r
1787                             p=pRemove;\r
1788                         }\r
1789                     }\r
1790                     /*\r
1791                      * No "else" for Jamo T:\r
1792                      * Since the input is in NFD, there are no Hangul LV syllables that\r
1793                      * a Jamo T could combine with.\r
1794                      * All Jamo Ts are combined above when handling Jamo Vs.\r
1795                      */\r
1796                     if(p==sb.length()) {\r
1797                         break;\r
1798                     }\r
1799                     compositionsList=-1;\r
1800                     continue;\r
1801                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {\r
1802                     // The starter and the combining mark (c) do combine.\r
1803                     int composite=compositeAndFwd>>1;\r
1804 \r
1805                     // Remove the combining mark.\r
1806                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark\r
1807                     sb.delete(pRemove, p);\r
1808                     p=pRemove;\r
1809                     // Replace the starter with the composite.\r
1810                     if(starterIsSupplementary) {\r
1811                         if(composite>0xffff) {\r
1812                             // both are supplementary\r
1813                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));\r
1814                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));\r
1815                         } else {\r
1816                             sb.setCharAt(starter, (char)c);\r
1817                             sb.deleteCharAt(starter+1);\r
1818                             // The composite is shorter than the starter,\r
1819                             // move the intermediate characters forward one.\r
1820                             starterIsSupplementary=false;\r
1821                             --p;\r
1822                         }\r
1823                     } else if(composite>0xffff) {\r
1824                         // The composite is longer than the starter,\r
1825                         // move the intermediate characters back one.\r
1826                         starterIsSupplementary=true;\r
1827                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));\r
1828                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));\r
1829                         ++p;\r
1830                     } else {\r
1831                         // both are on the BMP\r
1832                         sb.setCharAt(starter, (char)composite);\r
1833                     }\r
1834 \r
1835                     // Keep prevCC because we removed the combining mark.\r
1836 \r
1837                     if(p==sb.length()) {\r
1838                         break;\r
1839                     }\r
1840                     // Is the composite a starter that combines forward?\r
1841                     if((compositeAndFwd&1)!=0) {\r
1842                         compositionsList=\r
1843                             getCompositionsListForComposite(getNorm16(composite));\r
1844                     } else {\r
1845                         compositionsList=-1;\r
1846                     }\r
1847 \r
1848                     // We combined; continue with looking for compositions.\r
1849                     continue;\r
1850                 }\r
1851             }\r
1852 \r
1853             // no combination this time\r
1854             prevCC=cc;\r
1855             if(p==sb.length()) {\r
1856                 break;\r
1857             }\r
1858 \r
1859             // If c did not combine, then check if it is a starter.\r
1860             if(cc==0) {\r
1861                 // Found a new starter.\r
1862                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {\r
1863                     // It may combine with something, prepare for it.\r
1864                     if(c<=0xffff) {\r
1865                         starterIsSupplementary=false;\r
1866                         starter=p-1;\r
1867                     } else {\r
1868                         starterIsSupplementary=true;\r
1869                         starter=p-2;\r
1870                     }\r
1871                 }\r
1872             } else if(onlyContiguous) {\r
1873                 // FCC: no discontiguous compositions; any intervening character blocks.\r
1874                 compositionsList=-1;\r
1875             }\r
1876         }\r
1877         buffer.flush();\r
1878     }\r
1879 \r
1880     /**\r
1881      * Does c have a composition boundary before it?\r
1882      * True if its decomposition begins with a character that has\r
1883      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).\r
1884      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes\r
1885      * (isCompYesAndZeroCC()) so we need not decompose.\r
1886      */\r
1887     private boolean hasCompBoundaryBefore(int c, int norm16) {\r
1888         for(;;) {\r
1889             if(isCompYesAndZeroCC(norm16)) {\r
1890                 return true;\r
1891             } else if(isMaybeOrNonZeroCC(norm16)) {\r
1892                 return false;\r
1893             } else if(isDecompNoAlgorithmic(norm16)) {\r
1894                 c=mapAlgorithmic(c, norm16);\r
1895                 norm16=getNorm16(c);\r
1896             } else {\r
1897                 // c decomposes, get everything from the variable-length extra data\r
1898                 int firstUnit=extraData.charAt(norm16++);\r
1899                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {\r
1900                     return false;\r
1901                 }\r
1902                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16++)&0xff00)!=0) {\r
1903                     return false;  // non-zero leadCC\r
1904                 }\r
1905                 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16)));\r
1906             }\r
1907         }\r
1908     }\r
1909     private int findPreviousCompBoundary(CharSequence s, int p) {\r
1910         while(p>0) {\r
1911             int c=Character.codePointBefore(s, p);\r
1912             p-=Character.charCount(c);\r
1913             if(hasCompBoundaryBefore(c)) {\r
1914                 break;\r
1915             }\r
1916             // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,\r
1917             // but that's probably not worth the extra cost.\r
1918         }\r
1919         return p;\r
1920     }\r
1921     private int findNextCompBoundary(CharSequence s, int p, int limit) {\r
1922         while(p<limit) {\r
1923             int c=Character.codePointAt(s, p);\r
1924             int norm16=normTrie.get(c);\r
1925             if(hasCompBoundaryBefore(c, norm16)) {\r
1926                 break;\r
1927             }\r
1928             p+=Character.charCount(c);\r
1929         }\r
1930         return p;\r
1931     }\r
1932 \r
1933     private int findPreviousFCDBoundary(CharSequence s, int p) {\r
1934         while(p>0) {\r
1935             int c=Character.codePointBefore(s, p);\r
1936             p-=Character.charCount(c);\r
1937             if(fcdTrie.get(c)<=0xff) {\r
1938                 break;\r
1939             }\r
1940         }\r
1941         return p;\r
1942     }\r
1943     private int findNextFCDBoundary(CharSequence s, int p, int limit) {\r
1944         while(p<limit) {\r
1945             int c=Character.codePointAt(s, p);\r
1946             int fcd16=fcdTrie.get(c);\r
1947             if(fcd16<=0xff) {\r
1948                 break;\r
1949             }\r
1950             p+=Character.charCount(c);\r
1951         }\r
1952         return p;\r
1953     }\r
1954 \r
1955     private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {\r
1956         int canonValue=newData.get(decompLead);\r
1957         if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {\r
1958             // origin is the first character whose decomposition starts with\r
1959             // the character for which we are setting the value.\r
1960             newData.set(decompLead, canonValue|origin);\r
1961         } else {\r
1962             // origin is not the first character, or it is U+0000.\r
1963             UnicodeSet set;\r
1964             if((canonValue&CANON_HAS_SET)==0) {\r
1965                 int firstOrigin=canonValue&CANON_VALUE_MASK;\r
1966                 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();\r
1967                 newData.set(decompLead, canonValue);\r
1968                 canonStartSets.add(set=new UnicodeSet());\r
1969                 if(firstOrigin!=0) {\r
1970                     set.add(firstOrigin);\r
1971                 }\r
1972             } else {\r
1973                 set=canonStartSets.get(canonValue&CANON_VALUE_MASK);\r
1974             }\r
1975             set.add(origin);\r
1976         }\r
1977     }\r
1978 \r
1979     @SuppressWarnings("unused")\r
1980     private VersionInfo dataVersion;\r
1981 \r
1982     // Code point thresholds for quick check codes.\r
1983     private int minDecompNoCP;\r
1984     private int minCompNoMaybeCP;\r
1985 \r
1986     // Norm16 value thresholds for quick check combinations and types of extra data.\r
1987     private int minYesNo;\r
1988     private int minNoNo;\r
1989     private int limitNoNo;\r
1990     private int minMaybeYes;\r
1991 \r
1992     private Trie2_16 normTrie;\r
1993     private String maybeYesCompositions;\r
1994     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters\r
1995 \r
1996     private Trie2_16 fcdTrie;\r
1997     private Trie2_32 canonIterData;\r
1998     private ArrayList<UnicodeSet> canonStartSets;\r
1999 \r
2000     // bits in canonIterData\r
2001     private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;\r
2002     private static final int CANON_HAS_COMPOSITIONS = 0x40000000;\r
2003     private static final int CANON_HAS_SET = 0x200000;\r
2004     private static final int CANON_VALUE_MASK = 0x1fffff;\r
2005 }\r