2 *******************************************************************************
\r
3 * Copyright (C) 2009-2010, International Business Machines
\r
4 * Corporation and others. All Rights Reserved.
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.BufferedInputStream;
\r
10 import java.io.DataInputStream;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.util.ArrayList;
\r
14 import java.util.Iterator;
\r
16 import com.ibm.icu.text.UTF16;
\r
17 import com.ibm.icu.text.UnicodeSet;
\r
18 import com.ibm.icu.util.VersionInfo;
\r
20 public final class Normalizer2Impl {
\r
21 public static final class Hangul {
\r
22 /* Korean Hangul and Jamo constants */
\r
23 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
\r
24 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
\r
25 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
\r
27 public static final int HANGUL_BASE=0xac00;
\r
29 public static final int JAMO_L_COUNT=19;
\r
30 public static final int JAMO_V_COUNT=21;
\r
31 public static final int JAMO_T_COUNT=28;
\r
33 public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
\r
34 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
\r
36 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
\r
38 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
\r
39 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
\r
41 public static boolean isHangul(int c) {
\r
42 return HANGUL_BASE<=c && c<HANGUL_LIMIT;
\r
44 public static boolean isHangulWithoutJamoT(char c) {
\r
46 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
\r
48 public static boolean isJamoL(int c) {
\r
49 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
\r
51 public static boolean isJamoV(int c) {
\r
52 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
\r
56 * Decomposes c, which must be a Hangul syllable, into buffer
\r
57 * and returns the length of the decomposition (2 or 3).
\r
59 public static int decompose(int c, Appendable buffer) {
\r
62 int c2=c%JAMO_T_COUNT;
\r
64 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
\r
65 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
\r
69 buffer.append((char)(JAMO_T_BASE+c2));
\r
72 } catch(IOException e) {
\r
73 // Will not occur because we do not write to I/O.
\r
74 throw new RuntimeException(e);
\r
80 * Writable buffer that takes care of canonical ordering.
\r
81 * Its Appendable methods behave like the C++ implementation's
\r
82 * appendZeroCC() methods.
\r
84 * If dest is a StringBuilder, then the buffer writes directly to it.
\r
85 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
\r
86 * until no further changes are necessary and whole segments are appended.
\r
87 * append() methods that take combining-class values always write to the StringBuilder.
\r
88 * Other append() methods flush and append to the Appendable.
\r
90 public static final class ReorderingBuffer implements Appendable {
\r
91 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
\r
94 if(app instanceof StringBuilder) {
\r
95 appIsStringBuilder=true;
\r
96 str=(StringBuilder)dest;
\r
97 // In Java, the constructor subsumes public void init(int destCapacity) {
\r
98 str.ensureCapacity(destCapacity);
\r
100 if(str.length()==0) {
\r
104 lastCC=previousCC();
\r
105 // Set reorderStart after the last code point with cc<=1 if there is one.
\r
107 while(previousCC()>1) {}
\r
109 reorderStart=codePointLimit;
\r
112 appIsStringBuilder=false;
\r
113 str=new StringBuilder();
\r
119 public boolean isEmpty() { return str.length()==0; }
\r
120 public int length() { return str.length(); }
\r
121 public int getLastCC() { return lastCC; }
\r
123 public StringBuilder getStringBuilder() { return str; }
\r
125 public boolean equals(CharSequence s, int start, int limit) {
\r
126 return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
\r
129 // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
\r
130 public void setLastChar(char c) {
\r
131 str.setCharAt(str.length()-1, c);
\r
134 public void append(int c, int cc) {
\r
135 if(lastCC<=cc || cc==0) {
\r
136 str.appendCodePoint(c);
\r
139 reorderStart=str.length();
\r
145 // s must be in NFD, otherwise change the implementation.
\r
146 public void append(CharSequence s, int start, int limit,
\r
147 int leadCC, int trailCC) {
\r
151 if(lastCC<=leadCC || leadCC==0) {
\r
153 reorderStart=str.length()+(limit-start);
\r
154 } else if(leadCC<=1) {
\r
155 reorderStart=str.length()+1; // Ok if not a code point boundary.
\r
157 str.append(s, start, limit);
\r
160 int c=Character.codePointAt(s, start);
\r
161 start+=Character.charCount(c);
\r
162 insert(c, leadCC); // insert first code point
\r
163 while(start<limit) {
\r
164 c=Character.codePointAt(s, start);
\r
165 start+=Character.charCount(c);
\r
167 // s must be in NFD, otherwise we need to use getCC().
\r
168 leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
\r
176 // The following append() methods work like C++ appendZeroCC().
\r
177 // They assume that the cc or trailCC of their input is 0.
\r
178 // Most of them implement Appendable interface methods.
\r
179 // @Override when we switch to Java 6
\r
180 public ReorderingBuffer append(char c) {
\r
183 reorderStart=str.length();
\r
186 public void appendZeroCC(int c) {
\r
187 str.appendCodePoint(c);
\r
189 reorderStart=str.length();
\r
191 // @Override when we switch to Java 6
\r
192 public ReorderingBuffer append(CharSequence s) {
\r
193 if(s.length()!=0) {
\r
196 reorderStart=str.length();
\r
200 // @Override when we switch to Java 6
\r
201 public ReorderingBuffer append(CharSequence s, int start, int limit) {
\r
203 str.append(s, start, limit);
\r
205 reorderStart=str.length();
\r
210 * Flushes from the intermediate StringBuilder to the Appendable,
\r
211 * if they are different objects.
\r
212 * Used after recomposition.
\r
213 * Must be called at the end when writing to a non-StringBuilder Appendable.
\r
215 public void flush() {
\r
216 if(appIsStringBuilder) {
\r
217 reorderStart=str.length();
\r
223 } catch(IOException e) {
\r
224 throw new RuntimeException(e); // Avoid declaring "throws IOException".
\r
230 * Flushes from the intermediate StringBuilder to the Appendable,
\r
231 * if they are different objects.
\r
232 * Then appends the new text to the Appendable or StringBuilder.
\r
233 * Normally used after quick check loops find a non-empty sequence.
\r
235 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
\r
236 if(appIsStringBuilder) {
\r
237 str.append(s, start, limit);
\r
238 reorderStart=str.length();
\r
241 app.append(str).append(s, start, limit);
\r
244 } catch(IOException e) {
\r
245 throw new RuntimeException(e); // Avoid declaring "throws IOException".
\r
251 public void remove() {
\r
256 public void removeSuffix(int suffixLength) {
\r
257 int oldLength=str.length();
\r
258 str.delete(oldLength-suffixLength, oldLength);
\r
260 reorderStart=str.length();
\r
264 * TODO: Revisit whether it makes sense to track reorderStart.
\r
265 * It is set to after the last known character with cc<=1,
\r
266 * which stops previousCC() before it reads that character and looks up its cc.
\r
267 * previousCC() is normally only called from insert().
\r
268 * In other words, reorderStart speeds up the insertion of a combining mark
\r
269 * into a multi-combining mark sequence where it does not belong at the end.
\r
270 * This might not be worth the trouble.
\r
271 * On the other hand, it's not a huge amount of trouble.
\r
273 * We probably need it for UNORM_SIMPLE_APPEND.
\r
276 // Inserts c somewhere before the last character.
\r
277 // Requires 0<cc<lastCC which implies reorderStart<limit.
\r
278 private void insert(int c, int cc) {
\r
279 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
\r
280 // insert c at codePointLimit, after the character with prevCC<=cc
\r
282 str.insert(codePointLimit, (char)c);
\r
284 reorderStart=codePointLimit+1;
\r
287 str.insert(codePointLimit, Character.toChars(c));
\r
289 reorderStart=codePointLimit+2;
\r
294 private final Normalizer2Impl impl;
\r
295 private final Appendable app;
\r
296 private final StringBuilder str;
\r
297 private final boolean appIsStringBuilder;
\r
298 private int reorderStart;
\r
299 private int lastCC;
\r
301 // private backward iterator
\r
302 private void setIterator() { codePointStart=str.length(); }
\r
303 private void skipPrevious() { // Requires 0<codePointStart.
\r
304 codePointLimit=codePointStart;
\r
305 codePointStart=str.offsetByCodePoints(codePointStart, -1);
\r
307 private int previousCC() { // Returns 0 if there is no previous character.
\r
308 codePointLimit=codePointStart;
\r
309 if(reorderStart>=codePointStart) {
\r
312 int c=str.codePointBefore(codePointStart);
\r
313 codePointStart-=Character.charCount(c);
\r
314 if(c<MIN_CCC_LCCC_CP) {
\r
317 return getCCFromYesOrMaybe(impl.getNorm16(c));
\r
320 private int codePointStart, codePointLimit;
\r
323 // TODO: Propose as public API on the UTF16 class.
\r
324 // TODO: Propose widening UTF16 methods that take char to take int.
\r
325 // TODO: Propose widening UTF16 methods that take String to take CharSequence.
\r
326 public static final class UTF16Plus {
\r
328 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
\r
329 * is it a lead surrogate?
\r
330 * @param c code unit or code point
\r
331 * @return true or false
\r
334 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
\r
336 * Compares two CharSequence objects for binary equality.
\r
337 * @param s1 first sequence
\r
338 * @param s2 second sequence
\r
339 * @return true if s1 contains the same text as s2
\r
342 public static boolean equal(CharSequence s1, CharSequence s2) {
\r
343 int length=s1.length();
\r
344 if(length!=s2.length()) {
\r
347 for(int i=0; i<length; ++i) {
\r
348 if(s1.charAt(i)!=s2.charAt(i)) {
\r
355 * Compares two CharSequence subsequences for binary equality.
\r
356 * @param s1 first sequence
\r
357 * @param start1 start offset in first sequence
\r
358 * @param limit1 limit offset in first sequence
\r
359 * @param s2 second sequence
\r
360 * @param start2 start offset in second sequence
\r
361 * @param limit2 limit offset in second sequence
\r
362 * @return true if s1.subSequence(start1, limit1) contains the same text
\r
363 * as s2.subSequence(start2, limit2)
\r
366 public static boolean equal(CharSequence s1, int start1, int limit1,
\r
367 CharSequence s2, int start2, int limit2) {
\r
368 if((limit1-start1)!=(limit2-start2)) {
\r
371 while(start1<limit1) {
\r
372 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
\r
380 public Normalizer2Impl() {}
\r
382 private static final class Reader implements ICUBinary.Authenticate {
\r
383 // @Override when we switch to Java 6
\r
384 public boolean isDataVersionAcceptable(byte version[]) {
\r
385 return version[0]==1;
\r
387 public VersionInfo readHeader(InputStream data) throws IOException {
\r
388 byte[] dataVersion=ICUBinary.readHeader(data, DATA_FORMAT, this);
\r
389 return VersionInfo.getInstance(dataVersion[0], dataVersion[1],
\r
390 dataVersion[2], dataVersion[3]);
\r
392 private static final byte DATA_FORMAT[] = { 0x4e, 0x72, 0x6d, 0x32 }; // "Nrm2"
\r
394 private static final Reader READER=new Reader();
\r
395 public Normalizer2Impl load(InputStream data) {
\r
397 BufferedInputStream bis=new BufferedInputStream(data);
\r
398 dataVersion=READER.readHeader(bis);
\r
399 DataInputStream ds=new DataInputStream(bis);
\r
400 int indexesLength=ds.readInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4
\r
401 if(indexesLength<=IX_MIN_MAYBE_YES) {
\r
402 throw new IOException("Normalizer2 data: not enough indexes");
\r
404 int[] inIndexes=new int[indexesLength];
\r
405 inIndexes[0]=indexesLength*4;
\r
406 for(int i=1; i<indexesLength; ++i) {
\r
407 inIndexes[i]=ds.readInt();
\r
410 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
\r
411 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
\r
413 minYesNo=inIndexes[IX_MIN_YES_NO];
\r
414 minNoNo=inIndexes[IX_MIN_NO_NO];
\r
415 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
\r
416 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
\r
418 // Read the normTrie.
\r
419 int offset=inIndexes[IX_NORM_TRIE_OFFSET];
\r
420 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
\r
421 normTrie=Trie2_16.createFromSerialized(ds);
\r
422 int trieLength=normTrie.getSerializedLength();
\r
423 if(trieLength>(nextOffset-offset)) {
\r
424 throw new IOException("Normalizer2 data: not enough bytes for normTrie");
\r
426 ds.skipBytes((nextOffset-offset)-trieLength); // skip padding after trie bytes
\r
428 // Read the composition and mapping data.
\r
430 nextOffset=inIndexes[IX_RESERVED2_OFFSET];
\r
431 int numChars=(nextOffset-offset)/2;
\r
434 chars=new char[numChars];
\r
435 for(int i=0; i<numChars; ++i) {
\r
436 chars[i]=ds.readChar();
\r
438 maybeYesCompositions=new String(chars);
\r
439 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
\r
443 } catch(IOException e) {
\r
444 throw new RuntimeException(e);
\r
447 public Normalizer2Impl load(String name) {
\r
448 return load(ICUData.getRequiredStream(name));
\r
451 public void addPropertyStarts(UnicodeSet set) {
\r
452 /* add the start code point of each same-value range of each trie */
\r
453 Iterator<Trie2.Range> trieIterator=normTrie.iterator();
\r
455 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
\r
456 /* add the start code point to the USet */
\r
457 set.add(range.startCodePoint);
\r
460 /* add Hangul LV syllables and LV+1 because of skippables */
\r
461 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
\r
465 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
\r
468 public void addCanonIterPropertyStarts(UnicodeSet set) {
\r
469 /* add the start code point of each same-value range of the canonical iterator data trie */
\r
470 ensureCanonIterData();
\r
471 // currently only used for the SEGMENT_STARTER property
\r
472 Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
\r
474 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
\r
475 /* add the start code point to the USet */
\r
476 set.add(range.startCodePoint);
\r
479 private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
\r
480 public int map(int in) {
\r
481 return in&CANON_NOT_SEGMENT_STARTER;
\r
485 // low-level properties ------------------------------------------------ ***
\r
487 public Trie2_16 getNormTrie() { return normTrie; }
\r
488 public synchronized Trie2_16 getFCDTrie() {
\r
489 if(fcdTrie!=null) {
\r
492 Trie2Writable newFCDTrie=new Trie2Writable(0, 0);
\r
493 Iterator<Trie2.Range> trieIterator=normTrie.iterator();
\r
495 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
\r
496 // Set the FCD value for a range of same-norm16 characters.
\r
497 if(range.value!=0) {
\r
498 setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie);
\r
501 for(char lead=0xd800; lead<0xdc00; ++lead) {
\r
502 // Collect (OR together) the FCD values for a range of supplementary characters,
\r
503 // for their lead surrogate code unit.
\r
504 int oredValue=newFCDTrie.get(lead);
\r
505 trieIterator=normTrie.iteratorForLeadSurrogate(lead);
\r
506 while(trieIterator.hasNext()) {
\r
507 oredValue|=trieIterator.next().value;
\r
510 // Set a "bad" value for makeFCD() to break the quick check loop
\r
511 // and look up the value for the supplementary code point.
\r
512 // If there is any lccc, then set the worst-case lccc of 1.
\r
513 // The ORed-together value's tccc is already the worst case.
\r
514 if(oredValue>0xff) {
\r
515 oredValue=0x100|(oredValue&0xff);
\r
517 newFCDTrie.setForLeadSurrogateCodeUnit(lead, oredValue);
\r
520 return fcdTrie=newFCDTrie.toTrie2_16();
\r
523 public synchronized Normalizer2Impl ensureCanonIterData() {
\r
524 if(canonIterData==null) {
\r
525 Trie2Writable newData=new Trie2Writable(0, 0);
\r
526 canonStartSets=new ArrayList<UnicodeSet>();
\r
527 Iterator<Trie2.Range> trieIterator=normTrie.iterator();
\r
529 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
\r
530 final int norm16=range.value;
\r
531 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
\r
532 // Inert, or 2-way mapping (including Hangul syllable).
\r
533 // We do not write a canonStartSet for any yesNo character.
\r
534 // Composites from 2-way mappings are added at runtime from the
\r
535 // starter's compositions list, and the other characters in
\r
536 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
\r
537 // "maybe" characters.
\r
540 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
\r
541 final int oldValue=newData.get(c);
\r
542 int newValue=oldValue;
\r
543 if(norm16>=minMaybeYes) {
\r
544 // not a segment starter if it occurs in a decomposition or has cc!=0
\r
545 newValue|=CANON_NOT_SEGMENT_STARTER;
\r
546 if(norm16<MIN_NORMAL_MAYBE_YES) {
\r
547 newValue|=CANON_HAS_COMPOSITIONS;
\r
549 } else if(norm16<minYesNo) {
\r
550 newValue|=CANON_HAS_COMPOSITIONS;
\r
552 // c has a one-way decomposition
\r
554 int norm16_2=norm16;
\r
555 while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
\r
556 c2=this.mapAlgorithmic(c2, norm16_2);
\r
557 norm16_2=getNorm16(c2);
\r
559 if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
\r
560 // c decomposes, get everything from the variable-length extra data
\r
561 int firstUnit=extraData.charAt(norm16_2++);
\r
562 int length=firstUnit&MAPPING_LENGTH_MASK;
\r
563 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
\r
564 if(c==c2 && (extraData.charAt(norm16_2)&0xff)!=0) {
\r
565 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
\r
569 // Skip empty mappings (no characters in the decomposition).
\r
571 // add c to first code point's start set
\r
572 int limit=norm16_2+length;
\r
573 c2=extraData.codePointAt(norm16_2);
\r
574 addToStartSet(newData, c, c2);
\r
575 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
\r
576 // one-way mapping. A 2-way mapping is possible here after
\r
577 // intermediate algorithmic mapping.
\r
578 if(norm16_2>=minNoNo) {
\r
579 while((norm16_2+=Character.charCount(c2))<limit) {
\r
580 c2=extraData.codePointAt(norm16_2);
\r
581 int c2Value=newData.get(c2);
\r
582 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
\r
583 newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
\r
589 // c decomposed to c2 algorithmically; c has cc==0
\r
590 addToStartSet(newData, c, c2);
\r
593 if(newValue!=oldValue) {
\r
594 newData.set(c, newValue);
\r
598 canonIterData=newData.toTrie2_32();
\r
603 public int getNorm16(int c) { return normTrie.get(c); }
\r
605 public int getCompQuickCheck(int norm16) {
\r
606 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
\r
608 } else if(minMaybeYes<=norm16) {
\r
614 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
\r
615 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
\r
617 public int getCC(int norm16) {
\r
618 if(norm16>=MIN_NORMAL_MAYBE_YES) {
\r
619 return norm16&0xff;
\r
621 if(norm16<minNoNo || limitNoNo<=norm16) {
\r
624 return getCCFromNoNo(norm16);
\r
626 public static int getCCFromYesOrMaybe(int norm16) {
\r
627 return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
\r
630 public int getFCD16(int c) { return fcdTrie.get(c); }
\r
631 public int getFCD16FromSingleLead(char c) { return fcdTrie.getFromU16SingleLead(c); }
\r
633 void setFCD16FromNorm16(int start, int end, int norm16, Trie2Writable newFCDTrie) {
\r
634 // Only loops for 1:1 algorithmic mappings.
\r
636 if(norm16>=MIN_NORMAL_MAYBE_YES) {
\r
639 } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
\r
640 // no decomposition or Hangul syllable, all zeros
\r
642 } else if(limitNoNo<=norm16) {
\r
643 int delta=norm16-(minMaybeYes-MAX_DELTA-1);
\r
646 norm16=getNorm16(start);
\r
648 // the same delta leads from different original characters to different mappings
\r
651 setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie);
\r
652 } while(++start<=end);
\r
656 // c decomposes, get everything from the variable-length extra data
\r
657 int firstUnit=extraData.charAt(norm16);
\r
658 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
\r
659 // A character that is deleted (maps to an empty string) must
\r
660 // get the worst-case lccc and tccc values because arbitrary
\r
661 // characters on both sides will become adjacent.
\r
664 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
\r
665 norm16=extraData.charAt(norm16+1)&0xff00; // lccc
\r
669 norm16|=firstUnit>>8; // tccc
\r
672 newFCDTrie.setRange(start, end, norm16, true);
\r
678 * Get the decomposition for one code point.
\r
679 * @param c code point
\r
680 * @return c's decomposition, if it has one; returns null if it does not have a decomposition
\r
682 public String getDecomposition(int c) {
\r
686 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
\r
687 // c does not decompose
\r
688 } else if(isHangul(norm16)) {
\r
689 // Hangul syllable: decompose algorithmically
\r
690 StringBuilder buffer=new StringBuilder();
\r
691 Hangul.decompose(c, buffer);
\r
692 return buffer.toString();
\r
693 } else if(isDecompNoAlgorithmic(norm16)) {
\r
694 decomp=c=mapAlgorithmic(c, norm16);
\r
697 // c decomposes, get everything from the variable-length extra data
\r
698 int firstUnit=extraData.charAt(norm16++);
\r
699 int length=firstUnit&MAPPING_LENGTH_MASK;
\r
700 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
\r
703 return extraData.substring(norm16, norm16+length);
\r
708 return UTF16.valueOf(decomp);
\r
713 public boolean isCanonSegmentStarter(int c) {
\r
714 return canonIterData.get(c)>=0;
\r
716 public boolean getCanonStartSet(int c, UnicodeSet set) {
\r
717 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
\r
718 if(canonValue==0) {
\r
722 int value=canonValue&CANON_VALUE_MASK;
\r
723 if((canonValue&CANON_HAS_SET)!=0) {
\r
724 set.addAll(canonStartSets.get(value));
\r
725 } else if(value!=0) {
\r
728 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
\r
729 int norm16=getNorm16(c);
\r
730 if(norm16==JAMO_L) {
\r
731 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
\r
732 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
\r
734 addComposites(getCompositionsList(norm16), set);
\r
740 public static final int MIN_CCC_LCCC_CP=0x300;
\r
742 public static final int MIN_YES_YES_WITH_CC=0xff01;
\r
743 public static final int JAMO_VT=0xff00;
\r
744 public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
\r
745 public static final int JAMO_L=1;
\r
746 public static final int MAX_DELTA=0x40;
\r
748 // Byte offsets from the start of the data, after the generic header.
\r
749 public static final int IX_NORM_TRIE_OFFSET=0;
\r
750 public static final int IX_EXTRA_DATA_OFFSET=1;
\r
751 public static final int IX_RESERVED2_OFFSET=2;
\r
752 public static final int IX_TOTAL_SIZE=7;
\r
754 // Code point thresholds for quick check codes.
\r
755 public static final int IX_MIN_DECOMP_NO_CP=8;
\r
756 public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
\r
758 // Norm16 value thresholds for quick check combinations and types of extra data.
\r
759 public static final int IX_MIN_YES_NO=10;
\r
760 public static final int IX_MIN_NO_NO=11;
\r
761 public static final int IX_LIMIT_NO_NO=12;
\r
762 public static final int IX_MIN_MAYBE_YES=13;
\r
764 public static final int IX_COUNT=16;
\r
766 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
\r
767 public static final int MAPPING_PLUS_COMPOSITION_LIST=0x40;
\r
768 public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
\r
769 public static final int MAPPING_LENGTH_MASK=0x1f;
\r
771 public static final int COMP_1_LAST_TUPLE=0x8000;
\r
772 public static final int COMP_1_TRIPLE=1;
\r
773 public static final int COMP_1_TRAIL_LIMIT=0x3400;
\r
774 public static final int COMP_1_TRAIL_MASK=0x7ffe;
\r
775 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit
\r
776 public static final int COMP_2_TRAIL_SHIFT=6;
\r
777 public static final int COMP_2_TRAIL_MASK=0xffc0;
\r
779 // higher-level functionality ------------------------------------------ ***
\r
781 // Dual functionality:
\r
782 // buffer!=NULL: normalize
\r
783 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
\r
784 public int decompose(CharSequence s, int src, int limit,
\r
785 ReorderingBuffer buffer) {
\r
786 int minNoCP=minDecompNoCP;
\r
792 // only for quick check
\r
793 int prevBoundary=src;
\r
797 // count code units below the minimum or with irrelevant data for the quick check
\r
798 for(prevSrc=src; src!=limit;) {
\r
799 if( (c=s.charAt(src))<minNoCP ||
\r
800 isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
\r
803 } else if(!UTF16.isSurrogate((char)c)) {
\r
807 if(UTF16Plus.isSurrogateLead(c)) {
\r
808 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
\r
809 c=Character.toCodePoint((char)c, c2);
\r
811 } else /* trail surrogate */ {
\r
812 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
\r
814 c=Character.toCodePoint(c2, (char)c);
\r
817 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
\r
818 src+=Character.charCount(c);
\r
824 // copy these code units all at once
\r
827 buffer.flushAndAppendZeroCC(s, prevSrc, src);
\r
837 // Check one above-minimum, relevant code point.
\r
838 src+=Character.charCount(c);
\r
840 decompose(c, norm16, buffer);
\r
842 if(isDecompYes(norm16)) {
\r
843 int cc=getCCFromYesOrMaybe(norm16);
\r
844 if(prevCC<=cc || cc==0) {
\r
852 return prevBoundary; // "no" or cc out of order
\r
857 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
\r
858 int limit=s.length();
\r
863 decompose(s, 0, limit, buffer);
\r
866 // Just merge the strings at the boundary.
\r
867 int c=Character.codePointAt(s, 0);
\r
869 int firstCC, prevCC, cc;
\r
870 firstCC=prevCC=cc=getCC(getNorm16(c));
\r
873 src+=Character.charCount(c);
\r
877 c=Character.codePointAt(s, src);
\r
878 cc=getCC(getNorm16(c));
\r
880 buffer.append(s, 0, src, firstCC, prevCC);
\r
881 buffer.append(s, src, limit);
\r
883 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
\r
884 // doCompose: normalize
\r
885 // !doCompose: isNormalized (buffer must be empty and initialized)
\r
886 public boolean compose(CharSequence s, int src, int limit,
\r
887 boolean onlyContiguous,
\r
889 ReorderingBuffer buffer) {
\r
890 int minNoMaybeCP=minCompNoMaybeCP;
\r
893 * prevBoundary points to the last character before the current one
\r
894 * that has a composition boundary before it with ccc==0 and quick check "yes".
\r
895 * Keeping track of prevBoundary saves us looking for a composition boundary
\r
896 * when we find a "no" or "maybe".
\r
898 * When we back out from prevSrc back to prevBoundary,
\r
899 * then we also remove those same characters (which had been simply copied
\r
900 * or canonically-order-inserted) from the ReorderingBuffer.
\r
901 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
\r
902 * must correspond 1:1 to destination units at the end of the destination buffer.
\r
904 int prevBoundary=src;
\r
909 // only for isNormalized
\r
913 // count code units below the minimum or with irrelevant data for the quick check
\r
914 for(prevSrc=src; src!=limit;) {
\r
915 if( (c=s.charAt(src))<minNoMaybeCP ||
\r
916 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
\r
919 } else if(!UTF16.isSurrogate((char)c)) {
\r
923 if(UTF16Plus.isSurrogateLead(c)) {
\r
924 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
\r
925 c=Character.toCodePoint((char)c, c2);
\r
927 } else /* trail surrogate */ {
\r
928 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
\r
930 c=Character.toCodePoint(c2, (char)c);
\r
933 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
\r
934 src+=Character.charCount(c);
\r
940 // copy these code units all at once
\r
944 buffer.flushAndAppendZeroCC(s, prevSrc, src);
\r
948 // Set prevBoundary to the last character in the quick check loop.
\r
949 prevBoundary=src-1;
\r
950 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
\r
951 Character.isHighSurrogate(s.charAt(prevBoundary-1))
\r
956 // The last "quick check yes" character is excluded from the
\r
957 // flush-and-append call in case it needs to be modified.
\r
958 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
\r
959 buffer.append(s, prevBoundary, src);
\r
963 // The start of the current character (c).
\r
965 } else if(src==limit) {
\r
969 src+=Character.charCount(c);
\r
971 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
\r
972 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
\r
974 * Check for Jamo V/T, then for regular characters.
\r
975 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
\r
977 if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
\r
978 char prev=s.charAt(prevSrc-1);
\r
979 boolean needToDecompose=false;
\r
980 if(c<Hangul.JAMO_T_BASE) {
\r
981 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
\r
982 prev-=Hangul.JAMO_L_BASE;
\r
983 if(prev<Hangul.JAMO_L_COUNT) {
\r
987 char syllable=(char)
\r
988 (Hangul.HANGUL_BASE+
\r
989 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
\r
990 Hangul.JAMO_T_COUNT);
\r
992 if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
\r
994 syllable+=t; // The next character was a Jamo T.
\r
996 buffer.setLastChar(syllable);
\r
999 // If we see L+V+x where x!=T then we drop to the slow path,
\r
1000 // decompose and recompose.
\r
1001 // This is to deal with NFKC finding normal L and V but a
\r
1002 // compatibility variant of a T. We need to either fully compose that
\r
1003 // combination here (which would complicate the code and may not work
\r
1004 // with strange custom data) or use the slow path -- or else our replacing
\r
1005 // two input characters (L+V) with one output character (LV syllable)
\r
1006 // would violate the invariant that [prevBoundary..prevSrc[ has the same
\r
1007 // length as what we appended to the buffer since prevBoundary.
\r
1008 needToDecompose=true;
\r
1010 } else if(Hangul.isHangulWithoutJamoT(prev)) {
\r
1011 // c is a Jamo Trailing consonant,
\r
1012 // compose with previous Hangul LV that does not contain a Jamo T.
\r
1016 buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
\r
1020 if(!needToDecompose) {
\r
1021 // The Jamo V/T did not compose into a Hangul syllable.
\r
1023 buffer.append((char)c);
\r
1031 * Source buffer pointers:
\r
1033 * all done quick check current char not yet
\r
1034 * "yes" but (c) processed
\r
1037 * [-------------[-------------[-------------[-------------[
\r
1039 * orig. src prevBoundary prevSrc src limit
\r
1042 * Destination buffer pointers inside the ReorderingBuffer:
\r
1044 * all done might take not filled yet
\r
1047 * [-------------[-------------[-------------[
\r
1049 * start reorderStart limit |
\r
1052 if(norm16>=MIN_YES_YES_WITH_CC) {
\r
1053 int cc=norm16&0xff; // cc!=0
\r
1054 if( onlyContiguous && // FCC
\r
1055 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
\r
1056 prevBoundary<prevSrc &&
\r
1057 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
\r
1058 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
\r
1059 // passed the quick check "yes && ccc==0" test.
\r
1060 // Check whether the last character was a "yesYes" or a "yesNo".
\r
1061 // If a "yesNo", then we get its trailing ccc from its
\r
1062 // mapping and check for canonical order.
\r
1063 // All other cases are ok.
\r
1064 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
\r
1066 // Fails FCD test, need to decompose and contiguously recompose.
\r
1070 } else if(doCompose) {
\r
1071 buffer.append(c, cc);
\r
1073 } else if(prevCC<=cc) {
\r
1079 } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
\r
1084 * Find appropriate boundaries around this character,
\r
1085 * decompose the source text from between the boundaries,
\r
1086 * and recompose it.
\r
1088 * We may need to remove the last few characters from the ReorderingBuffer
\r
1089 * to account for source text that was copied or appended
\r
1090 * but needs to take part in the recomposition.
\r
1094 * Find the last composition boundary in [prevBoundary..src[.
\r
1095 * It is either the decomposition of the current character (at prevSrc),
\r
1096 * or prevBoundary.
\r
1098 if(hasCompBoundaryBefore(c, norm16)) {
\r
1099 prevBoundary=prevSrc;
\r
1100 } else if(doCompose) {
\r
1101 buffer.removeSuffix(prevSrc-prevBoundary);
\r
1104 // Find the next composition boundary in [src..limit[ -
\r
1105 // modifies src to point to the next starter.
\r
1106 src=findNextCompBoundary(s, src, limit);
\r
1108 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
\r
1109 int recomposeStartIndex=buffer.length();
\r
1110 decomposeShort(s, prevBoundary, src, buffer);
\r
1111 recompose(buffer, recomposeStartIndex, onlyContiguous);
\r
1113 if(!buffer.equals(s, prevBoundary, src)) {
\r
1120 // Move to the next starter. We never need to look back before this point again.
\r
1126 * Very similar to compose(): Make the same changes in both places if relevant.
\r
1127 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
\r
1128 * !doSpan: quickCheck
\r
1129 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
\r
1130 * bit 0: set if "maybe"; otherwise, if the span length<s.length()
\r
1131 * then the quick check result is "no"
\r
1133 public int composeQuickCheck(CharSequence s, int src, int limit,
\r
1134 boolean onlyContiguous, boolean doSpan) {
\r
1136 int minNoMaybeCP=minCompNoMaybeCP;
\r
1139 * prevBoundary points to the last character before the current one
\r
1140 * that has a composition boundary before it with ccc==0 and quick check "yes".
\r
1142 int prevBoundary=src;
\r
1149 // count code units below the minimum or with irrelevant data for the quick check
\r
1150 for(prevSrc=src;;) {
\r
1152 return (src<<1)|qcResult; // "yes" or "maybe"
\r
1154 if( (c=s.charAt(src))<minNoMaybeCP ||
\r
1155 isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
\r
1158 } else if(!UTF16.isSurrogate((char)c)) {
\r
1162 if(UTF16Plus.isSurrogateLead(c)) {
\r
1163 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
\r
1164 c=Character.toCodePoint((char)c, c2);
\r
1166 } else /* trail surrogate */ {
\r
1167 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
\r
1169 c=Character.toCodePoint(c2, (char)c);
\r
1172 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
\r
1173 src+=Character.charCount(c);
\r
1179 if(src!=prevSrc) {
\r
1180 // Set prevBoundary to the last character in the quick check loop.
\r
1181 prevBoundary=src-1;
\r
1182 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
\r
1183 Character.isHighSurrogate(s.charAt(prevBoundary-1))
\r
1188 // The start of the current character (c).
\r
1192 src+=Character.charCount(c);
\r
1194 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
\r
1195 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
\r
1198 if(isMaybeOrNonZeroCC(norm16)) {
\r
1199 int cc=getCCFromYesOrMaybe(norm16);
\r
1200 if( onlyContiguous && // FCC
\r
1203 prevBoundary<prevSrc &&
\r
1204 // prevCC==0 && prevBoundary<prevSrc tell us that
\r
1205 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
\r
1206 // passed the quick check "yes && ccc==0" test.
\r
1207 // Check whether the last character was a "yesYes" or a "yesNo".
\r
1208 // If a "yesNo", then we get its trailing ccc from its
\r
1209 // mapping and check for canonical order.
\r
1210 // All other cases are ok.
\r
1211 getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
\r
1213 // Fails FCD test.
\r
1214 } else if(prevCC<=cc || cc==0) {
\r
1216 if(norm16<MIN_YES_YES_WITH_CC) {
\r
1220 return prevBoundary<<1; // spanYes does not care to know it's "maybe"
\r
1226 return prevBoundary<<1; // "no"
\r
1229 public void composeAndAppend(CharSequence s,
\r
1230 boolean doCompose,
\r
1231 boolean onlyContiguous,
\r
1232 ReorderingBuffer buffer) {
\r
1233 int src=0, limit=s.length();
\r
1234 if(!buffer.isEmpty()) {
\r
1235 int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
\r
1236 if(0!=firstStarterInSrc) {
\r
1237 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
\r
1239 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
\r
1240 firstStarterInSrc+16);
\r
1241 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
\r
1242 buffer.removeSuffix(buffer.length()-lastStarterInDest);
\r
1243 middle.append(s, 0, firstStarterInSrc);
\r
1244 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
\r
1245 src=firstStarterInSrc;
\r
1249 compose(s, src, limit, onlyContiguous, true, buffer);
\r
1251 buffer.append(s, src, limit);
\r
1254 // Dual functionality:
\r
1255 // buffer!=NULL: normalize
\r
1256 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
\r
1257 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
\r
1258 // Note: In this function we use buffer->appendZeroCC() because we track
\r
1259 // the lead and trail combining classes here, rather than leaving it to
\r
1260 // the ReorderingBuffer.
\r
1261 // The exception is the call to decomposeShort() which uses the buffer
\r
1262 // in the normal way.
\r
1264 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
\r
1265 // Similar to the prevBoundary in the compose() implementation.
\r
1266 int prevBoundary=src;
\r
1273 // count code units with lccc==0
\r
1274 for(prevSrc=src; src!=limit;) {
\r
1275 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
\r
1278 } else if((fcd16=fcdTrie.getFromU16SingleLead((char)c))<=0xff) {
\r
1281 } else if(!UTF16.isSurrogate((char)c)) {
\r
1285 if(UTF16Plus.isSurrogateLead(c)) {
\r
1286 if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
\r
1287 c=Character.toCodePoint((char)c, c2);
\r
1289 } else /* trail surrogate */ {
\r
1290 if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
\r
1292 c=Character.toCodePoint(c2, (char)c);
\r
1295 if((fcd16=getFCD16(c))<=0xff) {
\r
1297 src+=Character.charCount(c);
\r
1303 // copy these code units all at once
\r
1304 if(src!=prevSrc) {
\r
1306 if(buffer!=null) {
\r
1307 buffer.flushAndAppendZeroCC(s, prevSrc, src);
\r
1312 // We know that the previous character's lccc==0.
\r
1314 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
\r
1315 prevFCD16=getFCD16FromSingleLead((char)~prevFCD16);
\r
1321 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
\r
1322 Character.isHighSurrogate(s.charAt(p-1))
\r
1325 // Need to fetch the previous character's FCD value because
\r
1326 // prevFCD16 was just for the trail surrogate code point.
\r
1327 prevFCD16=getFCD16(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
\r
1328 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
\r
1334 if(buffer!=null) {
\r
1335 // The last lccc==0 character is excluded from the
\r
1336 // flush-and-append call in case it needs to be modified.
\r
1337 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
\r
1338 buffer.append(s, prevBoundary, src);
\r
1340 // The start of the current character (c).
\r
1342 } else if(src==limit) {
\r
1346 src+=Character.charCount(c);
\r
1347 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
\r
1348 // Check for proper order, and decompose locally if necessary.
\r
1349 if((prevFCD16&0xff)<=(fcd16>>8)) {
\r
1350 // proper order: prev tccc <= current lccc
\r
1351 if((fcd16&0xff)<=1) {
\r
1354 if(buffer!=null) {
\r
1355 buffer.appendZeroCC(c);
\r
1359 } else if(buffer==null) {
\r
1360 return prevBoundary; // quick check "no"
\r
1363 * Back out the part of the source that we copied or appended
\r
1364 * already but is now going to be decomposed.
\r
1365 * prevSrc is set to after what was copied/appended.
\r
1367 buffer.removeSuffix(prevSrc-prevBoundary);
\r
1369 * Find the part of the source that needs to be decomposed,
\r
1370 * up to the next safe boundary.
\r
1372 src=findNextFCDBoundary(s, src, limit);
\r
1374 * The source text does not fulfill the conditions for FCD.
\r
1375 * Decompose and reorder a limited piece of the text.
\r
1377 decomposeShort(s, prevBoundary, src, buffer);
\r
1384 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
\r
1385 int src=0, limit=s.length();
\r
1386 if(!buffer.isEmpty()) {
\r
1387 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
\r
1388 if(0!=firstBoundaryInSrc) {
\r
1389 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
\r
1391 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
\r
1392 firstBoundaryInSrc+16);
\r
1393 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
\r
1394 buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
\r
1395 middle.append(s, 0, firstBoundaryInSrc);
\r
1396 makeFCD(middle, 0, middle.length(), buffer);
\r
1397 src=firstBoundaryInSrc;
\r
1401 makeFCD(s, src, limit, buffer);
\r
1403 buffer.append(s, src, limit);
\r
1407 // Note: hasDecompBoundary() could be implemented as aliases to
\r
1408 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
\r
1409 // at the cost of building the FCD trie for a decomposition normalizer.
\r
1410 public boolean hasDecompBoundary(int c, boolean before) {
\r
1412 if(c<minDecompNoCP) {
\r
1415 int norm16=getNorm16(c);
\r
1416 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
\r
1418 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
\r
1419 return false; // ccc!=0
\r
1420 } else if(isDecompNoAlgorithmic(norm16)) {
\r
1421 c=mapAlgorithmic(c, norm16);
\r
1423 // c decomposes, get everything from the variable-length extra data
\r
1424 int firstUnit=extraData.charAt(norm16++);
\r
1425 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
\r
1429 // decomp after-boundary: same as hasFCDBoundaryAfter(),
\r
1430 // fcd16<=1 || trailCC==0
\r
1431 if(firstUnit>0x1ff) {
\r
1432 return false; // trailCC>1
\r
1434 if(firstUnit<=0xff) {
\r
1435 return true; // trailCC==0
\r
1437 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
\r
1439 // true if leadCC==0 (hasFCDBoundaryBefore())
\r
1440 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16)&0xff00)==0;
\r
1444 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
\r
1446 public boolean hasCompBoundaryBefore(int c) {
\r
1447 return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
\r
1449 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
\r
1451 int norm16=getNorm16(c);
\r
1452 if(isInert(norm16)) {
\r
1454 } else if(norm16<=minYesNo) {
\r
1455 // Hangul LVT (==minYesNo) has a boundary after it.
\r
1456 // Hangul LV and non-inert yesYes characters combine forward.
\r
1457 return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
\r
1458 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
\r
1460 } else if(isDecompNoAlgorithmic(norm16)) {
\r
1461 c=mapAlgorithmic(c, norm16);
\r
1463 // c decomposes, get everything from the variable-length extra data.
\r
1464 // If testInert, then c must be a yesNo character which has lccc=0,
\r
1465 // otherwise it could be a noNo.
\r
1466 int firstUnit=extraData.charAt(norm16);
\r
1468 // c is not deleted, and
\r
1469 // it and its decomposition do not combine forward, and it has a starter, and
\r
1470 // if FCC then trailCC<=1
\r
1472 (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
\r
1473 (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
\r
1474 (!onlyContiguous || firstUnit<=0x1ff);
\r
1479 public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
\r
1480 public boolean hasFCDBoundaryAfter(int c) {
\r
1481 int fcd16=getFCD16(c);
\r
1482 return fcd16<=1 || (fcd16&0xff)==0;
\r
1484 public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
\r
1486 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
\r
1487 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
\r
1488 private static boolean isInert(int norm16) { return norm16==0; }
\r
1489 // static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
\r
1490 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
\r
1491 private boolean isHangul(int norm16) { return norm16==minYesNo; }
\r
1492 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
\r
1493 // UBool isCompYes(uint16_t norm16) const {
\r
1494 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
\r
1496 // UBool isCompYesOrMaybe(uint16_t norm16) const {
\r
1497 // return norm16<minNoNo || minMaybeYes<=norm16;
\r
1499 // private boolean hasZeroCCFromDecompYes(int norm16) {
\r
1500 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
\r
1502 private boolean isDecompYesAndZeroCC(int norm16) {
\r
1503 return norm16<minYesNo ||
\r
1504 norm16==JAMO_VT ||
\r
1505 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
\r
1508 * A little faster and simpler than isDecompYesAndZeroCC() but does not include
\r
1509 * the MaybeYes which combine-forward and have ccc=0.
\r
1510 * (Standard Unicode 5.2 normalization does not have such characters.)
\r
1512 private boolean isMostDecompYesAndZeroCC(int norm16) {
\r
1513 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
\r
1515 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
\r
1517 // For use with isCompYes().
\r
1518 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
\r
1519 // static uint8_t getCCFromYes(uint16_t norm16) {
\r
1520 // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
\r
1522 private int getCCFromNoNo(int norm16) {
\r
1523 if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
\r
1524 return extraData.charAt(norm16+1)&0xff;
\r
1529 // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
\r
1530 int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
\r
1532 if(cpStart==(cpLimit-1)) {
\r
1533 c=s.charAt(cpStart);
\r
1535 c=Character.codePointAt(s, cpStart);
\r
1537 int prevNorm16=getNorm16(c);
\r
1538 if(prevNorm16<=minYesNo) {
\r
1539 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
\r
1541 return extraData.charAt(prevNorm16)>>8; // tccc from yesNo
\r
1545 // Requires algorithmic-NoNo.
\r
1546 private int mapAlgorithmic(int c, int norm16) {
\r
1547 return c+norm16-(minMaybeYes-MAX_DELTA-1);
\r
1550 // Requires minYesNo<norm16<limitNoNo.
\r
1551 // private int getMapping(int norm16) { return /*extraData+*/norm16; }
\r
1554 * @return index into maybeYesCompositions, or -1
\r
1556 private int getCompositionsListForDecompYes(int norm16) {
\r
1557 if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
\r
1560 if((norm16-=minMaybeYes)<0) {
\r
1561 // norm16<minMaybeYes: index into extraData which is a substring at
\r
1562 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
\r
1563 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
\r
1564 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list
\r
1570 * @return index into maybeYesCompositions
\r
1572 private int getCompositionsListForComposite(int norm16) {
\r
1573 // composite has both mapping & compositions list
\r
1574 int firstUnit=extraData.charAt(norm16);
\r
1575 return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions
\r
1576 1+ // +1 to skip the first unit with the mapping lenth
\r
1577 (firstUnit&MAPPING_LENGTH_MASK)+ // + mapping length
\r
1578 ((firstUnit>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
\r
1581 * @param c code point must have compositions
\r
1582 * @return index into maybeYesCompositions
\r
1584 private int getCompositionsList(int norm16) {
\r
1585 return isDecompYes(norm16) ?
\r
1586 getCompositionsListForDecompYes(norm16) :
\r
1587 getCompositionsListForComposite(norm16);
\r
1590 // Decompose a short piece of text which is likely to contain characters that
\r
1591 // fail the quick check loop and/or where the quick check loop's overhead
\r
1592 // is unlikely to be amortized.
\r
1593 // Called by the compose() and makeFCD() implementations.
\r
1594 // Public in Java for collation implementation code.
\r
1595 public void decomposeShort(CharSequence s, int src, int limit,
\r
1596 ReorderingBuffer buffer) {
\r
1597 while(src<limit) {
\r
1598 int c=Character.codePointAt(s, src);
\r
1599 src+=Character.charCount(c);
\r
1600 decompose(c, getNorm16(c), buffer);
\r
1603 private void decompose(int c, int norm16,
\r
1604 ReorderingBuffer buffer) {
\r
1605 // Only loops for 1:1 algorithmic mappings.
\r
1607 // get the decomposition and the lead and trail cc's
\r
1608 if(isDecompYes(norm16)) {
\r
1609 // c does not decompose
\r
1610 buffer.append(c, getCCFromYesOrMaybe(norm16));
\r
1611 } else if(isHangul(norm16)) {
\r
1612 // Hangul syllable: decompose algorithmically
\r
1613 Hangul.decompose(c, buffer);
\r
1614 } else if(isDecompNoAlgorithmic(norm16)) {
\r
1615 c=mapAlgorithmic(c, norm16);
\r
1616 norm16=getNorm16(c);
\r
1619 // c decomposes, get everything from the variable-length extra data
\r
1620 int firstUnit=extraData.charAt(norm16++);
\r
1621 int length=firstUnit&MAPPING_LENGTH_MASK;
\r
1622 int leadCC, trailCC;
\r
1623 trailCC=firstUnit>>8;
\r
1624 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
\r
1625 leadCC=extraData.charAt(norm16++)>>8;
\r
1629 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
\r
1636 * Finds the recomposition result for
\r
1637 * a forward-combining "lead" character,
\r
1638 * specified with a pointer to its compositions list,
\r
1639 * and a backward-combining "trail" character.
\r
1641 * If the lead and trail characters combine, then this function returns
\r
1642 * the following "compositeAndFwd" value:
\r
1643 * Bits 21..1 composite character
\r
1644 * Bit 0 set if the composite is a forward-combining starter
\r
1645 * otherwise it returns -1.
\r
1647 * The compositions list has (trail, compositeAndFwd) pair entries,
\r
1648 * encoded as either pairs or triples of 16-bit units.
\r
1649 * The last entry has the high bit of its first unit set.
\r
1651 * The list is sorted by ascending trail characters (there are no duplicates).
\r
1652 * A linear search is used.
\r
1654 * See normalizer2impl.h for a more detailed description
\r
1655 * of the compositions list format.
\r
1657 private static int combine(String compositions, int list, int trail) {
\r
1658 int key1, firstUnit;
\r
1659 if(trail<COMP_1_TRAIL_LIMIT) {
\r
1660 // trail character is 0..33FF
\r
1661 // result entry may have 2 or 3 units
\r
1663 while(key1>(firstUnit=compositions.charAt(list))) {
\r
1664 list+=2+(firstUnit&COMP_1_TRIPLE);
\r
1666 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
\r
1667 if((firstUnit&COMP_1_TRIPLE)!=0) {
\r
1668 return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
\r
1670 return compositions.charAt(list+1);
\r
1674 // trail character is 3400..10FFFF
\r
1675 // result entry has 3 units
\r
1676 key1=COMP_1_TRAIL_LIMIT+((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE;
\r
1677 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
\r
1680 if(key1>(firstUnit=compositions.charAt(list))) {
\r
1681 list+=2+(firstUnit&COMP_1_TRIPLE);
\r
1682 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
\r
1683 if(key2>(secondUnit=compositions.charAt(list+1))) {
\r
1684 if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
\r
1689 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
\r
1690 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
\r
1702 * @param c Character which has compositions
\r
1703 * @param set recursively receives the composites from c's compositions
\r
1705 private void addComposites(int list, UnicodeSet set) {
\r
1706 int firstUnit, compositeAndFwd;
\r
1708 firstUnit=maybeYesCompositions.charAt(list);
\r
1709 if((firstUnit&COMP_1_TRIPLE)==0) {
\r
1710 compositeAndFwd=maybeYesCompositions.charAt(list+1);
\r
1713 compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
\r
1714 maybeYesCompositions.charAt(list+2);
\r
1717 int composite=compositeAndFwd>>1;
\r
1718 if((compositeAndFwd&1)!=0) {
\r
1719 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
\r
1721 set.add(composite);
\r
1722 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
\r
1725 * Recomposes the buffer text starting at recomposeStartIndex
\r
1726 * (which is in NFD - decomposed and canonically ordered),
\r
1727 * and truncates the buffer contents.
\r
1729 * Note that recomposition never lengthens the text:
\r
1730 * Any character consists of either one or two code units;
\r
1731 * a composition may contain at most one more code unit than the original starter,
\r
1732 * while the combining mark that is removed has at least one code unit.
\r
1734 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
\r
1735 boolean onlyContiguous) {
\r
1736 StringBuilder sb=buffer.getStringBuilder();
\r
1737 int p=recomposeStartIndex;
\r
1738 if(p==sb.length()) {
\r
1742 int starter, pRemove;
\r
1743 int compositionsList;
\r
1744 int c, compositeAndFwd;
\r
1747 boolean starterIsSupplementary;
\r
1749 // Some of the following variables are not used until we have a forward-combining starter
\r
1750 // and are only initialized now to avoid compiler warnings.
\r
1751 compositionsList=-1; // used as indicator for whether we have a forward-combining starter
\r
1753 starterIsSupplementary=false;
\r
1757 c=sb.codePointAt(p);
\r
1758 p+=Character.charCount(c);
\r
1759 norm16=getNorm16(c);
\r
1760 cc=getCCFromYesOrMaybe(norm16);
\r
1761 if( // this character combines backward and
\r
1762 isMaybe(norm16) &&
\r
1763 // we have seen a starter that combines forward and
\r
1764 compositionsList>=0 &&
\r
1765 // the backward-combining character is not blocked
\r
1766 (prevCC<cc || prevCC==0)
\r
1768 if(isJamoVT(norm16)) {
\r
1769 // c is a Jamo V/T, see if we can compose it with the previous character.
\r
1770 if(c<Hangul.JAMO_T_BASE) {
\r
1771 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
\r
1772 char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
\r
1773 if(prev<Hangul.JAMO_L_COUNT) {
\r
1775 char syllable=(char)
\r
1776 (Hangul.HANGUL_BASE+
\r
1777 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
\r
1778 Hangul.JAMO_T_COUNT);
\r
1780 if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
\r
1782 syllable+=t; // The next character was a Jamo T.
\r
1784 sb.setCharAt(starter, syllable);
\r
1785 // remove the Jamo V/T
\r
1786 sb.delete(pRemove, p);
\r
1791 * No "else" for Jamo T:
\r
1792 * Since the input is in NFD, there are no Hangul LV syllables that
\r
1793 * a Jamo T could combine with.
\r
1794 * All Jamo Ts are combined above when handling Jamo Vs.
\r
1796 if(p==sb.length()) {
\r
1799 compositionsList=-1;
\r
1801 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
\r
1802 // The starter and the combining mark (c) do combine.
\r
1803 int composite=compositeAndFwd>>1;
\r
1805 // Remove the combining mark.
\r
1806 pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark
\r
1807 sb.delete(pRemove, p);
\r
1809 // Replace the starter with the composite.
\r
1810 if(starterIsSupplementary) {

1811 if(composite>0xffff) {

1812 // both are supplementary

1813 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));

1814 sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));

// NOTE(review): the next line (original 1816) writes the combining mark c into the
// starter slot. In the branch where a BMP composite replaces a supplementary starter,
// this looks like it should be (char)composite, not (char)c — later ICU versions write
// the composite here; confirm against the upstream fix before relying on this code.
1816 sb.setCharAt(starter, (char)c);

1817 sb.deleteCharAt(starter+1);

1818 // The composite is shorter than the starter,

1819 // move the intermediate characters forward one.

1820 starterIsSupplementary=false;
\r
1823 } else if(composite>0xffff) {
\r
1824 // The composite is longer than the starter,
\r
1825 // move the intermediate characters back one.
\r
1826 starterIsSupplementary=true;
\r
1827 sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
\r
1828 sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
\r
1831 // both are on the BMP
\r
1832 sb.setCharAt(starter, (char)composite);
\r
1835 // Keep prevCC because we removed the combining mark.
\r
1837 if(p==sb.length()) {
\r
1840 // Is the composite a starter that combines forward?
\r
1841 if((compositeAndFwd&1)!=0) {
\r
1843 getCompositionsListForComposite(getNorm16(composite));
\r
1845 compositionsList=-1;
\r
1848 // We combined; continue with looking for compositions.
\r
1853 // no combination this time
\r
1855 if(p==sb.length()) {
\r
1859 // If c did not combine, then check if it is a starter.
\r
1861 // Found a new starter.
\r
1862 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
\r
1863 // It may combine with something, prepare for it.
\r
1865 starterIsSupplementary=false;
\r
1868 starterIsSupplementary=true;
\r
1872 } else if(onlyContiguous) {
\r
1873 // FCC: no discontiguous compositions; any intervening character blocks.
\r
1874 compositionsList=-1;
\r
1881 * Does c have a composition boundary before it?
\r
1882 * True if its decomposition begins with a character that has
\r
1883 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
\r
1884 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
\r
1885 * (isCompYesAndZeroCC()) so we need not decompose.
\r
1887 private boolean hasCompBoundaryBefore(int c, int norm16) {
\r
1889 if(isCompYesAndZeroCC(norm16)) {
\r
1891 } else if(isMaybeOrNonZeroCC(norm16)) {
\r
1893 } else if(isDecompNoAlgorithmic(norm16)) {
\r
1894 c=mapAlgorithmic(c, norm16);
\r
1895 norm16=getNorm16(c);
\r
1897 // c decomposes, get everything from the variable-length extra data
\r
1898 int firstUnit=extraData.charAt(norm16++);
\r
1899 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
\r
1902 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16++)&0xff00)!=0) {
\r
1903 return false; // non-zero leadCC
\r
1905 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16)));
\r
1909 private int findPreviousCompBoundary(CharSequence s, int p) {
\r
1911 int c=Character.codePointBefore(s, p);
\r
1912 p-=Character.charCount(c);
\r
1913 if(hasCompBoundaryBefore(c)) {
\r
1916 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
\r
1917 // but that's probably not worth the extra cost.
\r
1921 private int findNextCompBoundary(CharSequence s, int p, int limit) {
\r
1923 int c=Character.codePointAt(s, p);
\r
1924 int norm16=normTrie.get(c);
\r
1925 if(hasCompBoundaryBefore(c, norm16)) {
\r
1928 p+=Character.charCount(c);
\r
1933 private int findPreviousFCDBoundary(CharSequence s, int p) {
\r
1935 int c=Character.codePointBefore(s, p);
\r
1936 p-=Character.charCount(c);
\r
1937 if(fcdTrie.get(c)<=0xff) {
\r
1943 private int findNextFCDBoundary(CharSequence s, int p, int limit) {
\r
1945 int c=Character.codePointAt(s, p);
\r
1946 int fcd16=fcdTrie.get(c);
\r
1950 p+=Character.charCount(c);
\r
1955 private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
\r
1956 int canonValue=newData.get(decompLead);
\r
1957 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
\r
1958 // origin is the first character whose decomposition starts with
\r
1959 // the character for which we are setting the value.
\r
1960 newData.set(decompLead, canonValue|origin);
\r
1962 // origin is not the first character, or it is U+0000.
\r
1964 if((canonValue&CANON_HAS_SET)==0) {
\r
1965 int firstOrigin=canonValue&CANON_VALUE_MASK;
\r
1966 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
\r
1967 newData.set(decompLead, canonValue);
\r
1968 canonStartSets.add(set=new UnicodeSet());
\r
1969 if(firstOrigin!=0) {
\r
1970 set.add(firstOrigin);
\r
1973 set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
\r
1979 @SuppressWarnings("unused")
\r
1980 private VersionInfo dataVersion;
\r
1982 // Code point thresholds for quick check codes.
\r
1983 private int minDecompNoCP;
\r
1984 private int minCompNoMaybeCP;
\r
1986 // Norm16 value thresholds for quick check combinations and types of extra data.
\r
1987 private int minYesNo;
\r
1988 private int minNoNo;
\r
1989 private int limitNoNo;
\r
1990 private int minMaybeYes;
\r
1992 private Trie2_16 normTrie;
\r
1993 private String maybeYesCompositions;
\r
1994 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
\r
1996 private Trie2_16 fcdTrie;
\r
1997 private Trie2_32 canonIterData;
\r
1998 private ArrayList<UnicodeSet> canonStartSets;
\r
2000 // bits in canonIterData
\r
2001 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
\r
2002 private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
\r
2003 private static final int CANON_HAS_SET = 0x200000;
\r
2004 private static final int CANON_VALUE_MASK = 0x1fffff;
\r