/*
*******************************************************************************
-* Copyright (C) 2009-2011, International Business Machines
+* Copyright (C) 2009-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
throw new RuntimeException(e);
}
}
+
+ /**
+ * Decomposes c, which must be a Hangul syllable, into buffer.
+ * This is the raw, not recursive, decomposition. Its length is always 2.
+ * If c has no trailing part (its index is a multiple of JAMO_T_COUNT),
+ * the output is a JAMO_L_BASE-based unit followed by a JAMO_V_BASE-based unit;
+ * otherwise the output is the LV syllable followed by a JAMO_T_BASE-based unit.
+ * @param c the Hangul syllable; this method does not validate it
+ * @param buffer receives the two decomposition code units
+ * @throws RuntimeException wrapping an IOException thrown by buffer.append()
+ */
+ public static void getRawDecomposition(int c, Appendable buffer) {
+ try {
+ int orig=c;
+ c-=HANGUL_BASE; // c is now the offset from HANGUL_BASE
+ int c2=c%JAMO_T_COUNT; // offset within the JAMO_T_COUNT-sized group
+ if(c2==0) {
+ c/=JAMO_T_COUNT; // index of the group; split below into L and V parts
+ buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
+ buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
+ } else {
+ buffer.append((char)(orig-c2)); // LV syllable
+ buffer.append((char)(JAMO_T_BASE+c2));
+ }
+ } catch(IOException e) {
+ // Will not occur because we do not write to I/O.
+ throw new RuntimeException(e);
+ }
+ }
}
/**
* is it a lead surrogate?
+ * Only bit 0x400 of c is tested; this method does not itself check
+ * that c is a surrogate.
* @param c code unit or code point
* @return true or false
- * @draft ICU 4.6
*/
public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
/**
* @param s1 first sequence
* @param s2 second sequence
* @return true if s1 contains the same text as s2
- * @draft ICU 4.6
*/
public static boolean equal(CharSequence s1, CharSequence s2) {
if(s1==s2) {
* @param limit2 limit offset in second sequence
* @return true if s1.subSequence(start1, limit1) contains the same text
* as s2.subSequence(start2, limit2)
- * @draft ICU 4.6
*/
public static boolean equal(CharSequence s1, int start1, int limit1,
CharSequence s2, int start2, int limit2) {
private static final class IsAcceptable implements ICUBinary.Authenticate {
// @Override when we switch to Java 6
public boolean isDataVersionAcceptable(byte version[]) {
- return version[0]==1;
+ return version[0]==2; // accept only version[0]==2 (presumably the formatVersion that adds smallFCD -- confirm)
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
minYesNo=inIndexes[IX_MIN_YES_NO];
+ minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
minNoNo=inIndexes[IX_MIN_NO_NO];
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
// Read the composition and mapping data.
offset=nextOffset;
- nextOffset=inIndexes[IX_RESERVED2_OFFSET];
+ nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
int numChars=(nextOffset-offset)/2;
char[] chars;
if(numChars!=0) {
maybeYesCompositions=new String(chars);
extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
}
+
+ // smallFCD: new in formatVersion 2
+ offset=nextOffset;
+ smallFCD=new byte[0x100];
+ for(int i=0; i<0x100; ++i) {
+ smallFCD[i]=ds.readByte();
+ }
+
+ // Build tccc180[].
+ // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
+ tccc180=new int[0x180];
+ int bits=0;
+ for(int c=0; c<0x180; bits>>=1) {
+ if((c&0xff)==0) {
+ bits=smallFCD[c>>8]; // one byte per 0x100 code points
+ }
+ if((bits&1)!=0) {
+ for(int i=0; i<0x20; ++i, ++c) {
+ tccc180[c]=getFCD16FromNormData(c)&0xff;
+ }
+ } else {
+ c+=0x20;
+ }
+ }
+
data.close();
return this;
} catch(IOException e) {
// low-level properties ------------------------------------------------ ***
public Trie2_16 getNormTrie() { return normTrie; }
- /**
- * Builds and returns the FCD trie based on the data used in this instance.
- * This is required before any of {@link #getFCD16(int)} or
- * {@link #getFCD16FromSingleLead(char)} are called,
- * or else they crash.
- * This method is called automatically by Normalizer2.getInstance(..., Mode.FCD).
- * @return The FCD trie for this instance's data.
- */
- public synchronized Trie2_16 getFCDTrie() {
- if(fcdTrie!=null) {
- return fcdTrie;
- }
- Trie2Writable newFCDTrie=new Trie2Writable(0, 0);
- Iterator<Trie2.Range> trieIterator=normTrie.iterator();
- Trie2.Range range;
- while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
- // Set the FCD value for a range of same-norm16 characters.
- if(range.value!=0) {
- setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie);
- }
- }
- for(char lead=0xd800; lead<0xdc00; ++lead) {
- // Collect (OR together) the FCD values for a range of supplementary characters,
- // for their lead surrogate code unit.
- int oredValue=newFCDTrie.get(lead);
- trieIterator=normTrie.iteratorForLeadSurrogate(lead);
- while(trieIterator.hasNext()) {
- oredValue|=trieIterator.next().value;
- }
- if(oredValue!=0) {
- // Set a "bad" value for makeFCD() to break the quick check loop
- // and look up the value for the supplementary code point.
- // If there is any lccc, then set the worst-case lccc of 1.
- // The ORed-together value's tccc is already the worst case.
- if(oredValue>0xff) {
- oredValue=0x100|(oredValue&0xff);
- }
- newFCDTrie.setForLeadSurrogateCodeUnit(lead, oredValue);
- }
- }
- return fcdTrie=newFCDTrie.toTrie2_16();
- }
+
+ // Note: Normalizer2Impl.java r30983 (2011-nov-27)
+ // still had getFCDTrie() which built and cached an FCD trie.
+ // That provided faster access to FCD data than getFCD16FromNormData()
+ // but required synchronization and consumed some 10kB of heap memory
+ // in any process that uses FCD (e.g., via collation).
+ // tccc180[] and smallFCD[] are intended to help with any loss of performance,
+ // at least for Latin & CJK.
/**
* Builds the canonical-iterator data for this instance.
}
if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
// c decomposes, get everything from the variable-length extra data
- int firstUnit=extraData.charAt(norm16_2++);
+ int firstUnit=extraData.charAt(norm16_2);
int length=firstUnit&MAPPING_LENGTH_MASK;
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
- if(c==c2 && (extraData.charAt(norm16_2)&0xff)!=0) {
+ if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) {
newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
}
- ++norm16_2;
}
// Skip empty mappings (no characters in the decomposition).
if(length!=0) {
+ ++norm16_2; // skip over the firstUnit
// add c to first code point's start set
int limit=norm16_2+length;
c2=extraData.codePointAt(norm16_2);
/**
* Returns the FCD data for code point c.
+ * Returns 0 for negative c; values below U+0180 are read from the tccc180[] table.
- * <b>{@link #getFCDTrie()} must have been called before this method,
- * or else this method will crash.</b>
- * @param c A Unicode code point.
- * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
- */
- public int getFCD16(int c) { return fcdTrie.get(c); }
- /**
- * Returns the FCD data for the single-or-lead code unit c.
- * <b>{@link #getFCDTrie()} must have been called before this method,
- * or else this method will crash.</b>
* @param c A Unicode code point.
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
*/
- public int getFCD16FromSingleLead(char c) { return fcdTrie.getFromU16SingleLead(c); }
+ public int getFCD16(int c) {
+ if(c<0) {
+ return 0; // negative input: report all-zero FCD data
+ } else if(c<0x180) {
+ return tccc180[c]; // direct table lookup for U+0000..U+017F
+ } else if(c<=0xffff) {
+ if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } // smallFCD bit is clear: skip the norm data lookup
+ }
+ return getFCD16FromNormData(c); // supplementary c, or BMP c whose smallFCD bit is set
+ }
+ /** Returns the FCD data for U+0000<=c<U+0180. There is no range check: c indexes tccc180[] directly. */
+ public int getFCD16FromBelow180(int c) { return tccc180[c]; }
+ /**
+ * Returns true if the single-or-lead code unit lead might have non-zero FCD data.
+ * Looks up one bit in smallFCD[]: one byte per 0x100 code points,
+ * one bit per 32 code points within that byte.
+ * @param lead a code unit; the range 0..0xffff is assumed, not checked
+ * @return false if the smallFCD bit for lead is clear, true otherwise
+ */
+ public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
+ // 0<=lead<=0xffff
+ byte bits=smallFCD[lead>>8]; // the byte covering lead's block of 0x100 code points
+ if(bits==0) { return false; } // no bit is set for this whole block
+ return ((bits>>((lead>>5)&7))&1)!=0; // bit index = which group of 32 within the block
+ }
- private void setFCD16FromNorm16(int start, int end, int norm16, Trie2Writable newFCDTrie) {
+ /** Gets the FCD value from the regular normalization data. */
+ public int getFCD16FromNormData(int c) {
// Only loops for 1:1 algorithmic mappings.
for(;;) {
- if(norm16>=MIN_NORMAL_MAYBE_YES) {
- norm16&=0xff;
- norm16|=norm16<<8;
- } else if(norm16<=minYesNo || minMaybeYes<=norm16) {
+ int norm16=getNorm16(c);
+ if(norm16<=minYesNo) {
// no decomposition or Hangul syllable, all zeros
- break;
- } else if(limitNoNo<=norm16) {
- int delta=norm16-(minMaybeYes-MAX_DELTA-1);
- if(start==end) {
- start+=delta;
- norm16=getNorm16(start);
- } else {
- // the same delta leads from different original characters to different mappings
- do {
- int c=start+delta;
- setFCD16FromNorm16(c, c, getNorm16(c), newFCDTrie);
- } while(++start<=end);
- break;
- }
+ return 0;
+ } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
+ // combining mark
+ norm16&=0xff;
+ return norm16|(norm16<<8);
+ } else if(norm16>=minMaybeYes) {
+ return 0;
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ c=mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data
int firstUnit=extraData.charAt(norm16);
// A character that is deleted (maps to an empty string) must
// get the worst-case lccc and tccc values because arbitrary
// characters on both sides will become adjacent.
- norm16=0x1ff;
+ return 0x1ff;
} else {
+ int fcd16=firstUnit>>8; // tccc
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
- norm16=extraData.charAt(norm16+1)&0xff00; // lccc
- } else {
- norm16=0;
+ fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc
}
- norm16|=firstUnit>>8; // tccc
+ return fcd16;
}
}
- newFCDTrie.setRange(start, end, norm16, true);
- break;
}
}
/**
- * Get the decomposition for one code point.
+ * Gets the decomposition for one code point.
* @param c code point
* @return c's decomposition, if it has one; returns null if it does not have a decomposition
*/
continue;
} else {
// c decomposes, get everything from the variable-length extra data
- int firstUnit=extraData.charAt(norm16++);
- int length=firstUnit&MAPPING_LENGTH_MASK;
- if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
- ++norm16;
- }
+ int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
return extraData.substring(norm16, norm16+length);
}
if(decomp<0) {
}
}
+ /**
+ * Gets the raw decomposition for one code point.
+ * Hangul syllables and algorithmic mappings are computed;
+ * all other mappings are read from the variable-length extraData.
+ * When the mapping's first unit has MAPPING_HAS_RAW_MAPPING set, the raw mapping
+ * is read from the units stored before the first unit (and before the optional
+ * ccc/lccc word); otherwise the normal mapping is returned.
+ * @param c code point
+ * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
+ */
+ public String getRawDecomposition(int c) {
+ // We do not loop in this method because an algorithmic mapping itself
+ // becomes a final result rather than having to be decomposed recursively.
+ int norm16;
+ if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
+ // c does not decompose
+ return null;
+ } else if(isHangul(norm16)) {
+ // Hangul syllable: decompose algorithmically
+ StringBuilder buffer=new StringBuilder();
+ Hangul.getRawDecomposition(c, buffer);
+ return buffer.toString();
+ } else if(isDecompNoAlgorithmic(norm16)) {
+ return UTF16.valueOf(mapAlgorithmic(c, norm16)); // the single mapped code point as a String
+ } else {
+ // c decomposes, get everything from the variable-length extra data
+ int firstUnit=extraData.charAt(norm16);
+ int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
+ if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
+ // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
+ // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
+ int rawMapping=norm16-((firstUnit>>7)&1)-1; // step back over the ccc/lccc word if present, then one more unit
+ char rm0=extraData.charAt(rawMapping);
+ if(rm0<=MAPPING_LENGTH_MASK) { // rm0 is a length: the raw mapping is the rm0 units just before it
+ return extraData.substring(rawMapping-rm0, rawMapping);
+ } else {
+ // Copy the normal mapping and replace its first two code units with rm0.
+ StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
+ norm16+=1+2; // skip over the firstUnit and the first two mapping code units
+ return buffer.append(extraData, norm16, norm16+mLength-2).toString();
+ }
+ } else {
+ norm16+=1; // skip over the firstUnit
+ return extraData.substring(norm16, norm16+mLength);
+ }
+ }
+ }
+
/**
* Returns true if code point c starts a canonical-iterator string segment.
* <b>{@link #ensureCanonIterData()} must have been called before this method,
// Byte offsets from the start of the data, after the generic header.
public static final int IX_NORM_TRIE_OFFSET=0;
public static final int IX_EXTRA_DATA_OFFSET=1;
- public static final int IX_RESERVED2_OFFSET=2;
+ public static final int IX_SMALL_FCD_OFFSET=2;
+ public static final int IX_RESERVED3_OFFSET=3;
public static final int IX_TOTAL_SIZE=7;
// Code point thresholds for quick check codes.
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data.
+ // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
public static final int IX_MIN_YES_NO=10;
public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13;
+ // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+ public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
+
public static final int IX_COUNT=16;
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
- public static final int MAPPING_PLUS_COMPOSITION_LIST=0x40;
+ public static final int MAPPING_HAS_RAW_MAPPING=0x40;
public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
public static final int MAPPING_LENGTH_MASK=0x1f;
if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
prevFCD16=~c;
++src;
- } else if((fcd16=fcdTrie.getFromU16SingleLead((char)c))<=0xff) {
- prevFCD16=fcd16;
+ } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
+ prevFCD16=0;
++src;
- } else if(!UTF16.isSurrogate((char)c)) {
- break;
} else {
- char c2;
- if(UTF16Plus.isSurrogateLead(c)) {
- if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
- c=Character.toCodePoint((char)c, c2);
- }
- } else /* trail surrogate */ {
- if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
- --src;
- c=Character.toCodePoint(c2, (char)c);
+ if(UTF16.isSurrogate((char)c)) {
+ char c2;
+ if(UTF16Plus.isSurrogateLead(c)) {
+ if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
+ c=Character.toCodePoint((char)c, c2);
+ }
+ } else /* trail surrogate */ {
+ if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
+ --src;
+ c=Character.toCodePoint(c2, (char)c);
+ }
}
}
- if((fcd16=getFCD16(c))<=0xff) {
+ if((fcd16=getFCD16FromNormData(c))<=0xff) {
prevFCD16=fcd16;
src+=Character.charCount(c);
} else {
// We know that the previous character's lccc==0.
if(prevFCD16<0) {
// Fetching the fcd16 value was deferred for this below-U+0300 code point.
- prevFCD16=getFCD16FromSingleLead((char)~prevFCD16);
+ int prev=~prevFCD16;
+ prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
if(prevFCD16>1) {
--prevBoundary;
}
--p;
// Need to fetch the previous character's FCD value because
// prevFCD16 was just for the trail surrogate code point.
- prevFCD16=getFCD16(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
+ prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
// Still known to have lccc==0 because its lead surrogate unit had lccc==0.
}
if(prevFCD16>1) {
c=mapAlgorithmic(c, norm16);
} else {
// c decomposes, get everything from the variable-length extra data
- int firstUnit=extraData.charAt(norm16++);
+ int firstUnit=extraData.charAt(norm16);
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
return false;
}
// if(trailCC==1) test leadCC==0, same as checking for before-boundary
}
// true if leadCC==0 (hasFCDBoundaryBefore())
- return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16)&0xff00)==0;
+ return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
}
}
}
if(isInert(norm16)) {
return true;
} else if(norm16<=minYesNo) {
- // Hangul LVT (==minYesNo) has a boundary after it.
+ // Hangul: norm16==minYesNo
+ // Hangul LVT has a boundary after it.
// Hangul LV and non-inert yesYes characters combine forward.
return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
} else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
// otherwise it could be a noNo.
int firstUnit=extraData.charAt(norm16);
// true if
- // c is not deleted, and
- // it and its decomposition do not combine forward, and it has a starter, and
- // if FCC then trailCC<=1
+ // not MAPPING_NO_COMP_BOUNDARY_AFTER
+ // (which is set if
+ // c is not deleted, and
+ // it and its decomposition do not combine forward, and it has a starter)
+ // and if FCC then trailCC<=1
return
- (firstUnit&MAPPING_LENGTH_MASK)!=0 &&
- (firstUnit&(MAPPING_PLUS_COMPOSITION_LIST|MAPPING_NO_COMP_BOUNDARY_AFTER))==0 &&
+ (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
(!onlyContiguous || firstUnit<=0x1ff);
}
}
private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
private static boolean isInert(int norm16) { return norm16==0; }
- // static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
+ private static boolean isJamoL(int norm16) { return norm16==1; }
private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
private boolean isHangul(int norm16) { return norm16==minYesNo; }
private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
// }
private int getCCFromNoNo(int norm16) {
if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
- return extraData.charAt(norm16+1)&0xff;
+ return extraData.charAt(norm16-1)&0xff;
} else {
return 0;
}
int firstUnit=extraData.charAt(norm16);
return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions
1+ // +1 to skip the first unit with the mapping lenth
- (firstUnit&MAPPING_LENGTH_MASK)+ // + mapping length
- ((firstUnit>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
+ (firstUnit&MAPPING_LENGTH_MASK); // + mapping length
}
/**
* @param c code point must have compositions
continue;
} else {
// c decomposes, get everything from the variable-length extra data
- int firstUnit=extraData.charAt(norm16++);
+ int firstUnit=extraData.charAt(norm16);
int length=firstUnit&MAPPING_LENGTH_MASK;
int leadCC, trailCC;
trailCC=firstUnit>>8;
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
- leadCC=extraData.charAt(norm16++)>>8;
+ leadCC=extraData.charAt(norm16-1)>>8;
} else {
leadCC=0;
}
+ ++norm16; // skip over the firstUnit
buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
}
return;
}
}
- /*
+ /**
* Finds the recomposition result for
* a forward-combining "lead" character,
* specified with a pointer to its compositions list,
* and a backward-combining "trail" character.
*
- * If the lead and trail characters combine, then this function returns
+ * <p>If the lead and trail characters combine, then this function returns
* the following "compositeAndFwd" value:
+ * <pre>
* Bits 21..1 composite character
* Bit 0 set if the composite is a forward-combining starter
+ * </pre>
* otherwise it returns -1.
*
- * The compositions list has (trail, compositeAndFwd) pair entries,
+ * <p>The compositions list has (trail, compositeAndFwd) pair entries,
* encoded as either pairs or triples of 16-bit units.
* The last entry has the high bit of its first unit set.
*
- * The list is sorted by ascending trail characters (there are no duplicates).
+ * <p>The list is sorted by ascending trail characters (there are no duplicates).
* A linear search is used.
*
- * See normalizer2impl.h for a more detailed description
+ * <p>See normalizer2impl.h for a more detailed description
* of the compositions list format.
*/
private static int combine(String compositions, int list, int trail) {
buffer.flush();
}
+ public int composePair(int a, int b) { // returns the composite of a followed by b, or -1 if they do not combine
+ int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
+ int list;
+ if(isInert(norm16)) {
+ return -1;
+ } else if(norm16<minYesNoMappingsOnly) {
+ if(isJamoL(norm16)) {
+ b-=Hangul.JAMO_V_BASE; // b is now an offset from JAMO_V_BASE
+ if(0<=b && b<Hangul.JAMO_V_COUNT) {
+ return
+ (Hangul.HANGUL_BASE+
+ ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
+ Hangul.JAMO_T_COUNT);
+ } else {
+ return -1;
+ }
+ } else if(isHangul(norm16)) {
+ b-=Hangul.JAMO_T_BASE; // b is now an offset from JAMO_T_BASE
+ if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) { // not b==0!
+ return a+b;
+ } else {
+ return -1;
+ }
+ } else {
+ // 'a' has a compositions list in extraData
+ list=norm16;
+ if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
+ list+= // mapping pointer
+ 1+ // +1 to skip the first unit with the mapping length
+ (extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length
+ }
+ // Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
+ list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
+ }
+ } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
+ return -1;
+ } else {
+ list=norm16-minMaybeYes; // offset into maybeYesCompositions
+ }
+ if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
+ return -1;
+ }
+ return combine(maybeYesCompositions, list, b)>>1; // drop bit 0 (forward-combining flag); -1 stays -1
+ }
+
/**
* Does c have a composition boundary before it?
* True if its decomposition begins with a character that has
norm16=getNorm16(c);
} else {
// c decomposes, get everything from the variable-length extra data
- int firstUnit=extraData.charAt(norm16++);
+ int firstUnit=extraData.charAt(norm16);
if((firstUnit&MAPPING_LENGTH_MASK)==0) {
return false;
}
- if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16++)&0xff00)!=0) {
+ if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
return false; // non-zero leadCC
}
- return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16)));
+ return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
}
}
}
while(p>0) {
int c=Character.codePointBefore(s, p);
p-=Character.charCount(c);
- if(fcdTrie.get(c)<=0xff) {
+ if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
break;
}
}
private int findNextFCDBoundary(CharSequence s, int p, int limit) {
while(p<limit) {
int c=Character.codePointAt(s, p);
- int fcd16=fcdTrie.get(c);
- if(fcd16<=0xff) {
+ if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
break;
}
p+=Character.charCount(c);
// Norm16 value thresholds for quick check combinations and types of extra data.
private int minYesNo;
+ private int minYesNoMappingsOnly;
private int minNoNo;
private int limitNoNo;
private int minMaybeYes;
private Trie2_16 normTrie;
private String maybeYesCompositions;
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
+ private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
+ private int[] tccc180; // [0x180] tccc values for U+0000..U+017F
- private Trie2_16 fcdTrie;
private Trie2_32 canonIterData;
private ArrayList<UnicodeSet> canonStartSets;