2 *******************************************************************************
4 * Copyright (C) 2004-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: UCaseProps.java
10 * tab size: 8 (not used)
13 * created on: 2005jan29
14 * created by: Markus W. Scherer
16 * Low-level Unicode character/string case mapping code.
17 * Java port of ucase.h/.c.
20 package com.ibm.icu.impl;
22 import java.io.BufferedInputStream;
23 import java.io.DataInputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.util.Iterator;
28 import com.ibm.icu.lang.UCharacter;
29 import com.ibm.icu.lang.UProperty;
30 import com.ibm.icu.text.UTF16;
31 import com.ibm.icu.text.UnicodeSet;
32 import com.ibm.icu.util.ULocale;
34 public final class UCaseProps {
36 // constructors etc. --------------------------------------------------- ***
38 // port of ucase_openProps()
39 private UCaseProps() throws IOException {
40 InputStream is=ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/"+DATA_FILE_NAME);
41 BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
47 private final void readData(InputStream is) throws IOException {
48 DataInputStream inputStream=new DataInputStream(is);
51 ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());
55 count=inputStream.readInt();
57 throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
59 indexes=new int[count];
62 for(i=1; i<count; ++i) {
63 indexes[i]=inputStream.readInt();
67 trie=Trie2_16.createFromSerialized(inputStream);
68 int expectedTrieLength=indexes[IX_TRIE_SIZE];
69 int trieLength=trie.getSerializedLength();
70 if(trieLength>expectedTrieLength) {
71 throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
73 // skip padding after trie bytes
74 inputStream.skipBytes(expectedTrieLength-trieLength);
77 count=indexes[IX_EXC_LENGTH];
79 exceptions=new char[count];
80 for(i=0; i<count; ++i) {
81 exceptions[i]=inputStream.readChar();
86 count=indexes[IX_UNFOLD_LENGTH];
88 unfold=new char[count];
89 for(i=0; i<count; ++i) {
90 unfold[i]=inputStream.readChar();
95 // implement ICUBinary.Authenticate
96 private final static class IsAcceptable implements ICUBinary.Authenticate {
97 // @Override when we switch to Java 6
98 public boolean isDataVersionAcceptable(byte version[]) {
103 // set of property starts for UnicodeSet ------------------------------- ***
105 public final void addPropertyStarts(UnicodeSet set) {
106 /* add the start code point of each same-value range of the trie */
107 Iterator<Trie2.Range> trieIterator=trie.iterator();
109 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
110 set.add(range.startCodePoint);
113 /* add code points with hardcoded properties, plus the ones following them */
115 /* (none right now, see comment below) */
118 * Omit code points with hardcoded specialcasing properties
119 * because we do not build property UnicodeSets for them right now.
123 // data access primitives ---------------------------------------------- ***
/* Returns props shifted right by EXC_SHIFT. Callers in this file use the result as the index of the exceptions word in exceptions[]. */
124 private static final int getExceptionsOffset(int props) {
125 return props>>EXC_SHIFT;
/* Returns true when the EXCEPTION bit is set in props. Callers check this before calling getExceptionsOffset(props). */
128 private static final boolean propsHasException(int props) {
129 return (props&EXCEPTION)!=0;
132 /* number of bits in an 8-bit integer value */
133 private static final byte flagsOffset[/*256*/]={
134 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
135 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
136 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
137 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
138 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
139 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
140 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
141 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
142 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
143 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
144 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
145 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
146 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
147 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
148 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
149 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
/* Returns true when bit number index is set in flags. Callers pass an exceptions word and one of the EXC_* slot indexes. */
152 private static final boolean hasSlot(int flags, int index) {
153 return (flags&(1<<index))!=0;
/*
 * Returns the number of bits set in flags below bit position index,
 * looked up in the flagsOffset[] bit-count table.
 * The mask (1<<index)-1 keeps only the bits lower than index; it must stay
 * below 256 (index<=8) to remain inside the 256-entry table.
 */
155 private static final byte slotOffset(int flags, int index) {
156 return flagsOffset[flags&((1<<index)-1)];
160 * Get the value of an optional-value slot where hasSlot(excWord, index).
162 * @param excWord (in) initial exceptions word
163 * @param index (in) desired slot index
164 * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];
165 * @return bits 31..0: slot value
166 * 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
168 private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {
170 if((excWord&EXC_DOUBLE_SLOTS)==0) {
171 excOffset+=slotOffset(excWord, index);
172 value=exceptions[excOffset];
174 excOffset+=2*slotOffset(excWord, index);
175 value=exceptions[excOffset++];
176 value=(value<<16)|exceptions[excOffset];
178 return value |((long)excOffset<<32);
181 /* same as getSlotValueAndOffset() but does not return the slot offset */
182 private final int getSlotValue(int excWord, int index, int excOffset) {
184 if((excWord&EXC_DOUBLE_SLOTS)==0) {
185 excOffset+=slotOffset(excWord, index);
186 value=exceptions[excOffset];
188 excOffset+=2*slotOffset(excWord, index);
189 value=exceptions[excOffset++];
190 value=(value<<16)|exceptions[excOffset];
195 // simple case mappings ------------------------------------------------ ***
197 public final int tolower(int c) {
198 int props=trie.get(c);
199 if(!propsHasException(props)) {
200 if(getTypeFromProps(props)>=UPPER) {
204 int excOffset=getExceptionsOffset(props);
205 int excWord=exceptions[excOffset++];
206 if(hasSlot(excWord, EXC_LOWER)) {
207 c=getSlotValue(excWord, EXC_LOWER, excOffset);
213 public final int toupper(int c) {
214 int props=trie.get(c);
215 if(!propsHasException(props)) {
216 if(getTypeFromProps(props)==LOWER) {
220 int excOffset=getExceptionsOffset(props);
221 int excWord=exceptions[excOffset++];
222 if(hasSlot(excWord, EXC_UPPER)) {
223 c=getSlotValue(excWord, EXC_UPPER, excOffset);
229 public final int totitle(int c) {
230 int props=trie.get(c);
231 if(!propsHasException(props)) {
232 if(getTypeFromProps(props)==LOWER) {
236 int excOffset=getExceptionsOffset(props);
237 int excWord=exceptions[excOffset++];
239 if(hasSlot(excWord, EXC_TITLE)) {
241 } else if(hasSlot(excWord, EXC_UPPER)) {
246 c=getSlotValue(excWord, index, excOffset);
252 * Adds all simple case mappings and the full case folding for c to sa,
253 * and also adds special case closure mappings.
254 * c itself is not added.
255 * For example, the mappings
256 * - for s include long s
257 * - for sharp s include ss
258 * - for k include the Kelvin sign
260 public final void addCaseClosure(int c, UnicodeSet set) {
262 * Hardcode the case closure of i and its relatives and ignore the
263 * data file data for these characters.
264 * The Turkic dotless i and dotted I with their case mapping conditions
265 * and case folding option make the related characters behave specially.
266 * This code matches their closure behavior to their case folding behavior.
271 /* regular i and I are in one equivalence class */
278 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
282 /* dotless i is in a class by itself */
285 /* otherwise use the data file data */
289 int props=trie.get(c);
290 if(!propsHasException(props)) {
291 if(getTypeFromProps(props)!=NONE) {
292 /* add the one simple case mapping, no matter what type it is */
293 int delta=getDelta(props);
300 * c has exceptions, so there may be multiple simple and/or
301 * full case mappings. Add them all.
303 int excOffset0, excOffset=getExceptionsOffset(props);
305 int excWord=exceptions[excOffset++];
306 int index, closureLength, fullLength, length;
308 excOffset0=excOffset;
310 /* add all simple case mappings */
311 for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
312 if(hasSlot(excWord, index)) {
313 excOffset=excOffset0;
314 c=getSlotValue(excWord, index, excOffset);
319 /* get the closure string pointer & length */
320 if(hasSlot(excWord, EXC_CLOSURE)) {
321 excOffset=excOffset0;
322 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
323 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
324 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
330 /* add the full case folding */
331 if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
332 excOffset=excOffset0;
333 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
334 fullLength=(int)value;
336 /* start of full case mapping strings */
337 excOffset=(int)(value>>32)+1;
339 fullLength&=0xffff; /* bits 16 and higher are reserved */
341 /* skip the lowercase result string */
342 excOffset+=fullLength&FULL_LOWER;
345 /* add the full case folding string */
346 length=fullLength&0xf;
348 set.add(new String(exceptions, excOffset, length));
352 /* skip the uppercase and titlecase strings */
354 excOffset+=fullLength&0xf;
356 excOffset+=fullLength;
358 closureOffset=excOffset; /* behind full case mappings */
361 /* add each code point in the closure string */
362 for(index=0; index<closureLength; index+=UTF16.getCharCount(c)) {
363 c=UTF16.charAt(exceptions, closureOffset, exceptions.length, index);
370 * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
371 * must be s.length()>0 and max>0 and s.length()<=max
373 private final int strcmpMax(String s, int unfoldOffset, int max) {
374 int i1, length, c1, c2;
377 max-=length; /* we require length<=max, so no need to decrement max in the loop */
381 c2=unfold[unfoldOffset++];
383 return 1; /* reached the end of t but not of s */
387 return c1; /* return difference result */
390 /* ends with length==0 */
392 if(max==0 || unfold[unfoldOffset]==0) {
393 return 0; /* equal to length of both strings */
395 return -max; /* return lengh difference */
400 * Maps the string to single code points and adds the associated case closure
402 * The string is mapped to code points if it is their full case folding string.
403 * In other words, this performs a reverse full case folding and then
404 * adds the case closure items of the resulting code points.
405 * If the string is found and its closure applied, then
406 * the string itself is added as well as part of its code points' closure.
408 * @return true if the string was found
410 public final boolean addStringCaseClosure(String s, UnicodeSet set) {
411 int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
413 if(unfold==null || s==null) {
414 return false; /* no reverse case folding data, or no string */
418 /* the string is too short to find any match */
420 * more precise would be:
421 * if(!u_strHasMoreChar32Than(s, length, 1))
422 * but this does not make much practical difference because
423 * a single supplementary code point would just not be found
428 unfoldRows=unfold[UNFOLD_ROWS];
429 unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];
430 unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];
431 //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
433 if(length>unfoldStringWidth) {
434 /* the string is too long to find any match */
438 /* do a binary search for the string */
443 unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above
444 result=strcmpMax(s, unfoldOffset, unfoldStringWidth);
447 /* found the string: add each code point, and its case closure */
450 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {
451 c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);
453 addCaseClosure(c, set);
456 } else if(result<0) {
458 } else /* result>0 */ {
463 return false; /* string not found */
466 /** Looks up the trie value for c and extracts its type with getTypeFromProps(). @return NONE, LOWER, UPPER, TITLE */
467 public final int getType(int c) {
468 return getTypeFromProps(trie.get(c));
471 /** Looks up the trie value for c and passes it to getTypeAndIgnorableFromProps(). @return same as ucase_getType() and set bit 2 if c is case-ignorable */
472 public final int getTypeOrIgnorable(int c) {
473 return getTypeAndIgnorableFromProps(trie.get(c));
476 /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
477 public final int getDotType(int c) {
478 int props=trie.get(c);
479 if(!propsHasException(props)) {
480 return props&DOT_MASK;
482 return (exceptions[getExceptionsOffset(props)]>>EXC_DOT_SHIFT)&DOT_MASK;
/* Returns true when getDotType(c) is SOFT_DOTTED. */
486 public final boolean isSoftDotted(int c) {
487 return getDotType(c)==SOFT_DOTTED;
/* Returns true when the SENSITIVE bit is set in the trie value for c. */
490 public final boolean isCaseSensitive(int c) {
491 return (trie.get(c)&SENSITIVE)!=0;
494 // string casing ------------------------------------------------------- ***
497 * These internal functions form the core of string case mappings.
498 * They map single code points to result code points or strings and take
499 * all necessary conditions (context, locale ID, options) into account.
501 * They do not iterate over the source or write to the destination
502 * so that the same functions are useful for non-standard string storage,
503 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
504 * For the same reason, the "surrounding text" context is passed in as a
505 * ContextIterator which does not make any assumptions about
506 * the underlying storage.
508 * This section contains helper functions that check for conditions
509 * in the input text surrounding the current code point
510 * according to SpecialCasing.txt.
512 * Each helper function gets the index
513 * - after the current code point if it looks at following text
514 * - before the current code point if it looks at preceding text
516 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
519 * C is preceded by a sequence consisting of
520 * a cased letter and a case-ignorable sequence,
521 * and C is not followed by a sequence consisting of
522 * an ignorable sequence and then a cased letter.
525 * C is followed by one or more characters of combining class 230 (ABOVE)
526 * in the combining character sequence.
529 * The last preceding character with combining class of zero before C
531 * and there is no intervening combining character class 230 (ABOVE).
534 * C is followed by combining dot above (U+0307).
535 * Any sequence of characters with a combining class that is neither 0 nor 230
536 * may intervene between the current character and the combining dot above.
538 * The erratum from 2002-10-31 adds the condition
541 * The last preceding base character was an uppercase I, and there is no
542 * intervening combining character class 230 (ABOVE).
544 * (See Jitterbug 2344 and the comments on After_I below.)
546 * Helper definitions in Unicode 3.2 UAX 21:
548 * D1. A character C is defined to be cased
549 * if it meets any of the following criteria:
551 * - The general category of C is Titlecase Letter (Lt)
552 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
553 * - Given D = NFD(C), then it is not the case that:
554 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
555 * (This third criterion does not add any characters to the list
556 * for Unicode 3.2. Ignored.)
558 * D2. A character C is defined to be case-ignorable
559 * if it meets either of the following criteria:
561 * - The general category of C is
562 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
563 * Letter Modifier (Lm), or Symbol Modifier (Sk)
564 * - C is one of the following characters
566 * U+00AD SOFT HYPHEN (SHY)
567 * U+2019 RIGHT SINGLE QUOTATION MARK
568 * (the preferred character for apostrophe)
570 * D3. A case-ignorable sequence is a sequence of
571 * zero or more case-ignorable characters.
575 * Iterator for string case mappings, which need to look at the
576 * context (surrounding text) of a given character for conditional mappings.
578 * The iterator only needs to go backward or forward away from the
579 * character in question. It does not use any indexes on this interface.
580 * It does not support random access or an arbitrary change of
581 * iteration direction.
583 * The code point being case-mapped itself is never returned by
586 public interface ContextIterator {
588 * Reset the iterator for forward or backward iteration.
589 * @param dir >0: Begin iterating forward from the first code point
590 * after the one that is being case-mapped.
591 * <0: Begin iterating backward from the first code point
592 * before the one that is being case-mapped.
594 public void reset(int dir);
596 * Iterate and return the next code point, moving in the direction
597 * determined by the reset() call.
598 * @return Next code point, or <0 when the iteration is done.
604 * For string case mappings, a single character (a code point) is mapped
605 * either to itself (in which case in-place mapping functions do nothing),
606 * or to another single code point, or to a string.
607 * Aside from the string contents, these are indicated with a single int
610 * Mapping to self: Negative values (~self instead of -self to support U+0000)
612 * Mapping to another code point: Positive values >MAX_STRING_LENGTH
614 * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
615 * returned. Note that the string result may indeed have zero length.
617 public static final int MAX_STRING_LENGTH=0x1f;
619 private static final int LOC_UNKNOWN=0;
620 private static final int LOC_ROOT=1;
621 private static final int LOC_TURKISH=2;
622 private static final int LOC_LITHUANIAN=3;
625 * Checks and caches the type of locale ID as it is relevant for case mapping.
626 * If the locCache is not null, then it must be initialized with locCache[0]=0.
628 private static final int getCaseLocale(ULocale locale, int[] locCache) {
631 if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) {
637 String language=locale.getLanguage();
638 if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) {
640 } else if(language.equals("lt") || language.equals("lit")) {
641 result=LOC_LITHUANIAN;
650 /* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
651 private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {
658 for(iter.reset(dir); (c=iter.next())>=0;) {
659 int type=getTypeOrIgnorable(c);
661 /* case-ignorable, continue with the loop */
662 } else if(type!=NONE) {
663 return true; /* followed by cased letter */
665 return false; /* uncased and not case-ignorable */
669 return false; /* not followed by cased letter */
672 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
673 private final boolean isPrecededBySoftDotted(ContextIterator iter) {
681 for(iter.reset(-1); (c=iter.next())>=0;) {
682 dotType=getDotType(c);
683 if(dotType==SOFT_DOTTED) {
684 return true; /* preceded by TYPE_i */
685 } else if(dotType!=OTHER_ACCENT) {
686 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
690 return false; /* not preceded by TYPE_i */
694 * See Jitterbug 2344:
695 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
696 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
697 * we made those releases compatible with Unicode 3.2 which had not fixed
698 * a related bug in SpecialCasing.txt.
700 * From the Jitterbug 2344 text:
701 * ... this bug is listed as a Unicode erratum
702 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
704 * There are two errors in SpecialCasing.txt.
705 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
706 * 2. An incorrect context definition. Correct as follows:
707 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
708 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
710 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
711 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
712 * where the context After_I is defined as:
713 * The last preceding base character was an uppercase I, and there is no
714 * intervening combining character class 230 (ABOVE).
717 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
719 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
720 * # This matches the behavior of the canonically equivalent I-dot_above
722 * See also the description in this place in older versions of uchar.c (revision 1.100).
724 * Markus W. Scherer 2003-feb-15
727 /* Is preceded by base character 'I' with no intervening cc=230 ? */
728 private final boolean isPrecededBy_I(ContextIterator iter) {
736 for(iter.reset(-1); (c=iter.next())>=0;) {
738 return true; /* preceded by I */
740 dotType=getDotType(c);
741 if(dotType!=OTHER_ACCENT) {
742 return false; /* preceded by different base character (not I), or intervening cc==230 */
746 return false; /* not preceded by I */
749 /* Is followed by one or more cc==230 ? */
750 private final boolean isFollowedByMoreAbove(ContextIterator iter) {
758 for(iter.reset(1); (c=iter.next())>=0;) {
759 dotType=getDotType(c);
761 return true; /* at least one cc==230 following */
762 } else if(dotType!=OTHER_ACCENT) {
763 return false; /* next base character, no more cc==230 following */
767 return false; /* no more cc==230 following */
770 /* Is followed by a dot above (without cc==230 in between) ? */
771 private final boolean isFollowedByDotAbove(ContextIterator iter) {
779 for(iter.reset(1); (c=iter.next())>=0; ) {
783 dotType=getDotType(c);
784 if(dotType!=OTHER_ACCENT) {
785 return false; /* next base character or cc==230 in between */
789 return false; /* no dot above following */
792 private static final String
795 iOgonekDot= "\u012f\u0307",
796 iDotGrave= "i\u0307\u0300",
797 iDotAcute= "i\u0307\u0301",
798 iDotTilde= "i\u0307\u0303";
801 * Get the full lowercase mapping for c.
803 * @param c Character to be mapped.
804 * @param iter Character iterator, used for context-sensitive mappings.
805 * See ContextIterator for details.
806 * If iter==null then a context-independent result is returned.
807 * @param out If the mapping result is a string, then it is appended to out.
808 * @param locale Locale ID for locale-dependent mappings.
809 * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing
810 * the locale ID for subsequent calls.
812 * @return Output code point or string length, see MAX_STRING_LENGTH.
814 * @see ContextIterator
815 * @see #MAX_STRING_LENGTH
818 public final int toFullLower(int c, ContextIterator iter,
820 ULocale locale, int[] locCache) {
825 if(!propsHasException(props)) {
826 if(getTypeFromProps(props)>=UPPER) {
827 result=c+getDelta(props);
830 int excOffset=getExceptionsOffset(props), excOffset2;
831 int excWord=exceptions[excOffset++];
834 excOffset2=excOffset;
836 if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
837 /* use hardcoded conditions and mappings */
838 int loc=getCaseLocale(locale, locCache);
841 * Test for conditional mappings first
842 * (otherwise the unconditional default mappings are always taken),
843 * then test for characters that have unconditional mappings in SpecialCasing.txt,
844 * then get the UnicodeData.txt mappings.
846 if( loc==LOC_LITHUANIAN &&
847 /* base characters, find accents above */
848 (((c==0x49 || c==0x4a || c==0x12e) &&
849 isFollowedByMoreAbove(iter)) ||
850 /* precomposed with accent above, no need to find one */
851 (c==0xcc || c==0xcd || c==0x128))
856 # Lithuanian retains the dot in a lowercase i when followed by accents.
858 # Introduce an explicit dot above when lowercasing capital I's and J's
859 # whenever there are more accents above.
860 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
862 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
863 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
864 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
865 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
866 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
867 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
870 case 0x49: /* LATIN CAPITAL LETTER I */
873 case 0x4a: /* LATIN CAPITAL LETTER J */
876 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
877 out.append(iOgonekDot);
879 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
880 out.append(iDotGrave);
882 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
883 out.append(iDotAcute);
885 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
886 out.append(iDotTilde);
889 return 0; /* will not occur */
891 /* # Turkish and Azeri */
892 } else if(loc==LOC_TURKISH && c==0x130) {
894 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
895 # The following rules handle those cases.
897 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
898 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
901 } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
903 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
904 # This matches the behavior of the canonically equivalent I-dot_above
906 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
907 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
909 return 0; /* remove the dot (continue without output) */
910 } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
912 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
914 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
915 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
918 } else if(c==0x130) {
920 # Preserve canonical equivalence for I with dot. Turkic is handled below.
922 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
926 } else if( c==0x3a3 &&
927 !isFollowedByCasedLetter(iter, 1) &&
928 isFollowedByCasedLetter(iter, -1) /* -1=preceded */
930 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
932 # Special case for final form of sigma
934 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
936 return 0x3c2; /* greek small final sigma */
938 /* no known conditional special case mapping, use a normal mapping */
940 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
941 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
942 full=(int)value&FULL_LOWER;
944 /* start of full case mapping strings */
945 excOffset=(int)(value>>32)+1;
947 /* set the output pointer to the lowercase mapping */
948 out.append(exceptions, excOffset, full);
950 /* return the string length */
955 if(hasSlot(excWord, EXC_LOWER)) {
956 result=getSlotValue(excWord, EXC_LOWER, excOffset2);
960 return (result==c) ? ~result : result;
964 private final int toUpperOrTitle(int c, ContextIterator iter,
966 ULocale locale, int[] locCache,
967 boolean upperNotTitle) {
973 if(!propsHasException(props)) {
974 if(getTypeFromProps(props)==LOWER) {
975 result=c+getDelta(props);
978 int excOffset=getExceptionsOffset(props), excOffset2;
979 int excWord=exceptions[excOffset++];
982 excOffset2=excOffset;
984 if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
985 /* use hardcoded conditions and mappings */
986 int loc=getCaseLocale(locale, locCache);
988 if(loc==LOC_TURKISH && c==0x69) {
992 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
993 # The following rules handle those cases.
995 # When uppercasing, i turns into a dotted capital I
997 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
998 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1001 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
1005 # Lithuanian retains the dot in a lowercase i when followed by accents.
1007 # Remove DOT ABOVE after "i" with upper or titlecase
1009 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1011 return 0; /* remove the dot (continue without output) */
1013 /* no known conditional special case mapping, use a normal mapping */
1015 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1016 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1017 full=(int)value&0xffff;
1019 /* start of full case mapping strings */
1020 excOffset=(int)(value>>32)+1;
1022 /* skip the lowercase and case-folding result strings */
1023 excOffset+=full&FULL_LOWER;
1025 excOffset+=full&0xf;
1031 /* skip the uppercase result string */
1032 excOffset+=full&0xf;
1037 /* set the output pointer to the result string */
1038 out.append(exceptions, excOffset, full);
1040 /* return the string length */
1045 if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
1047 } else if(hasSlot(excWord, EXC_UPPER)) {
1048 /* here, titlecase is same as uppercase */
1053 result=getSlotValue(excWord, index, excOffset2);
1056 return (result==c) ? ~result : result;
1059 public final int toFullUpper(int c, ContextIterator iter,
1061 ULocale locale, int[] locCache) {
1062 return toUpperOrTitle(c, iter, out, locale, locCache, true);
1065 public final int toFullTitle(int c, ContextIterator iter,
1067 ULocale locale, int[] locCache) {
1068 return toUpperOrTitle(c, iter, out, locale, locCache, false);
1071 /* case folding ------------------------------------------------------------- */
1074 * Case folding is similar to lowercasing.
1075 * The result may be a simple mapping, i.e., a single code point, or
1076 * a full mapping, i.e., a string.
1077 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1078 * then only the lowercase mapping is stored.
1080 * Some special cases are hardcoded because their conditions cannot be
1081 * parsed and processed from CaseFolding.txt.
1083 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1085 # C: common case folding, common mappings shared by both simple and full mappings.
1086 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1087 # S: simple case folding, mappings to single characters where different from F.
1088 # T: special case for uppercase I and dotted uppercase I
1089 # - For non-Turkic languages, this mapping is normally not used.
1090 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1093 # A. To do a simple case folding, use the mappings with status C + S.
1094 # B. To do a full case folding, use the mappings with status C + F.
1096 # The mappings with status T can be used or omitted depending on the desired case-folding
1097 # behavior. (The default option is to exclude them.)
1099 * Unicode 3.2 has 'T' mappings as follows:
1101 0049; T; 0131; # LATIN CAPITAL LETTER I
1102 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1104 * while the default mappings for these code points are:
1106 0049; C; 0069; # LATIN CAPITAL LETTER I
1107 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1109 * U+0130 has no simple case folding (simple-case-folds to itself).
1113 * Bit mask for getting just the options from a string compare options word
1114 * that are relevant for case folding (of a single string or code point).
1117 private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
1119 /* return the simple case folding mapping for c */
/**
 * Simple case folding: maps c to a single folded code point.
 *
 * The properties word for c is looked up in the trie.
 * - No exception entry: only characters whose type is >= UPPER enter the
 *   mapping branch (presumably applying getDelta(props) — the statement is
 *   not visible in this copy; TODO confirm).
 * - Exception entry flagged EXC_CONDITIONAL_FOLD: hardcoded mappings are
 *   used, selected by whether (options & FOLD_CASE_OPTIONS_MASK) equals
 *   UCharacter.FOLD_CASE_DEFAULT (default mappings) or not (Turkic
 *   mappings). Per the inline comments these concern U+0049 and U+0130.
 * - Other exception entries: the EXC_FOLD slot is checked before the
 *   EXC_LOWER slot, and the chosen slot value becomes the result.
 *
 * @param c code point to fold
 * @param options case folding options; only the bits covered by
 *                FOLD_CASE_OPTIONS_MASK are examined in the visible code
 * @return the simple case folding of c
 *
 * NOTE(review): several statements of this method (branch bodies, return
 * statements, closing braces) are absent from this copy of the file; the
 * description above is limited to what the remaining lines show.
 */
1120 public final int fold(int c, int options) {
1121 int props=trie.get(c);
1122 if(!propsHasException(props)) {
// UPPER is 2 and TITLE is 3, so this selects uppercase and titlecase characters.
1123 if(getTypeFromProps(props)>=UPPER) {
1127 int excOffset=getExceptionsOffset(props);
// Read the exception word; excOffset is advanced past it so that it can be
// handed to getSlotValue() below.
1128 int excWord=exceptions[excOffset++];
1130 if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1131 /* special case folding mappings, hardcoded */
1132 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1133 /* default mappings */
1135 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1137 } else if(c==0x130) {
1138 /* no simple case folding for U+0130 */
1142 /* Turkic mappings */
1144 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1146 } else if(c==0x130) {
1147 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
// A dedicated folding slot takes precedence over the lowercase slot.
1152 if(hasSlot(excWord, EXC_FOLD)) {
1154 } else if(hasSlot(excWord, EXC_LOWER)) {
1159 c=getSlotValue(excWord, index, excOffset);
1165 * Issue for canonical caseless match (UAX #21):
1166 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1167 * canonical equivalence, unlike default-option casefolding.
1168 * For example, I-grave and I + grave fold to strings that are not canonically equivalent.
1170 * For more details, see the comment in unorm_compare() in unorm.cpp
1171 * and the intermediate prototype changes for Jitterbug 2021.
1172 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1174 * This did not get fixed because it appears that it is not possible to fix
1175 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1176 * together in a way that they still fold to common result strings.
/**
 * Full case folding of c; the result may be a string rather than a single
 * code point.
 *
 * What the visible lines establish:
 * - Without an exception entry, characters of type >= UPPER get
 *   result=c+getDelta(props).
 * - With EXC_CONDITIONAL_FOLD set, hardcoded default or Turkic mappings are
 *   chosen by comparing (options & FOLD_CASE_OPTIONS_MASK) with
 *   UCharacter.FOLD_CASE_DEFAULT.
 * - With an EXC_FULL_MAPPINGS slot, the mapping string is appended to out
 *   from the exceptions array; per the inline comment that path returns the
 *   string length.
 * - Otherwise the EXC_FOLD slot is preferred over the EXC_LOWER slot for a
 *   single code point result.
 * - The final return yields ~result (a negative value) when the result
 *   equals c, and result itself otherwise.
 *
 * @param c code point to fold
 * @param out receives the folding when it is taken from the full mapping
 *            strings
 * @param options case folding options; only the bits covered by
 *                FOLD_CASE_OPTIONS_MASK are examined in the visible code
 * @return see the list above
 *
 * NOTE(review): local declarations (props, result, full, index) and a number
 * of branch bodies are absent from this copy of the file — confirm details
 * against the complete source.
 */
1179 public final int toFullFolding(int c, StringBuilder out, int options) {
1185 if(!propsHasException(props)) {
1186 if(getTypeFromProps(props)>=UPPER) {
1187 result=c+getDelta(props);
1190 int excOffset=getExceptionsOffset(props), excOffset2;
1191 int excWord=exceptions[excOffset++];
// Remember the slot start: excOffset is repurposed below for the full
// mapping strings, while the simple-slot lookup at the end uses excOffset2.
1194 excOffset2=excOffset;
1196 if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1197 /* use hardcoded conditions and mappings */
1198 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1199 /* default mappings */
1201 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1203 } else if(c==0x130) {
1204 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1209 /* Turkic mappings */
1211 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1213 } else if(c==0x130) {
1214 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1218 } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
// The long packs two values: the low 16 bits are the slot value (the lengths
// word) and the bits above 32 carry an offset into exceptions[].
1219 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
// The cast binds tighter than &, so this is ((int)value)&0xffff.
1220 full=(int)value&0xffff;
1222 /* start of full case mapping strings */
1223 excOffset=(int)(value>>32)+1;
1225 /* skip the lowercase result string */
// The low nibble of the lengths word (FULL_LOWER) is the lowercase string length.
1226 excOffset+=full&FULL_LOWER;
1230 /* set the output pointer to the result string */
// NOTE(review): full is used as a length here; presumably it was reduced to
// the folding string length in a statement not visible in this copy — confirm.
1231 out.append(exceptions, excOffset, full);
1233 /* return the string length */
1238 if(hasSlot(excWord, EXC_FOLD)) {
1240 } else if(hasSlot(excWord, EXC_LOWER)) {
1245 result=getSlotValue(excWord, index, excOffset2);
// An unchanged code point is reported as its bitwise complement (negative).
1248 return (result==c) ? ~result : result;
1251 /* case mapping properties API ---------------------------------------------- */
// One-element locale cache holding LOC_ROOT; passed as the last argument to
// toFullLower/toFullUpper/toFullTitle by hasBinaryProperty(), together with
// ULocale.ROOT. Presumably this lets those calls skip resolving the locale
// again — TODO confirm against the toFullXyz implementations.
1253 private static final int[] rootLocCache = { LOC_ROOT };
1255 * We need a StringBuilder for multi-code point output from the
1256 * full case mapping functions. However, we do not actually use that output,
1257 * we just check whether the input character was mapped to anything else.
1258 * We use a shared StringBuilder to avoid allocating a new one in each call.
1259 * We remove its contents each time so that it does not grow large over time.
// Scratch output buffer: hasBinaryProperty() resets it with setLength(0) and
// passes it to the toFullXyz calls, whose text output is then ignored.
// NOTE(review): this is a public, shared, mutable static and StringBuilder is
// not synchronized; concurrent hasBinaryProperty() calls would write to it at
// the same time — confirm whether callers can run concurrently.
1263 public static final StringBuilder dummyStringBuilder = new StringBuilder();
/**
 * Tests whether code point c has the binary property selected by which.
 *
 * Properties handled in the visible cases:
 * LOWERCASE, UPPERCASE, SOFT_DOTTED, CASE_SENSITIVE, CASED, CASE_IGNORABLE,
 * CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_UPPERCASED, CHANGES_WHEN_TITLECASED
 * and CHANGES_WHEN_CASEMAPPED.
 *
 * @param c code point to test
 * @param which a UProperty constant identifying the property
 * @return true if c has the property
 *
 * NOTE(review): the switch header and whatever follows the last case
 * (presumably a default branch) are absent from this copy of the file, so
 * the result for other property values cannot be stated from here.
 */
1265 public final boolean hasBinaryProperty(int c, int which) {
1267 case UProperty.LOWERCASE:
1268 return LOWER==getType(c);
1269 case UProperty.UPPERCASE:
1270 return UPPER==getType(c);
1271 case UProperty.SOFT_DOTTED:
1272 return isSoftDotted(c);
1273 case UProperty.CASE_SENSITIVE:
1274 return isCaseSensitive(c);
1275 case UProperty.CASED:
// Any type other than NONE (lower, upper or title) counts as cased.
1276 return NONE!=getType(c);
1277 case UProperty.CASE_IGNORABLE:
// Shifting right by 2 drops the 2-bit type; presumably what remains is the
// ignorable flag (cf. the commented-out IGNORABLE=4) — TODO confirm in
// getTypeOrIgnorable().
1278 return (getTypeOrIgnorable(c)>>2)!=0;
1280 * Note: The following Changes_When_Xyz are defined as testing whether
1281 * the NFD form of the input changes when Xyz-case-mapped.
1282 * However, this simpler implementation of these properties,
1283 * ignoring NFD, passes the tests.
1284 * The implementation needs to be changed if the tests start failing.
1285 * When that happens, optimizations should be used to work with the
1286 * per-single-code point ucase_toFullXyz() functions unless
1287 * the NFD form has more than one code point,
1288 * and the property starts set needs to be the union of the
1289 * start sets for normalization and case mappings.
1291 case UProperty.CHANGES_WHEN_LOWERCASED:
// The builder only serves as a throwaway output target; the decision is made
// from the sign of the return value (>=0 is treated as "changed").
1292 dummyStringBuilder.setLength(0);
1293 return toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
1294 case UProperty.CHANGES_WHEN_UPPERCASED:
1295 dummyStringBuilder.setLength(0);
1296 return toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
1297 case UProperty.CHANGES_WHEN_TITLECASED:
1298 dummyStringBuilder.setLength(0);
1299 return toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
1300 /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
1301 case UProperty.CHANGES_WHEN_CASEMAPPED:
// The builder is cleared once and then shared by all three calls; || stops at
// the first mapping that reports a change.
1302 dummyStringBuilder.setLength(0);
1304 toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 ||
1305 toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 ||
1306 toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
1312 // data members -------------------------------------------------------- ***
// Index array read from the start of the data file by readData(); its
// entries are addressed with the IX_* constants.
1313 private int indexes[];
// Exception data in 16-bit units: exceptions[offset] is an exception word,
// and the following units hold its slot values (see fold()).
1314 private char exceptions[];
// Reverse case folding ("unfold") data; presumably laid out as described by
// the UNFOLD_* constants — TODO confirm.
1315 private char unfold[];
// Maps a code point to its case properties word via trie.get(c).
1317 private Trie2_16 trie;
1319 // data format constants ----------------------------------------------- ***
1320 private static final String DATA_NAME="ucase";
1321 private static final String DATA_TYPE="icu";
// Evaluates to "ucase.icu"; used to locate the data stream and in the
// IOException messages raised by readData().
1322 private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;
// Format identifier handed to ICUBinary.readHeader(); the four bytes are the
// ASCII characters "cASE".
1325 private static final byte FMT[]={ 0x63, 0x41, 0x53, 0x45 };
1327 /* indexes into indexes[] */
1328 //private static final int IX_INDEX_TOP=0;
1329 //private static final int IX_LENGTH=1;
// Expected size in bytes of the serialized trie; readData() skips the
// difference between this and the actual serialized length as padding.
1330 private static final int IX_TRIE_SIZE=2;
// Length of the exceptions data as read by readData() — presumably counted
// in 16-bit units; TODO confirm.
1331 private static final int IX_EXC_LENGTH=3;
// Presumably the length of the unfold data; its use is outside the visible code.
1332 private static final int IX_UNFOLD_LENGTH=4;
1334 //private static final int IX_MAX_FULL_LENGTH=15;
// NOTE(review): looks like the number of defined index slots — confirm
// against the check in readData().
1335 private static final int IX_TOP=16;
1337 // definitions for 16-bit case properties word ------------------------- ***
1339 /* 2-bit constants for types of cased characters */
// Mask for the two lowest bits of a properties word, which hold the case type.
1340 public static final int TYPE_MASK=3;
// The four possible values of the 2-bit type field. Their numeric order
// matters: fold() and toFullFolding() test "type>=UPPER" to select both
// uppercase and titlecase characters.
1341 public static final int NONE=0;
1342 public static final int LOWER=1;
1343 public static final int UPPER=2;
1344 public static final int TITLE=3;
/**
 * Extracts the 2-bit case type from a properties word.
 *
 * @param props properties word, as returned by trie.get(c)
 * @return one of NONE, LOWER, UPPER or TITLE
 */
1346 private static final int getTypeFromProps(int props) {
1347 return props&TYPE_MASK;
// NOTE(review): the body of this method is not present in this copy of the
// file. Judging by the name and the commented-out IGNORABLE=4 flag, it
// presumably returns the type bits together with the ignorable bit of
// props — confirm against the complete source.
1350 private static final int getTypeAndIgnorableFromProps(int props) {
1354 //private static final int IGNORABLE= 4;
// Flag bits of the properties word located above the 2-bit type field.
1355 private static final int SENSITIVE= 8;
// Presumably the bit tested by propsHasException() — TODO confirm.
1356 private static final int EXCEPTION= 0x10;
// Two-bit field (bits 6..5) with the dot/accent class. For exception entries,
// bits 15..5 of the properties word hold the exceptions index instead, and
// the file's own comments describe the dot bits as pushed out to the
// exception word (see EXC_DOT_SHIFT).
1358 private static final int DOT_MASK= 0x60;
1359 //private static final int NO_DOT= 0; /* normal characters with cc=0 */
1360 private static final int SOFT_DOTTED= 0x20; /* soft-dotted characters with cc=0 */
1361 private static final int ABOVE= 0x40; /* "above" accents with cc=230 */
1362 private static final int OTHER_ACCENT= 0x60; /* other accent character (0<cc!=230) */
1364 /* no exception: bits 15..7 are a 9-bit signed case mapping delta */
// Number of low bits (type, flags, dot field) below the delta field.
1365 private static final int DELTA_SHIFT= 7;
1366 //private static final int DELTA_MASK= 0xff80;
1367 //private static final int MAX_DELTA= 0xff;
1368 //private static final int MIN_DELTA= (-MAX_DELTA-1);
/**
 * Extracts the signed case mapping delta from a properties word that has no
 * exception entry.
 *
 * The cast binds tighter than the shift: props is first narrowed to short,
 * which sign-extends from bit 15, and then shifted right arithmetically, so
 * bits 15..7 come out as a signed 9-bit value.
 *
 * @param props properties word without an exception entry
 * @return the delta stored in bits 15..7, sign-extended
 */
1370 private static final int getDelta(int props) {
1371 return (short)props>>DELTA_SHIFT;
1374 /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
// Shift that brings the exceptions index (bits 15..5) down to bit 0.
1375 private static final int EXC_SHIFT= 5;
1376 //private static final int EXC_MASK= 0xffe0;
1377 //private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);
1379 /* definitions for 16-bit main exceptions word ------------------------------ */
1381 /* first 8 bits indicate values in optional slots */
// Slot numbers, passed to hasSlot()/getSlotValue(). fold() and
// toFullFolding() check EXC_FOLD before falling back to EXC_LOWER.
1382 private static final int EXC_LOWER=0;
1383 private static final int EXC_FOLD=1;
1384 private static final int EXC_UPPER=2;
1385 private static final int EXC_TITLE=3;
1386 //private static final int EXC_4=4; /* reserved */
1387 //private static final int EXC_5=5; /* reserved */
1388 private static final int EXC_CLOSURE=6;
// toFullFolding() reads this slot with getSlotValueAndOffset() to find the
// lengths word and the start of the full mapping strings.
1389 private static final int EXC_FULL_MAPPINGS=7;
1390 //private static final int EXC_ALL_SLOTS=8; /* one past the last slot */
1392 /* each slot is 2 uint16_t instead of 1 */
1393 private static final int EXC_DOUBLE_SLOTS= 0x100;
1395 /* reserved: exception bits 11..9 */
1397 /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
// DOT_MASK (0x60) shifted left by 7 gives 0x3000, matching the commented-out
// EXC_DOT_MASK below.
1398 private static final int EXC_DOT_SHIFT=7;
1400 /* normally stored in the main word, but pushed out for larger exception indexes */
1401 //private static final int EXC_DOT_MASK= 0x3000;
1402 //private static final int EXC_NO_DOT= 0;
1403 //private static final int EXC_SOFT_DOTTED= 0x1000;
1404 //private static final int EXC_ABOVE= 0x2000; /* "above" accents with cc=230 */
1405 //private static final int EXC_OTHER_ACCENT= 0x3000; /* other character (0<cc!=230) */
1407 /* complex/conditional mappings */
// Bit 14 of the exception word; its use is outside the visible code.
1408 private static final int EXC_CONDITIONAL_SPECIAL= 0x4000;
// Bit 15 of the exception word; when set, fold() and toFullFolding() use
// their hardcoded default/Turkic mappings instead of slot values.
1409 private static final int EXC_CONDITIONAL_FOLD= 0x8000;
1411 /* definitions for lengths word for full case mappings */
// Low nibble of the lengths word: length of the lowercase mapping string.
// toFullFolding() adds (full & FULL_LOWER) to its offset to skip that string.
1412 private static final int FULL_LOWER= 0xf;
1413 //private static final int FULL_FOLDING= 0xf0;
1414 //private static final int FULL_UPPER= 0xf00;
1415 //private static final int FULL_TITLE= 0xf000;
1417 /* maximum lengths */
1418 //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
// NOTE(review): presumably also serves as the mask for the closure length in
// the EXC_CLOSURE slot value — its use is outside the visible code; confirm.
1419 private static final int CLOSURE_MAX_LENGTH=0xf;
1421 /* constants for reverse case folding ("unfold") data */
// Presumably indexes of header fields at the start of unfold[] (row count,
// row width, string width) — their use is outside the visible code; confirm.
1422 private static final int UNFOLD_ROWS=0;
1423 private static final int UNFOLD_ROW_WIDTH=1;
1424 private static final int UNFOLD_STRING_WIDTH=2;
1427 * public singleton instance
// Assigned in the static initializer that follows, which constructs a
// UCaseProps and rethrows an IOException from the constructor wrapped in a
// RuntimeException.
1429 public static final UCaseProps INSTANCE;
1431 // This static initializer block must be placed after
1432 // other static member initialization
1435 INSTANCE = new UCaseProps();
1436 } catch (IOException e) {
1437 throw new RuntimeException(e);