jars/icu4j-4_2_1-src/src/com/ibm/icu/impl/UCaseProps.java

   1 /*\r
   2 *******************************************************************************\r
   3 *\r
   4 *   Copyright (C) 2004-2007, International Business Machines\r
   5 *   Corporation and others.  All Rights Reserved.\r
   6 *\r
   7 *******************************************************************************\r
   8 *   file name:  UCaseProps.java\r
   9 *   encoding:   US-ASCII\r
  10 *   tab size:   8 (not used)\r
  11 *   indentation:4\r
  12 *\r
  13 *   created on: 2005jan29\r
  14 *   created by: Markus W. Scherer\r
  15 *\r
  16 *   Low-level Unicode character/string case mapping code.\r
  17 *   Java port of ucase.h/.c.\r
  18 */\r
  19 \r
  20 package com.ibm.icu.impl;\r
  21 \r
  22 import java.io.InputStream;\r
  23 import java.io.DataInputStream;\r
  24 import java.io.BufferedInputStream;\r
  25 import java.io.IOException;\r
  26 \r
  27 import com.ibm.icu.util.RangeValueIterator;\r
  28 import com.ibm.icu.util.ULocale;\r
  29 \r
  30 import com.ibm.icu.text.UTF16;\r
  31 import com.ibm.icu.text.UnicodeSet;\r
  32 \r
  33 import com.ibm.icu.lang.UCharacter;\r
  34 \r
  35 public final class UCaseProps {\r
  36     // constructors etc. --------------------------------------------------- ***\r
  37 \r
  38     // port of ucase_openProps()\r
  39     public UCaseProps() throws IOException {\r
  40         InputStream is=ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/"+DATA_FILE_NAME);\r
  41         BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);\r
  42         readData(b);\r
  43         b.close();\r
  44         is.close();\r
  45     }\r
  46 \r
  47     private final void readData(InputStream is) throws IOException {\r
  48         DataInputStream inputStream=new DataInputStream(is);\r
  49 \r
  50         // read the header\r
  51         ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());\r
  52 \r
  53         // read indexes[]\r
  54         int i, count;\r
  55         count=inputStream.readInt();\r
  56         if(count<IX_INDEX_TOP) {\r
  57             throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);\r
  58         }\r
  59         indexes=new int[count];\r
  60 \r
  61         indexes[0]=count;\r
  62         for(i=1; i<count; ++i) {\r
  63             indexes[i]=inputStream.readInt();\r
  64         }\r
  65 \r
  66         // read the trie\r
  67         trie=new CharTrie(inputStream, null);\r
  68 \r
  69         // read exceptions[]\r
  70         count=indexes[IX_EXC_LENGTH];\r
  71         if(count>0) {\r
  72             exceptions=new char[count];\r
  73             for(i=0; i<count; ++i) {\r
  74                 exceptions[i]=inputStream.readChar();\r
  75             }\r
  76         }\r
  77 \r
  78         // read unfold[]\r
  79         count=indexes[IX_UNFOLD_LENGTH];\r
  80         if(count>0) {\r
  81             unfold=new char[count];\r
  82             for(i=0; i<count; ++i) {\r
  83                 unfold[i]=inputStream.readChar();\r
  84             }\r
  85         }\r
  86     }\r
  87 \r
  88     // implement ICUBinary.Authenticate\r
  89     private final class IsAcceptable implements ICUBinary.Authenticate {\r
  90         public boolean isDataVersionAcceptable(byte version[]) {\r
  91             return version[0]==1 &&\r
  92                    version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_;\r
  93         }\r
  94     }\r
  95 \r
  96     // UCaseProps singleton\r
  97     private static UCaseProps gCsp=null;\r
  98 \r
  99     // port of ucase_getSingleton()\r
 100     public static final synchronized UCaseProps getSingleton() throws IOException {\r
 101         if(gCsp==null) {\r
 102             gCsp=new UCaseProps();\r
 103         }\r
 104         return gCsp;\r
 105     }\r
 106 \r
 107     // UCaseProps dummy singleton\r
 108     private static UCaseProps gCspDummy=null;\r
 109 \r
 110     private UCaseProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature\r
 111         indexes=new int[IX_TOP];\r
 112         indexes[0]=IX_TOP;\r
 113         trie=new CharTrie(0, 0, null); // dummy trie, always returns 0\r
 114     }\r
 115 \r
 116     /**\r
 117      * Get a singleton dummy object, one that works with no real data.\r
 118      * This can be used when the real data is not available.\r
 119      * Using the dummy can reduce checks for available data after an initial failure.\r
 120      * Port of ucase_getDummy().\r
 121      */\r
 122     public static final synchronized UCaseProps getDummy() {\r
 123         if(gCspDummy==null) {\r
 124             gCspDummy=new UCaseProps(true);\r
 125         }\r
 126         return gCspDummy;\r
 127     }\r
 128 \r
 129     // set of property starts for UnicodeSet ------------------------------- ***\r
 130 \r
 131     public final void addPropertyStarts(UnicodeSet set) {\r
 132         /* add the start code point of each same-value range of the trie */\r
 133         TrieIterator iter=new TrieIterator(trie);\r
 134         RangeValueIterator.Element element=new RangeValueIterator.Element();\r
 135 \r
 136         while(iter.next(element)){\r
 137             set.add(element.start);\r
 138         }\r
 139 \r
 140         /* add code points with hardcoded properties, plus the ones following them */\r
 141 \r
 142         /* (none right now, see comment below) */\r
 143 \r
 144         /*\r
 145          * Omit code points with hardcoded specialcasing properties\r
 146          * because we do not build property UnicodeSets for them right now.\r
 147          */\r
 148     }\r
 149 \r
 150     // data access primitives ---------------------------------------------- ***\r
 151     private static final int getExceptionsOffset(int props) {\r
 152         return props>>EXC_SHIFT;\r
 153     }\r
 154 \r
 155     private static final boolean propsHasException(int props) {\r
 156         return (props&EXCEPTION)!=0;\r
 157     }\r
 158 \r
 159     /* number of bits in an 8-bit integer value */\r
 160     private static final byte flagsOffset[/*256*/]={\r
 161         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,\r
 162         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,\r
 163         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,\r
 164         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\r
 165         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,\r
 166         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\r
 167         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\r
 168         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\r
 169         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,\r
 170         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\r
 171         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\r
 172         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\r
 173         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,\r
 174         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\r
 175         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,\r
 176         4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8\r
 177     };\r
 178 \r
 179     private static final boolean hasSlot(int flags, int index) {\r
 180         return (flags&(1<<index))!=0;\r
 181     }\r
 182     private static final byte slotOffset(int flags, int index) {\r
 183         return flagsOffset[flags&((1<<index)-1)];\r
 184     }\r
 185 \r
 186     /*\r
 187      * Get the value of an optional-value slot where hasSlot(excWord, index).\r
 188      *\r
 189      * @param excWord (in) initial exceptions word\r
 190      * @param index (in) desired slot index\r
 191      * @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];\r
 192      * @return bits 31..0: slot value\r
 193      *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot \r
 194      */\r
 195     private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {\r
 196         long value;\r
 197         if((excWord&EXC_DOUBLE_SLOTS)==0) {\r
 198             excOffset+=slotOffset(excWord, index);\r
 199             value=exceptions[excOffset];\r
 200         } else {\r
 201             excOffset+=2*slotOffset(excWord, index);\r
 202             value=exceptions[excOffset++];\r
 203             value=(value<<16)|exceptions[excOffset];\r
 204         }\r
 205         return (long)value|((long)excOffset<<32);\r
 206     }\r
 207 \r
 208     /* same as getSlotValueAndOffset() but does not return the slot offset */\r
 209     private final int getSlotValue(int excWord, int index, int excOffset) {\r
 210         int value;\r
 211         if((excWord&EXC_DOUBLE_SLOTS)==0) {\r
 212             excOffset+=slotOffset(excWord, index);\r
 213             value=exceptions[excOffset];\r
 214         } else {\r
 215             excOffset+=2*slotOffset(excWord, index);\r
 216             value=exceptions[excOffset++];\r
 217             value=(value<<16)|exceptions[excOffset];\r
 218         }\r
 219         return value;\r
 220     }\r
 221 \r
 222     // simple case mappings ------------------------------------------------ ***\r
 223 \r
 224     public final int tolower(int c) {\r
 225         int props=trie.getCodePointValue(c);\r
 226         if(!propsHasException(props)) {\r
 227             if(getTypeFromProps(props)>=UPPER) {\r
 228                 c+=getDelta(props);\r
 229             }\r
 230         } else {\r
 231             int excOffset=getExceptionsOffset(props);\r
 232             int excWord=exceptions[excOffset++];\r
 233             if(hasSlot(excWord, EXC_LOWER)) {\r
 234                 c=getSlotValue(excWord, EXC_LOWER, excOffset);\r
 235             }\r
 236         }\r
 237         return c;\r
 238     }\r
 239 \r
 240     public final int toupper(int c) {\r
 241         int props=trie.getCodePointValue(c);\r
 242         if(!propsHasException(props)) {\r
 243             if(getTypeFromProps(props)==LOWER) {\r
 244                 c+=getDelta(props);\r
 245             }\r
 246         } else {\r
 247             int excOffset=getExceptionsOffset(props);\r
 248             int excWord=exceptions[excOffset++];\r
 249             if(hasSlot(excWord, EXC_UPPER)) {\r
 250                 c=getSlotValue(excWord, EXC_UPPER, excOffset);\r
 251             }\r
 252         }\r
 253         return c;\r
 254     }\r
 255 \r
 256     public final int totitle(int c) {\r
 257         int props=trie.getCodePointValue(c);\r
 258         if(!propsHasException(props)) {\r
 259             if(getTypeFromProps(props)==LOWER) {\r
 260                 c+=getDelta(props);\r
 261             }\r
 262         } else {\r
 263             int excOffset=getExceptionsOffset(props);\r
 264             int excWord=exceptions[excOffset++];\r
 265             int index;\r
 266             if(hasSlot(excWord, EXC_TITLE)) {\r
 267                 index=EXC_TITLE;\r
 268             } else if(hasSlot(excWord, EXC_UPPER)) {\r
 269                 index=EXC_UPPER;\r
 270             } else {\r
 271                 return c;\r
 272             }\r
 273             c=getSlotValue(excWord, index, excOffset);\r
 274         }\r
 275         return c;\r
 276     }\r
 277 \r
 278     /**\r
 279      * Adds all simple case mappings and the full case folding for c to sa,\r
 280      * and also adds special case closure mappings.\r
 281      * c itself is not added.\r
 282      * For example, the mappings\r
 283      * - for s include long s\r
 284      * - for sharp s include ss\r
 285      * - for k include the Kelvin sign\r
 286      */\r
 287     public final void addCaseClosure(int c, UnicodeSet set) {\r
 288         /*\r
 289          * Hardcode the case closure of i and its relatives and ignore the\r
 290          * data file data for these characters.\r
 291          * The Turkic dotless i and dotted I with their case mapping conditions\r
 292          * and case folding option make the related characters behave specially.\r
 293          * This code matches their closure behavior to their case folding behavior.\r
 294          */\r
 295 \r
 296         switch(c) {\r
 297         case 0x49:\r
 298             /* regular i and I are in one equivalence class */\r
 299             set.add(0x69);\r
 300             return;\r
 301         case 0x69:\r
 302             set.add(0x49);\r
 303             return;\r
 304         case 0x130:\r
 305             /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */\r
 306             set.add(iDot);\r
 307             return;\r
 308         case 0x131:\r
 309             /* dotless i is in a class by itself */\r
 310             return;\r
 311         default:\r
 312             /* otherwise use the data file data */\r
 313             break;\r
 314         }\r
 315 \r
 316         int props=trie.getCodePointValue(c);\r
 317         if(!propsHasException(props)) {\r
 318             if(getTypeFromProps(props)!=NONE) {\r
 319                 /* add the one simple case mapping, no matter what type it is */\r
 320                 int delta=getDelta(props);\r
 321                 if(delta!=0) {\r
 322                     set.add(c+delta);\r
 323                 }\r
 324             }\r
 325         } else {\r
 326             /*\r
 327              * c has exceptions, so there may be multiple simple and/or\r
 328              * full case mappings. Add them all.\r
 329              */\r
 330             int excOffset0, excOffset=getExceptionsOffset(props);\r
 331             int closureOffset;\r
 332             int excWord=exceptions[excOffset++];\r
 333             int index, closureLength, fullLength, length;\r
 334 \r
 335             excOffset0=excOffset;\r
 336 \r
 337             /* add all simple case mappings */\r
 338             for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {\r
 339                 if(hasSlot(excWord, index)) {\r
 340                     excOffset=excOffset0;\r
 341                     c=getSlotValue(excWord, index, excOffset);\r
 342                     set.add(c);\r
 343                 }\r
 344             }\r
 345 \r
 346             /* get the closure string pointer & length */\r
 347             if(hasSlot(excWord, EXC_CLOSURE)) {\r
 348                 excOffset=excOffset0;\r
 349                 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);\r
 350                 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */\r
 351                 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */\r
 352             } else {\r
 353                 closureLength=0;\r
 354                 closureOffset=0;\r
 355             }\r
 356 \r
 357             /* add the full case folding */\r
 358             if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {\r
 359                 excOffset=excOffset0;\r
 360                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);\r
 361                 fullLength=(int)value;\r
 362 \r
 363                 /* start of full case mapping strings */\r
 364                 excOffset=(int)(value>>32)+1;\r
 365 \r
 366                 fullLength&=0xffff; /* bits 16 and higher are reserved */\r
 367 \r
 368                 /* skip the lowercase result string */\r
 369                 excOffset+=fullLength&FULL_LOWER;\r
 370                 fullLength>>=4;\r
 371 \r
 372                 /* add the full case folding string */\r
 373                 length=fullLength&0xf;\r
 374                 if(length!=0) {\r
 375                     set.add(new String(exceptions, excOffset, length));\r
 376                     excOffset+=length;\r
 377                 }\r
 378 \r
 379                 /* skip the uppercase and titlecase strings */\r
 380                 fullLength>>=4;\r
 381                 excOffset+=fullLength&0xf;\r
 382                 fullLength>>=4;\r
 383                 excOffset+=fullLength;\r
 384 \r
 385                 closureOffset=excOffset; /* behind full case mappings */\r
 386             }\r
 387 \r
 388             /* add each code point in the closure string */\r
 389             for(index=0; index<closureLength; index+=UTF16.getCharCount(c)) {\r
 390                 c=UTF16.charAt(exceptions, closureOffset, exceptions.length, index);\r
 391                 set.add(c);\r
 392             }\r
 393         }\r
 394     }\r
 395 \r
 396     /*\r
 397      * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated\r
 398      * must be s.length()>0 and max>0 and s.length()<=max\r
 399      */\r
 400     private final int strcmpMax(String s, int unfoldOffset, int max) {\r
 401         int i1, length, c1, c2;\r
 402 \r
 403         length=s.length();\r
 404         max-=length; /* we require length<=max, so no need to decrement max in the loop */\r
 405         i1=0;\r
 406         do {\r
 407             c1=s.charAt(i1++);\r
 408             c2=unfold[unfoldOffset++];\r
 409             if(c2==0) {\r
 410                 return 1; /* reached the end of t but not of s */\r
 411             }\r
 412             c1-=c2;\r
 413             if(c1!=0) {\r
 414                 return c1; /* return difference result */\r
 415             }\r
 416         } while(--length>0);\r
 417         /* ends with length==0 */\r
 418 \r
 419         if(max==0 || unfold[unfoldOffset]==0) {\r
 420             return 0; /* equal to length of both strings */\r
 421         } else {\r
 422             return -max; /* return lengh difference */\r
 423         }\r
 424     }\r
 425 \r
 426     /**\r
 427      * Maps the string to single code points and adds the associated case closure\r
 428      * mappings.\r
 429      * The string is mapped to code points if it is their full case folding string.\r
 430      * In other words, this performs a reverse full case folding and then\r
 431      * adds the case closure items of the resulting code points.\r
 432      * If the string is found and its closure applied, then\r
 433      * the string itself is added as well as part of its code points' closure.\r
 434      *\r
 435      * @return true if the string was found\r
 436      */\r
 437     public final boolean addStringCaseClosure(String s, UnicodeSet set) {\r
 438         int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;\r
 439 \r
 440         if(unfold==null || s==null) {\r
 441             return false; /* no reverse case folding data, or no string */\r
 442         }\r
 443         length=s.length();\r
 444         if(length<=1) {\r
 445             /* the string is too short to find any match */\r
 446             /*\r
 447              * more precise would be:\r
 448              * if(!u_strHasMoreChar32Than(s, length, 1))\r
 449              * but this does not make much practical difference because\r
 450              * a single supplementary code point would just not be found\r
 451              */\r
 452             return false;\r
 453         }\r
 454 \r
 455         unfoldRows=unfold[UNFOLD_ROWS];\r
 456         unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];\r
 457         unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];\r
 458         //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;\r
 459 \r
 460         if(length>unfoldStringWidth) {\r
 461             /* the string is too long to find any match */\r
 462             return false;\r
 463         }\r
 464 \r
 465         /* do a binary search for the string */\r
 466         start=0;\r
 467         limit=unfoldRows;\r
 468         while(start<limit) {\r
 469             i=(start+limit)/2;\r
 470             unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above\r
 471             result=strcmpMax(s, unfoldOffset, unfoldStringWidth);\r
 472 \r
 473             if(result==0) {\r
 474                 /* found the string: add each code point, and its case closure */\r
 475                 int c;\r
 476 \r
 477                 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {\r
 478                     c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);\r
 479                     set.add(c);\r
 480                     addCaseClosure(c, set);\r
 481                 }\r
 482                 return true;\r
 483             } else if(result<0) {\r
 484                 limit=i;\r
 485             } else /* result>0 */ {\r
 486                 start=i+1;\r
 487             }\r
 488         }\r
 489 \r
 490         return false; /* string not found */\r
 491     }\r
 492 \r
 493     /** @return NONE, LOWER, UPPER, TITLE */\r
 494     public final int getType(int c) {\r
 495         return getTypeFromProps(trie.getCodePointValue(c));\r
 496     }\r
 497 \r
 498     /** @return same as getType(), or <0 if c is case-ignorable */\r
 499     public final int getTypeOrIgnorable(int c) {\r
 500         int props=trie.getCodePointValue(c);\r
 501         int type=getTypeFromProps(props);\r
 502         if(type!=NONE) {\r
 503             return type;\r
 504         } else if(\r
 505             c==0x307 ||\r
 506             (props&(EXCEPTION|CASE_IGNORABLE))==CASE_IGNORABLE\r
 507         ) {\r
 508             return -1; /* case-ignorable */\r
 509         } else {\r
 510             return 0; /* c is neither cased nor case-ignorable */\r
 511         }\r
 512     }\r
 513 \r
 514     /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */\r
 515     public final int getDotType(int c) {\r
 516         int props=trie.getCodePointValue(c);\r
 517         if(!propsHasException(props)) {\r
 518             return props&DOT_MASK;\r
 519         } else {\r
 520             return (exceptions[getExceptionsOffset(props)]>>EXC_DOT_SHIFT)&DOT_MASK;\r
 521         }\r
 522     }\r
 523 \r
 524     public final boolean isSoftDotted(int c) {\r
 525         return getDotType(c)==SOFT_DOTTED;\r
 526     }\r
 527 \r
 528     public final boolean isCaseSensitive(int c) {\r
 529         return (trie.getCodePointValue(c)&SENSITIVE)!=0;\r
 530     }\r
 531 \r
 532     // string casing ------------------------------------------------------- ***\r
 533 \r
 534     /*\r
 535      * These internal functions form the core of string case mappings.\r
 536      * They map single code points to result code points or strings and take\r
 537      * all necessary conditions (context, locale ID, options) into account.\r
 538      *\r
 539      * They do not iterate over the source or write to the destination\r
 540      * so that the same functions are useful for non-standard string storage,\r
 541      * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.\r
 542      * For the same reason, the "surrounding text" context is passed in as a\r
 543      * ContextIterator which does not make any assumptions about\r
 544      * the underlying storage.\r
 545      *\r
 546      * This section contains helper functions that check for conditions\r
 547      * in the input text surrounding the current code point\r
 548      * according to SpecialCasing.txt.\r
 549      *\r
 550      * Each helper function gets the index\r
 551      * - after the current code point if it looks at following text\r
 552      * - before the current code point if it looks at preceding text\r
 553      *\r
 554      * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:\r
 555      *\r
 556      * Final_Sigma\r
 557      *   C is preceded by a sequence consisting of\r
 558      *     a cased letter and a case-ignorable sequence,\r
 559      *   and C is not followed by a sequence consisting of\r
 560      *     an ignorable sequence and then a cased letter.\r
 561      *\r
 562      * More_Above\r
 563      *   C is followed by one or more characters of combining class 230 (ABOVE)\r
 564      *   in the combining character sequence.\r
 565      *\r
 566      * After_Soft_Dotted\r
 567      *   The last preceding character with combining class of zero before C\r
 568      *   was Soft_Dotted,\r
 569      *   and there is no intervening combining character class 230 (ABOVE).\r
 570      *\r
 571      * Before_Dot\r
 572      *   C is followed by combining dot above (U+0307).\r
 573      *   Any sequence of characters with a combining class that is neither 0 nor 230\r
 574      *   may intervene between the current character and the combining dot above.\r
 575      *\r
 576      * The erratum from 2002-10-31 adds the condition\r
 577      *\r
 578      * After_I\r
 579      *   The last preceding base character was an uppercase I, and there is no\r
 580      *   intervening combining character class 230 (ABOVE).\r
 581      *\r
 582      *   (See Jitterbug 2344 and the comments on After_I below.)\r
 583      *\r
 584      * Helper definitions in Unicode 3.2 UAX 21:\r
 585      *\r
 586      * D1. A character C is defined to be cased\r
 587      *     if it meets any of the following criteria:\r
 588      *\r
 589      *   - The general category of C is Titlecase Letter (Lt)\r
 590      *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase\r
 591      *   - Given D = NFD(C), then it is not the case that:\r
 592      *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)\r
     *     (This third criterion does not add any characters to the list
 594      *      for Unicode 3.2. Ignored.)\r
 595      *\r
 596      * D2. A character C is defined to be case-ignorable\r
 597      *     if it meets either of the following criteria:\r
 598      *\r
 599      *   - The general category of C is\r
 600      *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or\r
 601      *     Letter Modifier (Lm), or Symbol Modifier (Sk)\r
 602      *   - C is one of the following characters \r
 603      *     U+0027 APOSTROPHE\r
 604      *     U+00AD SOFT HYPHEN (SHY)\r
 605      *     U+2019 RIGHT SINGLE QUOTATION MARK\r
 606      *            (the preferred character for apostrophe)\r
 607      *\r
 608      * D3. A case-ignorable sequence is a sequence of\r
 609      *     zero or more case-ignorable characters.\r
 610      */\r
 611 \r
 612     /**\r
 613      * Iterator for string case mappings, which need to look at the\r
 614      * context (surrounding text) of a given character for conditional mappings.\r
 615      *\r
 616      * The iterator only needs to go backward or forward away from the\r
 617      * character in question. It does not use any indexes on this interface.\r
 618      * It does not support random access or an arbitrary change of\r
 619      * iteration direction.\r
 620      *\r
 621      * The code point being case-mapped itself is never returned by\r
 622      * this iterator.\r
 623      */\r
 624     public interface ContextIterator {\r
 625         /**\r
 626          * Reset the iterator for forward or backward iteration.\r
 627          * @param dir >0: Begin iterating forward from the first code point\r
 628          * after the one that is being case-mapped.\r
 629          *            <0: Begin iterating backward from the first code point\r
 630          * before the one that is being case-mapped.   \r
 631          */\r
 632         public void reset(int dir);\r
 633         /**\r
 634          * Iterate and return the next code point, moving in the direction\r
 635          * determined by the reset() call.\r
 636          * @return Next code point, or <0 when the iteration is done. \r
 637          */\r
 638         public int next();\r
 639     }\r
 640 \r
    /**
     * Largest string length that an int string-case-mapping result can denote.
     *
     * For string case mappings, a single character (a code point) is mapped
     * either to itself (in which case in-place mapping functions do nothing),
     * or to another single code point, or to a string.
     * Aside from the string contents, these are indicated with a single int
     * value as follows:
     *
     * Mapping to self: Negative values (~self instead of -self to support U+0000)
     *
     * Mapping to another code point: Positive values >MAX_STRING_LENGTH
     *
     * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
     * returned. Note that the string result may indeed have zero length.
     */
    public static final int MAX_STRING_LENGTH=0x1f;
 656 \r
 657     private static final int LOC_UNKNOWN=0;\r
 658     private static final int LOC_ROOT=1;\r
 659     private static final int LOC_TURKISH=2;\r
 660     private static final int LOC_LITHUANIAN=3;\r
 661 \r
 662     /*\r
 663      * Checks and caches the type of locale ID as it is relevant for case mapping.\r
 664      * If the locCache is not null, then it must be initialized with locCache[0]=0 .\r
 665      */\r
 666     private static final int getCaseLocale(ULocale locale, int[] locCache) {\r
 667         int result;\r
 668 \r
 669         if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) {\r
 670             return result;\r
 671         }\r
 672 \r
 673         result=LOC_ROOT;\r
 674 \r
 675         String language=locale.getLanguage();\r
 676         if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) {\r
 677             result=LOC_TURKISH;\r
 678         } else if(language.equals("lt") || language.equals("lit")) {\r
 679             result=LOC_LITHUANIAN;\r
 680         }\r
 681 \r
 682         if(locCache!=null) {\r
 683             locCache[0]=result;\r
 684         }\r
 685         return result;\r
 686     }\r
 687 \r
 688     /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */\r
 689     private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {\r
 690         int c;\r
 691         int props;\r
 692 \r
 693         if(iter==null) {\r
 694             return false;\r
 695         }\r
 696 \r
 697         for(iter.reset(dir); (c=iter.next())>=0;) {\r
 698             props=trie.getCodePointValue(c);\r
 699             if(getTypeFromProps(props)!=NONE) {\r
 700                 return true; /* followed by cased letter */\r
 701             } else if(c==0x307 || (props&(EXCEPTION|CASE_IGNORABLE))==CASE_IGNORABLE) {\r
 702                 /* case-ignorable, continue with the loop */\r
 703             } else {\r
 704                 return false; /* not ignorable */\r
 705             }\r
 706         }\r
 707 \r
 708         return false; /* not followed by cased letter */\r
 709     }\r
 710 \r
 711     /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */\r
 712     private final boolean isPrecededBySoftDotted(ContextIterator iter) {\r
 713         int c;\r
 714         int dotType;\r
 715 \r
 716         if(iter==null) {\r
 717             return false;\r
 718         }\r
 719 \r
 720         for(iter.reset(-1); (c=iter.next())>=0;) {\r
 721             dotType=getDotType(c);\r
 722             if(dotType==SOFT_DOTTED) {\r
 723                 return true; /* preceded by TYPE_i */\r
 724             } else if(dotType!=OTHER_ACCENT) {\r
 725                 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */\r
 726             }\r
 727         }\r
 728 \r
 729         return false; /* not preceded by TYPE_i */\r
 730     }\r
 731 \r
 732     /*\r
 733      * See Jitterbug 2344:\r
 734      * The condition After_I for Turkic-lowercasing of U+0307 combining dot above\r
 735      * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because\r
 736      * we made those releases compatible with Unicode 3.2 which had not fixed\r
 737      * a related bug in SpecialCasing.txt.\r
 738      *\r
 739      * From the Jitterbug 2344 text:\r
 740      * ... this bug is listed as a Unicode erratum\r
 741      * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html\r
 742      * <quote>\r
 743      * There are two errors in SpecialCasing.txt.\r
 744      * 1. Missing semicolons on two lines. ... [irrelevant for ICU]\r
 745      * 2. An incorrect context definition. Correct as follows:\r
 746      * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE\r
 747      * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE\r
 748      * ---\r
 749      * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\r
 750      * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE\r
 751      * where the context After_I is defined as:\r
 752      * The last preceding base character was an uppercase I, and there is no\r
 753      * intervening combining character class 230 (ABOVE).\r
 754      * </quote>\r
 755      *\r
 756      * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:\r
 757      *\r
 758      * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.\r
 759      * # This matches the behavior of the canonically equivalent I-dot_above\r
 760      *\r
 761      * See also the description in this place in older versions of uchar.c (revision 1.100).\r
 762      *\r
 763      * Markus W. Scherer 2003-feb-15\r
 764      */\r
 765 \r
 766     /* Is preceded by base character 'I' with no intervening cc=230 ? */\r
 767     private final boolean isPrecededBy_I(ContextIterator iter) {\r
 768         int c;\r
 769         int dotType;\r
 770 \r
 771         if(iter==null) {\r
 772             return false;\r
 773         }\r
 774 \r
 775         for(iter.reset(-1); (c=iter.next())>=0;) {\r
 776             if(c==0x49) {\r
 777                 return true; /* preceded by I */\r
 778             }\r
 779             dotType=getDotType(c);\r
 780             if(dotType!=OTHER_ACCENT) {\r
 781                 return false; /* preceded by different base character (not I), or intervening cc==230 */\r
 782             }\r
 783         }\r
 784 \r
 785         return false; /* not preceded by I */\r
 786     }\r
 787 \r
 788     /* Is followed by one or more cc==230 ? */\r
 789     private final boolean isFollowedByMoreAbove(ContextIterator iter) {\r
 790         int c;\r
 791         int dotType;\r
 792 \r
 793         if(iter==null) {\r
 794             return false;\r
 795         }\r
 796 \r
 797         for(iter.reset(1); (c=iter.next())>=0;) {\r
 798             dotType=getDotType(c);\r
 799             if(dotType==ABOVE) {\r
 800                 return true; /* at least one cc==230 following */\r
 801             } else if(dotType!=OTHER_ACCENT) {\r
 802                 return false; /* next base character, no more cc==230 following */\r
 803             }\r
 804         }\r
 805 \r
 806         return false; /* no more cc==230 following */\r
 807     }\r
 808 \r
 809     /* Is followed by a dot above (without cc==230 in between) ? */\r
 810     private final boolean isFollowedByDotAbove(ContextIterator iter) {\r
 811         int c;\r
 812         int dotType;\r
 813 \r
 814         if(iter==null) {\r
 815             return false;\r
 816         }\r
 817 \r
 818         for(iter.reset(1); (c=iter.next())>=0; ) {\r
 819             if(c==0x307) {\r
 820                 return true;\r
 821             }\r
 822             dotType=getDotType(c);\r
 823             if(dotType!=OTHER_ACCENT) {\r
 824                 return false; /* next base character or cc==230 in between */\r
 825             }\r
 826         }\r
 827 \r
 828         return false; /* no dot above following */\r
 829     }\r
 830 \r
    /*
     * Hardcoded result strings for the conditional special-case mappings
     * (appended to the output buffer by toFullLower() and toFullFolding()).
     * Each one is a small letter followed by U+0307 COMBINING DOT ABOVE,
     * optionally followed by one more combining accent.
     */
    private static final String
        iDot=       "i\u0307",              /* i + dot above */
        jDot=       "j\u0307",              /* j + dot above */
        iOgonekDot= "\u012f\u0307",         /* U+012F (i with ogonek) + dot above */
        iDotGrave=  "i\u0307\u0300",        /* i + dot above + U+0300 (grave) */
        iDotAcute=  "i\u0307\u0301",        /* i + dot above + U+0301 (acute) */
        iDotTilde=  "i\u0307\u0303";        /* i + dot above + U+0303 (tilde) */
 838 \r
 839     /**\r
 840      * Get the full lowercase mapping for c.\r
 841      *\r
 842      * @param c Character to be mapped.\r
 843      * @param iter Character iterator, used for context-sensitive mappings.\r
 844      *             See ContextIterator for details.\r
 845      *             If iter==null then a context-independent result is returned.\r
 846      * @param out If the mapping result is a string, then it is appended to out.\r
 847      * @param locale Locale ID for locale-dependent mappings.\r
 848      * @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing\r
 849      *                 the locale ID for subsequent calls.\r
 850      *                 Can be null.\r
 851      * @return Output code point or string length, see MAX_STRING_LENGTH.\r
 852      *\r
 853      * @see ContextIterator\r
 854      * @see #MAX_STRING_LENGTH\r
 855      * @internal\r
 856      */\r
 857     public final int toFullLower(int c, ContextIterator iter,\r
 858                                  StringBuffer out,\r
 859                                  ULocale locale, int[] locCache) {\r
 860         int result, props;\r
 861 \r
 862         result=c;\r
 863         props=trie.getCodePointValue(c);\r
 864         if(!propsHasException(props)) {\r
 865             if(getTypeFromProps(props)>=UPPER) {\r
 866                 result=c+getDelta(props);\r
 867             }\r
 868         } else {\r
 869             int excOffset=getExceptionsOffset(props), excOffset2;\r
 870             int excWord=exceptions[excOffset++];\r
 871             int full;\r
 872 \r
 873             excOffset2=excOffset;\r
 874 \r
 875             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {\r
 876                 /* use hardcoded conditions and mappings */\r
 877                 int loc=getCaseLocale(locale, locCache);\r
 878 \r
 879                 /*\r
 880                  * Test for conditional mappings first\r
 881                  *   (otherwise the unconditional default mappings are always taken),\r
 882                  * then test for characters that have unconditional mappings in SpecialCasing.txt,\r
 883                  * then get the UnicodeData.txt mappings.\r
 884                  */\r
 885                 if( loc==LOC_LITHUANIAN &&\r
 886                         /* base characters, find accents above */\r
 887                         (((c==0x49 || c==0x4a || c==0x12e) &&\r
 888                             isFollowedByMoreAbove(iter)) ||\r
 889                         /* precomposed with accent above, no need to find one */\r
 890                         (c==0xcc || c==0xcd || c==0x128))\r
 891                 ) {\r
 892                     /*\r
 893                         # Lithuanian\r
 894 \r
 895                         # Lithuanian retains the dot in a lowercase i when followed by accents.\r
 896 \r
 897                         # Introduce an explicit dot above when lowercasing capital I's and J's\r
 898                         # whenever there are more accents above.\r
 899                         # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)\r
 900 \r
 901                         0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I\r
 902                         004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J\r
 903                         012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK\r
 904                         00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE\r
 905                         00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE\r
 906                         0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE\r
 907                      */\r
 908                     switch(c) {\r
 909                     case 0x49:  /* LATIN CAPITAL LETTER I */\r
 910                         out.append(iDot);\r
 911                         return 2;\r
 912                     case 0x4a:  /* LATIN CAPITAL LETTER J */\r
 913                         out.append(jDot);\r
 914                         return 2;\r
 915                     case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */\r
 916                         out.append(iOgonekDot);\r
 917                         return 2;\r
 918                     case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */\r
 919                         out.append(iDotGrave);\r
 920                         return 3;\r
 921                     case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */\r
 922                         out.append(iDotAcute);\r
 923                         return 3;\r
 924                     case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */\r
 925                         out.append(iDotTilde);\r
 926                         return 3;\r
 927                     default:\r
 928                         return 0; /* will not occur */\r
 929                     }\r
 930                 /* # Turkish and Azeri */\r
 931                 } else if(loc==LOC_TURKISH && c==0x130) {\r
 932                     /*\r
 933                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri\r
 934                         # The following rules handle those cases.\r
 935 \r
 936                         0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE\r
 937                         0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE\r
 938                      */\r
 939                     return 0x69;\r
 940                 } else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {\r
 941                     /*\r
 942                         # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.\r
 943                         # This matches the behavior of the canonically equivalent I-dot_above\r
 944 \r
 945                         0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\r
 946                         0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE\r
 947                      */\r
 948                     return 0; /* remove the dot (continue without output) */\r
 949                 } else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {\r
 950                     /*\r
 951                         # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.\r
 952 \r
 953                         0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I\r
 954                         0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I\r
 955                      */\r
 956                     return 0x131;\r
 957                 } else if(c==0x130) {\r
 958                     /*\r
 959                         # Preserve canonical equivalence for I with dot. Turkic is handled below.\r
 960 \r
 961                         0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE\r
 962                      */\r
 963                     out.append(iDot);\r
 964                     return 2;\r
 965                 } else if(  c==0x3a3 &&\r
 966                             !isFollowedByCasedLetter(iter, 1) &&\r
 967                             isFollowedByCasedLetter(iter, -1) /* -1=preceded */\r
 968                 ) {\r
 969                     /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */\r
 970                     /*\r
 971                         # Special case for final form of sigma\r
 972 \r
 973                         03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA\r
 974                      */\r
 975                     return 0x3c2; /* greek small final sigma */\r
 976                 } else {\r
 977                     /* no known conditional special case mapping, use a normal mapping */\r
 978                 }\r
 979             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {\r
 980                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);\r
 981                 full=(int)value&FULL_LOWER;\r
 982                 if(full!=0) {\r
 983                     /* start of full case mapping strings */\r
 984                     excOffset=(int)(value>>32)+1;\r
 985 \r
 986                     /* set the output pointer to the lowercase mapping */\r
 987                     out.append(new String(exceptions, excOffset, full));\r
 988 \r
 989                     /* return the string length */\r
 990                     return full;\r
 991                 }\r
 992             }\r
 993 \r
 994             if(hasSlot(excWord, EXC_LOWER)) {\r
 995                 result=getSlotValue(excWord, EXC_LOWER, excOffset2);\r
 996             }\r
 997         }\r
 998 \r
 999         return (result==c) ? ~result : result;\r
1000     }\r
1001 \r
1002     /* internal */\r
1003     private final int toUpperOrTitle(int c, ContextIterator iter,\r
1004                                      StringBuffer out,\r
1005                                      ULocale locale, int[] locCache,\r
1006                                      boolean upperNotTitle) {\r
1007         int result;\r
1008         int props;\r
1009 \r
1010         result=c;\r
1011         props=trie.getCodePointValue(c);\r
1012         if(!propsHasException(props)) {\r
1013             if(getTypeFromProps(props)==LOWER) {\r
1014                 result=c+getDelta(props);\r
1015             }\r
1016         } else {\r
1017             int excOffset=getExceptionsOffset(props), excOffset2;\r
1018             int excWord=exceptions[excOffset++];\r
1019             int full, index;\r
1020 \r
1021             excOffset2=excOffset;\r
1022 \r
1023             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {\r
1024                 /* use hardcoded conditions and mappings */\r
1025                 int loc=getCaseLocale(locale, locCache);\r
1026 \r
1027                 if(loc==LOC_TURKISH && c==0x69) {\r
1028                     /*\r
1029                         # Turkish and Azeri\r
1030 \r
1031                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri\r
1032                         # The following rules handle those cases.\r
1033 \r
1034                         # When uppercasing, i turns into a dotted capital I\r
1035 \r
1036                         0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I\r
1037                         0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I\r
1038                     */\r
1039                     return 0x130;\r
1040                 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {\r
1041                     /*\r
1042                         # Lithuanian\r
1043 \r
1044                         # Lithuanian retains the dot in a lowercase i when followed by accents.\r
1045 \r
1046                         # Remove DOT ABOVE after "i" with upper or titlecase\r
1047 \r
1048                         0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE\r
1049                      */\r
1050                     return 0; /* remove the dot (continue without output) */\r
1051                 } else {\r
1052                     /* no known conditional special case mapping, use a normal mapping */\r
1053                 }\r
1054             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {\r
1055                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);\r
1056                 full=(int)value&0xffff;\r
1057 \r
1058                 /* start of full case mapping strings */\r
1059                 excOffset=(int)(value>>32)+1;\r
1060 \r
1061                 /* skip the lowercase and case-folding result strings */\r
1062                 excOffset+=full&FULL_LOWER;\r
1063                 full>>=4;\r
1064                 excOffset+=full&0xf;\r
1065                 full>>=4;\r
1066 \r
1067                 if(upperNotTitle) {\r
1068                     full&=0xf;\r
1069                 } else {\r
1070                     /* skip the uppercase result string */\r
1071                     excOffset+=full&0xf;\r
1072                     full=(full>>4)&0xf;\r
1073                 }\r
1074 \r
1075                 if(full!=0) {\r
1076                     /* set the output pointer to the result string */\r
1077                     out.append(new String(exceptions, excOffset, full));\r
1078 \r
1079                     /* return the string length */\r
1080                     return full;\r
1081                 }\r
1082             }\r
1083 \r
1084             if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {\r
1085                 index=EXC_TITLE;\r
1086             } else if(hasSlot(excWord, EXC_UPPER)) {\r
1087                 /* here, titlecase is same as uppercase */\r
1088                 index=EXC_UPPER;\r
1089             } else {\r
1090                 return ~c;\r
1091             }\r
1092             result=getSlotValue(excWord, index, excOffset2);\r
1093         }\r
1094 \r
1095         return (result==c) ? ~result : result;\r
1096     }\r
1097 \r
1098     public final int toFullUpper(int c, ContextIterator iter,\r
1099                                  StringBuffer out,\r
1100                                  ULocale locale, int[] locCache) {\r
1101         return toUpperOrTitle(c, iter, out, locale, locCache, true);\r
1102     }\r
1103 \r
1104     public final int toFullTitle(int c, ContextIterator iter,\r
1105                                  StringBuffer out,\r
1106                                  ULocale locale, int[] locCache) {\r
1107         return toUpperOrTitle(c, iter, out, locale, locCache, false);\r
1108     }\r
1109 \r
1110     /* case folding ------------------------------------------------------------- */\r
1111 \r
1112     /*\r
1113      * Case folding is similar to lowercasing.\r
1114      * The result may be a simple mapping, i.e., a single code point, or\r
1115      * a full mapping, i.e., a string.\r
1116      * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,\r
1117      * then only the lowercase mapping is stored.\r
1118      *\r
1119      * Some special cases are hardcoded because their conditions cannot be\r
1120      * parsed and processed from CaseFolding.txt.\r
1121      *\r
1122      * Unicode 3.2 CaseFolding.txt specifies for its status field:\r
1123 \r
1124     # C: common case folding, common mappings shared by both simple and full mappings.\r
1125     # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.\r
1126     # S: simple case folding, mappings to single characters where different from F.\r
1127     # T: special case for uppercase I and dotted uppercase I\r
1128     #    - For non-Turkic languages, this mapping is normally not used.\r
1129     #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.\r
1130     #\r
1131     # Usage:\r
1132     #  A. To do a simple case folding, use the mappings with status C + S.\r
1133     #  B. To do a full case folding, use the mappings with status C + F.\r
1134     #\r
1135     #    The mappings with status T can be used or omitted depending on the desired case-folding\r
1136     #    behavior. (The default option is to exclude them.)\r
1137 \r
1138      * Unicode 3.2 has 'T' mappings as follows:\r
1139 \r
1140     0049; T; 0131; # LATIN CAPITAL LETTER I\r
1141     0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE\r
1142 \r
1143      * while the default mappings for these code points are:\r
1144 \r
1145     0049; C; 0069; # LATIN CAPITAL LETTER I\r
1146     0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE\r
1147 \r
1148      * U+0130 has no simple case folding (simple-case-folds to itself).\r
1149      */\r
1150 \r
    /**
     * Bit mask for getting just the options from a string compare options word
     * that are relevant for case folding (of a single string or code point).
     * fold() and toFullFolding() compare the masked options with
     * UCharacter.FOLD_CASE_DEFAULT: equal selects the default mappings,
     * anything else selects the Turkic mappings for U+0049 and U+0130.
     * @internal
     */
    private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
1157     \r
1158     /* return the simple case folding mapping for c */\r
1159     public final int fold(int c, int options) {\r
1160         int props=trie.getCodePointValue(c);\r
1161         if(!propsHasException(props)) {\r
1162             if(getTypeFromProps(props)>=UPPER) {\r
1163                 c+=getDelta(props);\r
1164             }\r
1165         } else {\r
1166             int excOffset=getExceptionsOffset(props);\r
1167             int excWord=exceptions[excOffset++];\r
1168             int index;\r
1169             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {\r
1170                 /* special case folding mappings, hardcoded */\r
1171                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {\r
1172                     /* default mappings */\r
1173                     if(c==0x49) {\r
1174                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */\r
1175                         return 0x69;\r
1176                     } else if(c==0x130) {\r
1177                         /* no simple case folding for U+0130 */\r
1178                         return c;\r
1179                     }\r
1180                 } else {\r
1181                     /* Turkic mappings */\r
1182                     if(c==0x49) {\r
1183                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */\r
1184                         return 0x131;\r
1185                     } else if(c==0x130) {\r
1186                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */\r
1187                         return 0x69;\r
1188                     }\r
1189                 }\r
1190             }\r
1191             if(hasSlot(excWord, EXC_FOLD)) {\r
1192                 index=EXC_FOLD;\r
1193             } else if(hasSlot(excWord, EXC_LOWER)) {\r
1194                 index=EXC_LOWER;\r
1195             } else {\r
1196                 return c;\r
1197             }\r
1198             c=getSlotValue(excWord, index, excOffset);\r
1199         }\r
1200         return c;\r
1201     }\r
1202 \r
1203     /*\r
1204      * Issue for canonical caseless match (UAX #21):\r
1205      * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve\r
1206      * canonical equivalence, unlike default-option casefolding.\r
1207      * For example, I-grave and I + grave fold to strings that are not canonically\r
1208      * equivalent.\r
1209      * For more details, see the comment in unorm_compare() in unorm.cpp\r
1210      * and the intermediate prototype changes for Jitterbug 2021.\r
1211      * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)\r
1212      *\r
1213      * This did not get fixed because it appears that it is not possible to fix\r
1214      * it for uppercase and lowercase characters (I-grave vs. i-grave)\r
1215      * together in a way that they still fold to common result strings.\r
1216      */\r
1217 \r
1218     public final int toFullFolding(int c, StringBuffer out, int options) {\r
1219         int result;\r
1220         int props;\r
1221 \r
1222         result=c;\r
1223         props=trie.getCodePointValue(c);\r
1224         if(!propsHasException(props)) {\r
1225             if(getTypeFromProps(props)>=UPPER) {\r
1226                 result=c+getDelta(props);\r
1227             }\r
1228         } else {\r
1229             int excOffset=getExceptionsOffset(props), excOffset2;\r
1230             int excWord=exceptions[excOffset++];\r
1231             int full, index;\r
1232 \r
1233             excOffset2=excOffset;\r
1234 \r
1235             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {\r
1236                 /* use hardcoded conditions and mappings */\r
1237                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {\r
1238                     /* default mappings */\r
1239                     if(c==0x49) {\r
1240                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */\r
1241                         return 0x69;\r
1242                     } else if(c==0x130) {\r
1243                         /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */\r
1244                         out.append(iDot);\r
1245                         return 2;\r
1246                     }\r
1247                 } else {\r
1248                     /* Turkic mappings */\r
1249                     if(c==0x49) {\r
1250                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */\r
1251                         return 0x131;\r
1252                     } else if(c==0x130) {\r
1253                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */\r
1254                         return 0x69;\r
1255                     }\r
1256                 }\r
1257             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {\r
1258                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);\r
1259                 full=(int)value&0xffff;\r
1260 \r
1261                 /* start of full case mapping strings */\r
1262                 excOffset=(int)(value>>32)+1;\r
1263 \r
1264                 /* skip the lowercase result string */\r
1265                 excOffset+=full&FULL_LOWER;\r
1266                 full=(full>>4)&0xf;\r
1267 \r
1268                 if(full!=0) {\r
1269                     /* set the output pointer to the result string */\r
1270                     out.append(new String(exceptions, excOffset, full));\r
1271 \r
1272                     /* return the string length */\r
1273                     return full;\r
1274                 }\r
1275             }\r
1276 \r
1277             if(hasSlot(excWord, EXC_FOLD)) {\r
1278                 index=EXC_FOLD;\r
1279             } else if(hasSlot(excWord, EXC_LOWER)) {\r
1280                 index=EXC_LOWER;\r
1281             } else {\r
1282                 return ~c;\r
1283             }\r
1284             result=getSlotValue(excWord, index, excOffset2);\r
1285         }\r
1286 \r
1287         return (result==c) ? ~result : result;\r
1288     }\r
1289 \r
1290     // data members -------------------------------------------------------- ***\r
    /* index values read from the data file; addressed with the IX_ constants */
    private int indexes[];
    /*
     * Exceptions data: each entry starts with an exception word at
     * getExceptionsOffset(props), followed by its optional slot values;
     * the full case mapping strings are also read from this array.
     */
    private char exceptions[];
    /* reverse case folding ("unfold") data; see the UNFOLD_ constants */
    private char unfold[];

    /* maps each code point to its case properties word, see trie.getCodePointValue(c) */
    private CharTrie trie;
1296 \r
1297     // data format constants ----------------------------------------------- ***\r
    /* name and type of the binary data file; combined below into "ucase.icu" */
    private static final String DATA_NAME="ucase";
    private static final String DATA_TYPE="icu";
    private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;

    /* format "cAsE": the ASCII bytes 'c' 'A' 'S' 'E', passed to ICUBinary.readHeader() */
    private static final byte FMT[]={ 0x63, 0x41, 0x53, 0x45 };

    /* indexes into indexes[] (the commented-out entries are not used by this class) */
    private static final int IX_INDEX_TOP=0;
    //private static final int IX_LENGTH=1;
    //private static final int IX_TRIE_SIZE=2;
    private static final int IX_EXC_LENGTH=3;
    private static final int IX_UNFOLD_LENGTH=4;

    //private static final int IX_MAX_FULL_LENGTH=15;
    /* NOTE(review): presumably one past the last defined index, i.e. the minimum length of indexes[] -- confirm against readData() */
    private static final int IX_TOP=16;
1314 \r
1315     // definitions for 16-bit case properties word ------------------------- ***\r
1316 \r
    /*
     * 2-bit constants for types of cased characters.
     * The type is stored in bits 1..0 of the properties word (see getTypeFromProps()).
     * The numeric order matters: the lowercasing and folding code tests
     * type>=UPPER to cover both UPPER and TITLE.
     */
    public static final int TYPE_MASK=3;    /* mask for the two type bits */
    public static final int NONE=0;         /* not a cased letter */
    public static final int LOWER=1;
    public static final int UPPER=2;
    public static final int TITLE=3;
1323 \r
1324     private static final int getTypeFromProps(int props) {\r
1325         return props&TYPE_MASK;\r
1326     }\r
1327 \r
    /* single-bit flags in the properties word (bit 2 and bit 3) */
    /* NOTE(review): presumably marks case-sensitive characters; not used in this part of the file -- confirm */
    private static final int SENSITIVE=     4;
    /* NOTE(review): presumably the bit that propsHasException() tests -- confirm */
    private static final int EXCEPTION=     8;

    /* 2-bit dot/accent type in bits 5..4; the values below are what the callers of getDotType() compare against */
    private static final int DOT_MASK=      0x30;
    //private static final int NO_DOT=        0;      /* normal characters with cc=0 */
    private static final int SOFT_DOTTED=   0x10;   /* soft-dotted characters with cc=0 */
    private static final int ABOVE=         0x20;   /* "above" accents with cc=230 */
    private static final int OTHER_ACCENT=  0x30;   /* other accent character (0<cc!=230) */

    /* no exception: bits 15..6 are a 10-bit signed case mapping delta */
    private static final int DELTA_SHIFT=   6;
    //private static final int DELTA_MASK=    0xffc0;
    //private static final int MAX_DELTA=     0x1ff;
    //private static final int MIN_DELTA=     (-MAX_DELTA-1);
1342 \r
1343     private static final int getDelta(int props) {\r
1344         return (short)props>>DELTA_SHIFT;\r
1345     }\r
1346 \r
    /*
     * case-ignorable uses one of the delta bits, see gencase/store.c
     * (0x40 is bit 6, the lowest bit of the delta field).
     * Callers therefore test it together with EXCEPTION:
     * (props&(EXCEPTION|CASE_IGNORABLE))==CASE_IGNORABLE
     */
    private static final int CASE_IGNORABLE=0x40;

    /* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
    private static final int EXC_SHIFT=     4;
    //private static final int EXC_MASK=      0xfff0;
    //private static final int MAX_EXCEPTIONS=0x1000;
1354 \r
1355     /* definitions for 16-bit main exceptions word ------------------------------ */\r
1356 \r
    /*
     * first 8 bits indicate values in optional slots;
     * the constants below are the slot indexes passed to hasSlot(),
     * getSlotValue() and getSlotValueAndOffset()
     */
    private static final int EXC_LOWER=0;           /* simple lowercase mapping */
    private static final int EXC_FOLD=1;            /* simple case folding mapping */
    private static final int EXC_UPPER=2;           /* simple uppercase mapping */
    private static final int EXC_TITLE=3;           /* simple titlecase mapping */
    //private static final int EXC_4=4;           /* reserved */
    //private static final int EXC_5=5;           /* reserved */
    private static final int EXC_CLOSURE=6;
    private static final int EXC_FULL_MAPPINGS=7;   /* lengths word for the full mapping strings, see FULL_LOWER */
    //private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */

    /* each slot is 2 uint16_t instead of 1 */
    private static final int EXC_DOUBLE_SLOTS=          0x100;

    /* reserved: exception bits 11..9 */

    /* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
    private static final int EXC_DOT_SHIFT=8;

    /* normally stored in the main word, but pushed out for larger exception indexes */
    //private static final int EXC_DOT_MASK=              0x3000;
    //private static final int EXC_NO_DOT=                0;
    //private static final int EXC_SOFT_DOTTED=           0x1000;
    //private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
    //private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */

    /*
     * complex/conditional mappings:
     * EXC_CONDITIONAL_SPECIAL selects the hardcoded locale/context-dependent
     * mappings in toFullLower() and toUpperOrTitle();
     * EXC_CONDITIONAL_FOLD selects the hardcoded default/Turkic mappings
     * in fold() and toFullFolding()
     */
    private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
    private static final int EXC_CONDITIONAL_FOLD=      0x8000;
1386 \r
    /*
     * definitions for lengths word for full case mappings:
     * four 4-bit string lengths, for the lowercase string in bits 3..0,
     * then case folding, uppercase and titlecase in the next three nibbles;
     * the strings are stored in this order after the lengths word
     */
    private static final int FULL_LOWER=    0xf;
    //private static final int FULL_FOLDING=  0xf0;
    //private static final int FULL_UPPER=    0xf00;
    //private static final int FULL_TITLE=    0xf000;

    /* maximum lengths */
    //private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
    private static final int CLOSURE_MAX_LENGTH=0xf;

    /* constants for reverse case folding ("unfold") data */
    /* NOTE(review): presumably indexes of header values at the start of unfold[] -- confirm against the code that reads unfold[] */
    private static final int UNFOLD_ROWS=0;
    private static final int UNFOLD_ROW_WIDTH=1;
    private static final int UNFOLD_STRING_WIDTH=2;
1401 }\r