jars/icu4j-4_2_1-src/src/com/ibm/icu/text/CanonicalIterator.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 1996-2008, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.text;\r
   8 \r
   9 import com.ibm.icu.lang.*;\r
  10 import java.util.*;\r
  11 import com.ibm.icu.impl.NormalizerImpl;\r
  12 import com.ibm.icu.impl.USerializedSet;\r
  13 import com.ibm.icu.impl.Utility;\r
  14 \r
  15 /**\r
  16  * This class allows one to iterate through all the strings that are canonically equivalent to a given\r
  17  * string. For example, here are some sample results:\r
  18  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}\r
  19  * <pre>\r
  20  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}\r
  21  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}\r
  22  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}\r
  23  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}\r
  24  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}\r
  25  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}\r
  26  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}\r
  27  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}\r
  28  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}\r
  29 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}\r
  30 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}\r
  31 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}\r
  32  *</pre>\r
  33  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,\r
  34  * since it has not been optimized for that situation.\r
  35  * @author M. Davis\r
  36  * @stable ICU 2.4\r
  37  */\r
  38 \r
  39 public final class CanonicalIterator {\r
  40     /**\r
  41      * Construct a CanonicalIterator object\r
  42      * @param source string to get results for\r
  43      * @stable ICU 2.4\r
  44      */\r
  45     public CanonicalIterator(String source) {\r
  46         setSource(source);\r
  47     }\r
  48 \r
  49     /**\r
  50      * Gets the NFD form of the current source we are iterating over.\r
  51      * @return gets the source: NOTE: it is the NFD form of the source originally passed in\r
  52      * @stable ICU 2.4\r
  53      */\r
  54     public String getSource() {\r
  55       return source;\r
  56     }\r
  57 \r
  58     /**\r
  59      * Resets the iterator so that one can start again from the beginning.\r
  60      * @stable ICU 2.4\r
  61      */\r
  62     public void reset() {\r
  63         done = false;\r
  64         for (int i = 0; i < current.length; ++i) {\r
  65             current[i] = 0;\r
  66         }\r
  67     }\r
  68 \r
  69     /**\r
  70      * Get the next canonically equivalent string.\r
  71      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>\r
  72      * @return the next string that is canonically equivalent. The value null is returned when\r
  73      * the iteration is done.\r
  74      * @stable ICU 2.4\r
  75      */\r
  76     public String next() {\r
  77         if (done) return null;\r
  78 \r
  79         // construct return value\r
  80 \r
  81         buffer.setLength(0); // delete old contents\r
  82         for (int i = 0; i < pieces.length; ++i) {\r
  83             buffer.append(pieces[i][current[i]]);\r
  84         }\r
  85         String result = buffer.toString();\r
  86 \r
  87         // find next value for next time\r
  88 \r
  89         for (int i = current.length - 1; ; --i) {\r
  90             if (i < 0) {\r
  91                 done = true;\r
  92                 break;\r
  93             }\r
  94             current[i]++;\r
  95             if (current[i] < pieces[i].length) break; // got sequence\r
  96             current[i] = 0;\r
  97         }\r
  98         return result;\r
  99     }\r
 100 \r
 101     /**\r
 102      * Set a new source for this iterator. Allows object reuse.\r
 103      * @param newSource the source string to iterate against. This allows the same iterator to be used\r
 104      * while changing the source string, saving object creation.\r
 105      * @stable ICU 2.4\r
 106      */\r
 107     public void setSource(String newSource) {\r
 108         source = Normalizer.normalize(newSource, Normalizer.NFD);\r
 109         done = false;\r
 110 \r
 111         // catch degenerate case\r
 112         if (newSource.length() == 0) {\r
 113             pieces = new String[1][];\r
 114             current = new int[1];\r
 115             pieces[0] = new String[]{""};\r
 116             return;\r
 117         }\r
 118 \r
 119         // find the segments\r
 120         List segmentList = new ArrayList();\r
 121         int cp;\r
 122         int start = 0;\r
 123 \r
 124         // i should be the end of the first code point\r
 125         // break up the string into segements\r
 126 \r
 127         int i = UTF16.findOffsetFromCodePoint(source, 1);\r
 128 \r
 129         for (; i < source.length(); i += UTF16.getCharCount(cp)) {\r
 130             cp = UTF16.charAt(source, i);\r
 131             if (NormalizerImpl.isCanonSafeStart(cp)) {\r
 132                 segmentList.add(source.substring(start, i)); // add up to i\r
 133                 start = i;\r
 134             }\r
 135         }\r
 136         segmentList.add(source.substring(start, i)); // add last one\r
 137 \r
 138         // allocate the arrays, and find the strings that are CE to each segment\r
 139         pieces = new String[segmentList.size()][];\r
 140         current = new int[segmentList.size()];\r
 141         for (i = 0; i < pieces.length; ++i) {\r
 142             if (PROGRESS) System.out.println("SEGMENT");\r
 143             pieces[i] = getEquivalents((String) segmentList.get(i));\r
 144         }\r
 145     }\r
 146 \r
 147     /**\r
 148      * Simple implementation of permutation.\r
 149      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>\r
 150      * @param source the string to find permutations for\r
 151      * @param skipZeros set to true to skip characters with canonical combining class zero\r
 152      * @param output the set to add the results to\r
 153      * @internal\r
 154      * @deprecated This API is ICU internal only.\r
 155      */\r
 156     public static void permute(String source, boolean skipZeros, Set output) {\r
 157         // TODO: optimize\r
 158         //if (PROGRESS) System.out.println("Permute: " + source);\r
 159 \r
 160         // optimization:\r
 161         // if zero or one character, just return a set with it\r
 162         // we check for length < 2 to keep from counting code points all the time\r
 163         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {\r
 164             output.add(source);\r
 165             return;\r
 166         }\r
 167 \r
 168         // otherwise iterate through the string, and recursively permute all the other characters\r
 169         Set subpermute = new HashSet();\r
 170         int cp;\r
 171         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {\r
 172             cp = UTF16.charAt(source, i);\r
 173 \r
 174             // optimization:\r
 175             // if the character is canonical combining class zero,\r
 176             // don't permute it\r
 177             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {\r
 178                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));\r
 179                 continue;\r
 180             }\r
 181 \r
 182             // see what the permutations of the characters before and after this one are\r
 183             subpermute.clear();\r
 184             permute(source.substring(0,i)\r
 185                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);\r
 186 \r
 187             // prefix this character to all of them\r
 188             String chStr = UTF16.valueOf(source, i);\r
 189             Iterator it = subpermute.iterator();\r
 190             while (it.hasNext()) {\r
 191                 String piece = chStr + (String) it.next();\r
 192                 //if (PROGRESS) System.out.println("  Piece: " + piece);\r
 193                 output.add(piece);\r
 194             }\r
 195         }\r
 196     }\r
 197 \r
 198     // FOR TESTING\r
 199 \r
 200     /*\r
 201      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.\r
 202      *@internal\r
 203      *\r
 204     public static UnicodeSet getSafeStart() {\r
 205         return (UnicodeSet) SAFE_START.clone();\r
 206     }\r
 207     */\r
 208     /*\r
 209      *@return the set of characters whose decompositions start with the given character\r
 210      *@internal\r
 211      *\r
 212     public static UnicodeSet getStarts(int cp) {\r
 213         UnicodeSet result = AT_START.get(cp);\r
 214         if (result == null) result = EMPTY;\r
 215         return (UnicodeSet) result.clone();\r
 216     }\r
 217     */\r
 218 \r
 219     // ===================== PRIVATES ==============================\r
 220 \r
 221     // debug\r
 222     private static boolean PROGRESS = false; // debug progress\r
 223     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;\r
 224     private static boolean SKIP_ZEROS = true;\r
 225 \r
 226     // fields\r
 227     private String source;\r
 228     private boolean done;\r
 229     private String[][] pieces;\r
 230     private int[] current;\r
 231     // Note: C will need two more fields, since arrays there don't have lengths\r
 232     // int pieces_length;\r
 233     // int[] pieces_lengths;\r
 234 \r
 235     // transient fields\r
 236     private transient StringBuffer buffer = new StringBuffer();\r
 237 \r
 238 \r
 239     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.\r
 240     private String[] getEquivalents(String segment) {\r
 241         Set result = new HashSet();\r
 242         Set basic = getEquivalents2(segment);\r
 243         Set permutations = new HashSet();\r
 244 \r
 245         // now get all the permutations\r
 246         // add only the ones that are canonically equivalent\r
 247         // TODO: optimize by not permuting any class zero.\r
 248         Iterator it = basic.iterator();\r
 249         while (it.hasNext()) {\r
 250             String item = (String) it.next();\r
 251             permutations.clear();\r
 252             permute(item, SKIP_ZEROS, permutations);\r
 253             Iterator it2 = permutations.iterator();\r
 254             while (it2.hasNext()) {\r
 255                 String possible = (String) it2.next();\r
 256 \r
 257 /*\r
 258                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);\r
 259                 if (attempt.equals(segment)) {\r
 260 */\r
 261                 if (Normalizer.compare(possible, segment,0)==0) {\r
 262 \r
 263                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));\r
 264                     result.add(possible);\r
 265 \r
 266                 } else {\r
 267                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));\r
 268                 }\r
 269             }\r
 270         }\r
 271 \r
 272         // convert into a String[] to clean up storage\r
 273         String[] finalResult = new String[result.size()];\r
 274         result.toArray(finalResult);\r
 275         return finalResult;\r
 276     }\r
 277 \r
 278 \r
 279     private Set getEquivalents2(String segment) {\r
 280 \r
 281         Set result = new HashSet();\r
 282 \r
 283         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));\r
 284 \r
 285         result.add(segment);\r
 286         StringBuffer workingBuffer = new StringBuffer();\r
 287 \r
 288         // cycle through all the characters\r
 289         int cp=0;\r
 290         int[] range = new int[2];\r
 291         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {\r
 292 \r
 293             // see if any character is at the start of some decomposition\r
 294             cp = UTF16.charAt(segment, i);\r
 295             USerializedSet starts = new USerializedSet();\r
 296 \r
 297             if (!NormalizerImpl.getCanonStartSet(cp, starts)) {\r
 298               continue;\r
 299             }\r
 300             int j=0;\r
 301             // if so, see which decompositions match\r
 302             int rangeCount = starts.countRanges();\r
 303             for(j = 0; j < rangeCount; ++j) {\r
 304                 starts.getRange(j, range);\r
 305                 int end=range[1];\r
 306                 for (int cp2 = range[0]; cp2 <= end; ++cp2) {\r
 307                     Set remainder = extract(cp2, segment, i, workingBuffer);\r
 308                     if (remainder == null) continue;\r
 309 \r
 310                     // there were some matches, so add all the possibilities to the set.\r
 311                     String prefix= segment.substring(0,i);\r
 312                     prefix += UTF16.valueOf(cp2);\r
 313                     //int el = -1;\r
 314                     Iterator iter = remainder.iterator();\r
 315                     while (iter.hasNext()) {\r
 316                         String item = (String) iter.next();\r
 317                         String toAdd = new String(prefix);\r
 318                         toAdd += item;\r
 319                         result.add(toAdd);\r
 320                         //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));\r
 321                     }\r
 322                 }\r
 323             }\r
 324         }\r
 325         return result;\r
 326         /*\r
 327         Set result = new HashSet();\r
 328         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));\r
 329         result.add(segment);\r
 330         StringBuffer workingBuffer = new StringBuffer();\r
 331 \r
 332         // cycle through all the characters\r
 333         int cp;\r
 334 \r
 335         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {\r
 336             // see if any character is at the start of some decomposition\r
 337             cp = UTF16.charAt(segment, i);\r
 338             NormalizerImpl.getCanonStartSet(c,fillSet)\r
 339             UnicodeSet starts = AT_START.get(cp);\r
 340             if (starts == null) continue;\r
 341             UnicodeSetIterator usi = new UnicodeSetIterator(starts);\r
 342             // if so, see which decompositions match\r
 343             while (usi.next()) {\r
 344                 int cp2 = usi.codepoint;\r
 345                 // we know that there are no strings in it\r
 346                 // so we don't have to check CharacterIterator.IS_STRING\r
 347                 Set remainder = extract(cp2, segment, i, workingBuffer);\r
 348                 if (remainder == null) continue;\r
 349 \r
 350                 // there were some matches, so add all the possibilities to the set.\r
 351                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);\r
 352                 Iterator it = remainder.iterator();\r
 353                 while (it.hasNext()) {\r
 354                     String item = (String) it.next();\r
 355                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));\r
 356                     result.add(prefix + item);\r
 357                 }\r
 358             }\r
 359         }\r
 360         return result;\r
 361         */\r
 362     }\r
 363 \r
 364     /**\r
 365      * See if the decomposition of cp2 is at segment starting at segmentPos\r
 366      * (with canonical rearrangment!)\r
 367      * If so, take the remainder, and return the equivalents\r
 368      */\r
 369     private Set extract(int comp, String segment, int segmentPos, StringBuffer buf) {\r
 370         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))\r
 371             + ", " + Utility.hex(segment.substring(segmentPos)));\r
 372 \r
 373         //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);\r
 374         String decomp = Normalizer.normalize(comp, Normalizer.NFD);\r
 375 \r
 376         // See if it matches the start of segment (at segmentPos)\r
 377         boolean ok = false;\r
 378         int cp;\r
 379         int decompPos = 0;\r
 380         int decompCp = UTF16.charAt(decomp,0);\r
 381         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char\r
 382         //int decompClass = getClass(decompCp);\r
 383         buf.setLength(0); // initialize working buffer, shared among callees\r
 384 \r
 385         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {\r
 386             cp = UTF16.charAt(segment, i);\r
 387             if (cp == decompCp) { // if equal, eat another cp from decomp\r
 388                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));\r
 389                 if (decompPos == decomp.length()) { // done, have all decomp characters!\r
 390                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars\r
 391                     ok = true;\r
 392                     break;\r
 393                 }\r
 394                 decompCp = UTF16.charAt(decomp, decompPos);\r
 395                 decompPos += UTF16.getCharCount(decompCp);\r
 396                 //decompClass = getClass(decompCp);\r
 397             } else {\r
 398                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));\r
 399                 // brute force approach\r
 400                 UTF16.append(buf, cp);\r
 401                 /* TODO: optimize\r
 402                 // since we know that the classes are monotonically increasing, after zero\r
 403                 // e.g. 0 5 7 9 0 3\r
 404                 // we can do an optimization\r
 405                 // there are only a few cases that work: zero, less, same, greater\r
 406                 // if both classes are the same, we fail\r
 407                 // if the decomp class < the segment class, we fail\r
 408 \r
 409                 segClass = getClass(cp);\r
 410                 if (decompClass <= segClass) return null;\r
 411                 */\r
 412             }\r
 413         }\r
 414         if (!ok) return null; // we failed, characters left over\r
 415         if (PROGRESS) System.out.println("Matches");\r
 416         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder\r
 417         String remainder = buf.toString();\r
 418 \r
 419         // brute force approach\r
 420         // to check to make sure result is canonically equivalent\r
 421         /*\r
 422         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);\r
 423         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;\r
 424         */\r
 425 \r
 426         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;\r
 427 \r
 428         // get the remaining combinations\r
 429         return getEquivalents2(remainder);\r
 430     }\r
 431 \r
 432     /*\r
 433     // TODO: fix once we have a codepoint interface to get the canonical combining class\r
 434     // TODO: Need public access to canonical combining class in UCharacter!\r
 435     private static int getClass(int cp) {\r
 436         return Normalizer.getClass((char)cp);\r
 437     }\r
 438     */\r
 439 \r
 440    // ================= BUILDER =========================\r
 441     // TODO: Flatten this data so it doesn't have to be reconstructed each time!\r
 442 \r
 443     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change\r
 444     private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change\r
 445     static {\r
 446         SET_WITH_NULL_STRING.add("");\r
 447     }\r
 448 \r
 449   //  private static UnicodeSet SAFE_START = new UnicodeSet();\r
 450   //  private static CharMap AT_START = new CharMap();\r
 451 \r
 452         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;\r
 453         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.\r
 454   //  private static int LAST_UNICODE = 0x10FFFF;\r
 455     /*\r
 456     static {\r
 457         buildData();\r
 458     }\r
 459     */\r
 460     /*\r
 461     private static void buildData() {\r
 462 \r
 463         if (PROGRESS) System.out.println("Getting Safe Start");\r
 464         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {\r
 465             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');\r
 466             int cc = UCharacter.getCombiningClass(cp);\r
 467             if (cc == 0) SAFE_START.add(cp);\r
 468             // will fix to be really safe below\r
 469         }\r
 470         if (PROGRESS) System.out.println();\r
 471 \r
 472         if (PROGRESS) System.out.println("Getting Containment");\r
 473         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {\r
 474             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');\r
 475 \r
 476             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;\r
 477 \r
 478             //String istr = UTF16.valueOf(cp);\r
 479             String decomp = Normalizer.normalize(cp, Normalizer.NFD);\r
 480             //if (decomp.equals(istr)) continue;\r
 481 \r
 482             // add each character in the decomposition to canBeIn\r
 483 \r
 484             int component;\r
 485             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {\r
 486                 component = UTF16.charAt(decomp, i);\r
 487                 if (i == 0) {\r
 488                     AT_START.add(component, cp);\r
 489                 } else if (UCharacter.getCombiningClass(component) == 0) {\r
 490                     SAFE_START.remove(component);\r
 491                 }\r
 492             }\r
 493         }\r
 494         if (PROGRESS) System.out.println();\r
 495     }\r
 496         // the following is just for a map from characters to a set of characters\r
 497 \r
 498     private static class CharMap {\r
 499         Map storage = new HashMap();\r
 500         MutableInt probe = new MutableInt();\r
 501         boolean converted = false;\r
 502 \r
 503         public void add(int cp, int whatItIsIn) {\r
 504             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));\r
 505             if (result == null) {\r
 506                 result = new UnicodeSet();\r
 507                 storage.put(probe, result);\r
 508             }\r
 509             result.add(whatItIsIn);\r
 510         }\r
 511 \r
 512         public UnicodeSet get(int cp) {\r
 513             return (UnicodeSet) storage.get(probe.set(cp));\r
 514         }\r
 515     }\r
 516 \r
 517     private static class MutableInt {\r
 518         public int contents;\r
 519         public int hashCode() { return contents; }\r
 520         public boolean equals(Object other) {\r
 521             return ((MutableInt)other).contents == contents;\r
 522         }\r
 523         // allows chaining\r
 524         public MutableInt set(int contents) {\r
 525             this.contents = contents;\r
 526             return this;\r
 527         }\r
 528     }\r
 529     */\r
 530 \r
 531 }\r