jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/CanonicalIterator.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8
   9 import java.util.ArrayList;
  10 import java.util.HashSet;
  11 import java.util.Iterator;
  12 import java.util.List;
  13 import java.util.Set;
  14
  15 import com.ibm.icu.impl.Norm2AllModes;
  16 import com.ibm.icu.impl.Normalizer2Impl;
  17 import com.ibm.icu.impl.Utility;
  18 import com.ibm.icu.lang.UCharacter;
  19
  20 /**
  21  * This class allows one to iterate through all the strings that are canonically equivalent to a given
  22  * string. For example, here are some sample results:
  23  * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
  24  * <pre>
  25  1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
  26  2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
  27  3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
  28  4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
  29  5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
  30  6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
  31  7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
  32  8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
  33  9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
  34 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
  35 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
  36 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
  37  *</pre>
  38  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
  39  * since it has not been optimized for that situation.
  40  * @author M. Davis
  41  * @stable ICU 2.4
  42  */
  43
  44 public final class CanonicalIterator {
  45     /**
  46      * Construct a CanonicalIterator object
  47      * @param source string to get results for
  48      * @stable ICU 2.4
  49      */
  50     public CanonicalIterator(String source) {
  51         Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
  52         nfd = allModes.decomp;
  53         nfcImpl = allModes.impl.ensureCanonIterData();
  54         setSource(source);
  55     }
  56
  57     /**
  58      * Gets the NFD form of the current source we are iterating over.
  59      * @return gets the source: NOTE: it is the NFD form of the source originally passed in
  60      * @stable ICU 2.4
  61      */
  62     public String getSource() {
  63       return source;
  64     }
  65
  66     /**
  67      * Resets the iterator so that one can start again from the beginning.
  68      * @stable ICU 2.4
  69      */
  70     public void reset() {
  71         done = false;
  72         for (int i = 0; i < current.length; ++i) {
  73             current[i] = 0;
  74         }
  75     }
  76
  77     /**
  78      * Get the next canonically equivalent string.
  79      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
  80      * @return the next string that is canonically equivalent. The value null is returned when
  81      * the iteration is done.
  82      * @stable ICU 2.4
  83      */
  84     public String next() {
  85         if (done) return null;
  86
  87         // construct return value
  88
  89         buffer.setLength(0); // delete old contents
  90         for (int i = 0; i < pieces.length; ++i) {
  91             buffer.append(pieces[i][current[i]]);
  92         }
  93         String result = buffer.toString();
  94
  95         // find next value for next time
  96
  97         for (int i = current.length - 1; ; --i) {
  98             if (i < 0) {
  99                 done = true;
 100                 break;
 101             }
 102             current[i]++;
 103             if (current[i] < pieces[i].length) break; // got sequence
 104             current[i] = 0;
 105         }
 106         return result;
 107     }
 108
 109     /**
 110      * Set a new source for this iterator. Allows object reuse.
 111      * @param newSource the source string to iterate against. This allows the same iterator to be used
 112      * while changing the source string, saving object creation.
 113      * @stable ICU 2.4
 114      */
 115     public void setSource(String newSource) {
 116         source = nfd.normalize(newSource);
 117         done = false;
 118
 119         // catch degenerate case
 120         if (newSource.length() == 0) {
 121             pieces = new String[1][];
 122             current = new int[1];
 123             pieces[0] = new String[]{""};
 124             return;
 125         }
 126
 127         // find the segments
 128         List<String> segmentList = new ArrayList<String>();
 129         int cp;
 130         int start = 0;
 131
 132         // i should be the end of the first code point
 133         // break up the string into segements
 134
 135         int i = UTF16.findOffsetFromCodePoint(source, 1);
 136
 137         for (; i < source.length(); i += Character.charCount(cp)) {
 138             cp = source.codePointAt(i);
 139             if (nfcImpl.isCanonSegmentStarter(cp)) {
 140                 segmentList.add(source.substring(start, i)); // add up to i
 141                 start = i;
 142             }
 143         }
 144         segmentList.add(source.substring(start, i)); // add last one
 145
 146         // allocate the arrays, and find the strings that are CE to each segment
 147         pieces = new String[segmentList.size()][];
 148         current = new int[segmentList.size()];
 149         for (i = 0; i < pieces.length; ++i) {
 150             if (PROGRESS) System.out.println("SEGMENT");
 151             pieces[i] = getEquivalents(segmentList.get(i));
 152         }
 153     }
 154
 155     /**
 156      * Simple implementation of permutation.
 157      * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
 158      * @param source the string to find permutations for
 159      * @param skipZeros set to true to skip characters with canonical combining class zero
 160      * @param output the set to add the results to
 161      * @internal
 162      * @deprecated This API is ICU internal only.
 163      */
 164     public static void permute(String source, boolean skipZeros, Set<String> output) {
 165         // TODO: optimize
 166         //if (PROGRESS) System.out.println("Permute: " + source);
 167
 168         // optimization:
 169         // if zero or one character, just return a set with it
 170         // we check for length < 2 to keep from counting code points all the time
 171         if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
 172             output.add(source);
 173             return;
 174         }
 175
 176         // otherwise iterate through the string, and recursively permute all the other characters
 177         Set<String> subpermute = new HashSet<String>();
 178         int cp;
 179         for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
 180             cp = UTF16.charAt(source, i);
 181
 182             // optimization:
 183             // if the character is canonical combining class zero,
 184             // don't permute it
 185             if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
 186                 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
 187                 continue;
 188             }
 189
 190             // see what the permutations of the characters before and after this one are
 191             subpermute.clear();
 192             permute(source.substring(0,i)
 193                 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
 194
 195             // prefix this character to all of them
 196             String chStr = UTF16.valueOf(source, i);
 197             for (String s : subpermute) {
 198                 String piece = chStr + s;
 199                 //if (PROGRESS) System.out.println("  Piece: " + piece);
 200                 output.add(piece);
 201             }
 202         }
 203     }
 204
 205     // FOR TESTING
 206
 207     /*
 208      *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
 209      *
 210     public static UnicodeSet getSafeStart() {
 211         return (UnicodeSet) SAFE_START.clone();
 212     }
 213     */
 214     /*
 215      *@return the set of characters whose decompositions start with the given character
 216      *
 217     public static UnicodeSet getStarts(int cp) {
 218         UnicodeSet result = AT_START.get(cp);
 219         if (result == null) result = EMPTY;
 220         return (UnicodeSet) result.clone();
 221     }
 222     */
 223
 224     // ===================== PRIVATES ==============================
 225
 226     // debug
 227     private static boolean PROGRESS = false; // debug progress
 228     //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
 229     private static boolean SKIP_ZEROS = true;
 230
 231     // fields
 232     private final Normalizer2 nfd;
 233     private final Normalizer2Impl nfcImpl;
 234     private String source;
 235     private boolean done;
 236     private String[][] pieces;
 237     private int[] current;
 238     // Note: C will need two more fields, since arrays there don't have lengths
 239     // int pieces_length;
 240     // int[] pieces_lengths;
 241
 242     // transient fields
 243     private transient StringBuilder buffer = new StringBuilder();
 244
 245
 246     // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
 247     private String[] getEquivalents(String segment) {
 248         Set<String> result = new HashSet<String>();
 249         Set<String> basic = getEquivalents2(segment);
 250         Set<String> permutations = new HashSet<String>();
 251
 252         // now get all the permutations
 253         // add only the ones that are canonically equivalent
 254         // TODO: optimize by not permuting any class zero.
 255         Iterator<String> it = basic.iterator();
 256         while (it.hasNext()) {
 257             String item = it.next();
 258             permutations.clear();
 259             permute(item, SKIP_ZEROS, permutations);
 260             Iterator<String> it2 = permutations.iterator();
 261             while (it2.hasNext()) {
 262                 String possible = it2.next();
 263
 264 /*
 265                 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
 266                 if (attempt.equals(segment)) {
 267 */
 268                 if (Normalizer.compare(possible, segment,0)==0) {
 269
 270                     if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
 271                     result.add(possible);
 272
 273                 } else {
 274                     if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
 275                 }
 276             }
 277         }
 278
 279         // convert into a String[] to clean up storage
 280         String[] finalResult = new String[result.size()];
 281         result.toArray(finalResult);
 282         return finalResult;
 283     }
 284
 285
 286     private Set<String> getEquivalents2(String segment) {
 287
 288         Set<String> result = new HashSet<String>();
 289
 290         if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
 291
 292         result.add(segment);
 293         StringBuffer workingBuffer = new StringBuffer();
 294         UnicodeSet starts = new UnicodeSet();
 295
 296         // cycle through all the characters
 297         int cp;
 298         for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
 299
 300             // see if any character is at the start of some decomposition
 301             cp = segment.codePointAt(i);
 302             if (!nfcImpl.getCanonStartSet(cp, starts)) {
 303               continue;
 304             }
 305             // if so, see which decompositions match
 306             for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
 307                 int cp2 = iter.codepoint;
 308                 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
 309                 if (remainder == null) {
 310                     continue;
 311                 }
 312
 313                 // there were some matches, so add all the possibilities to the set.
 314                 String prefix= segment.substring(0,i);
 315                 prefix += UTF16.valueOf(cp2);
 316                 for (String item : remainder) {
 317                     result.add(prefix + item);
 318                 }
 319             }
 320         }
 321         return result;
 322         /*
 323         Set result = new HashSet();
 324         if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
 325         result.add(segment);
 326         StringBuffer workingBuffer = new StringBuffer();
 327
 328         // cycle through all the characters
 329         int cp;
 330
 331         for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
 332             // see if any character is at the start of some decomposition
 333             cp = UTF16.charAt(segment, i);
 334             NormalizerImpl.getCanonStartSet(c,fillSet)
 335             UnicodeSet starts = AT_START.get(cp);
 336             if (starts == null) continue;
 337             UnicodeSetIterator usi = new UnicodeSetIterator(starts);
 338             // if so, see which decompositions match
 339             while (usi.next()) {
 340                 int cp2 = usi.codepoint;
 341                 // we know that there are no strings in it
 342                 // so we don't have to check CharacterIterator.IS_STRING
 343                 Set remainder = extract(cp2, segment, i, workingBuffer);
 344                 if (remainder == null) continue;
 345
 346                 // there were some matches, so add all the possibilities to the set.
 347                 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
 348                 Iterator it = remainder.iterator();
 349                 while (it.hasNext()) {
 350                     String item = (String) it.next();
 351                     if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
 352                     result.add(prefix + item);
 353                 }
 354             }
 355         }
 356         return result;
 357         */
 358     }
 359
 360     /**
 361      * See if the decomposition of cp2 is at segment starting at segmentPos
 362      * (with canonical rearrangment!)
 363      * If so, take the remainder, and return the equivalents
 364      */
 365     private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
 366         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
 367             + ", " + Utility.hex(segment.substring(segmentPos)));
 368
 369         String decomp = nfcImpl.getDecomposition(comp);
 370         if (decomp == null) {
 371             decomp = UTF16.valueOf(comp);
 372         }
 373
 374         // See if it matches the start of segment (at segmentPos)
 375         boolean ok = false;
 376         int cp;
 377         int decompPos = 0;
 378         int decompCp = UTF16.charAt(decomp,0);
 379         decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
 380         //int decompClass = getClass(decompCp);
 381         buf.setLength(0); // initialize working buffer, shared among callees
 382
 383         for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
 384             cp = UTF16.charAt(segment, i);
 385             if (cp == decompCp) { // if equal, eat another cp from decomp
 386                 if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
 387                 if (decompPos == decomp.length()) { // done, have all decomp characters!
 388                     buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
 389                     ok = true;
 390                     break;
 391                 }
 392                 decompCp = UTF16.charAt(decomp, decompPos);
 393                 decompPos += UTF16.getCharCount(decompCp);
 394                 //decompClass = getClass(decompCp);
 395             } else {
 396                 if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
 397                 // brute force approach
 398                 UTF16.append(buf, cp);
 399                 /* TODO: optimize
 400                 // since we know that the classes are monotonically increasing, after zero
 401                 // e.g. 0 5 7 9 0 3
 402                 // we can do an optimization
 403                 // there are only a few cases that work: zero, less, same, greater
 404                 // if both classes are the same, we fail
 405                 // if the decomp class < the segment class, we fail
 406
 407                 segClass = getClass(cp);
 408                 if (decompClass <= segClass) return null;
 409                 */
 410             }
 411         }
 412         if (!ok) return null; // we failed, characters left over
 413         if (PROGRESS) System.out.println("Matches");
 414         if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
 415         String remainder = buf.toString();
 416
 417         // brute force approach
 418         // to check to make sure result is canonically equivalent
 419         /*
 420         String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
 421         if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
 422         */
 423
 424         if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
 425
 426         // get the remaining combinations
 427         return getEquivalents2(remainder);
 428     }
 429
 430     /*
 431     // TODO: fix once we have a codepoint interface to get the canonical combining class
 432     // TODO: Need public access to canonical combining class in UCharacter!
 433     private static int getClass(int cp) {
 434         return Normalizer.getClass((char)cp);
 435     }
 436     */
 437
 438    // ================= BUILDER =========================
 439     // TODO: Flatten this data so it doesn't have to be reconstructed each time!
 440
 441     //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
 442     private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
 443     static {
 444         SET_WITH_NULL_STRING.add("");
 445     }
 446
 447   //  private static UnicodeSet SAFE_START = new UnicodeSet();
 448   //  private static CharMap AT_START = new CharMap();
 449
 450         // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
 451         // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
 452   //  private static int LAST_UNICODE = 0x10FFFF;
 453     /*
 454     static {
 455         buildData();
 456     }
 457     */
 458     /*
 459     private static void buildData() {
 460
 461         if (PROGRESS) System.out.println("Getting Safe Start");
 462         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
 463             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
 464             int cc = UCharacter.getCombiningClass(cp);
 465             if (cc == 0) SAFE_START.add(cp);
 466             // will fix to be really safe below
 467         }
 468         if (PROGRESS) System.out.println();
 469
 470         if (PROGRESS) System.out.println("Getting Containment");
 471         for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
 472             if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
 473
 474             if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
 475
 476             //String istr = UTF16.valueOf(cp);
 477             String decomp = Normalizer.normalize(cp, Normalizer.NFD);
 478             //if (decomp.equals(istr)) continue;
 479
 480             // add each character in the decomposition to canBeIn
 481
 482             int component;
 483             for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
 484                 component = UTF16.charAt(decomp, i);
 485                 if (i == 0) {
 486                     AT_START.add(component, cp);
 487                 } else if (UCharacter.getCombiningClass(component) == 0) {
 488                     SAFE_START.remove(component);
 489                 }
 490             }
 491         }
 492         if (PROGRESS) System.out.println();
 493     }
 494         // the following is just for a map from characters to a set of characters
 495
 496     private static class CharMap {
 497         Map storage = new HashMap();
 498         MutableInt probe = new MutableInt();
 499         boolean converted = false;
 500
 501         public void add(int cp, int whatItIsIn) {
 502             UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
 503             if (result == null) {
 504                 result = new UnicodeSet();
 505                 storage.put(probe, result);
 506             }
 507             result.add(whatItIsIn);
 508         }
 509
 510         public UnicodeSet get(int cp) {
 511             return (UnicodeSet) storage.get(probe.set(cp));
 512         }
 513     }
 514
 515     private static class MutableInt {
 516         public int contents;
 517         public int hashCode() { return contents; }
 518         public boolean equals(Object other) {
 519             return ((MutableInt)other).contents == contents;
 520         }
 521         // allows chaining
 522         public MutableInt set(int contents) {
 523             this.contents = contents;
 524             return this;
 525         }
 526     }
 527     */
 528
 529 }