2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.util.ArrayList;
10 import java.util.HashSet;
11 import java.util.Iterator;
12 import java.util.List;
15 import com.ibm.icu.impl.Norm2AllModes;
16 import com.ibm.icu.impl.Normalizer2Impl;
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.lang.UCharacter;
21 * This class allows one to iterate through all the strings that are canonically equivalent to a given
22 * string. For example, here are some sample results:
23 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
25 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
26 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
27 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
28 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
29 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
30 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
31 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
32 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
33 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
34 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
35 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
36 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
38 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
39 * since it has not been optimized for that situation.
44 public final class CanonicalIterator {
46 * Construct a CanonicalIterator object
47 * @param source string to get results for
50 public CanonicalIterator(String source) {
51 Norm2AllModes allModes = Norm2AllModes.getNFCInstance();
52 nfd = allModes.decomp;
53 nfcImpl = allModes.impl.ensureCanonIterData();
58 * Gets the NFD form of the current source we are iterating over.
59 * @return gets the source: NOTE: it is the NFD form of the source originally passed in
62 public String getSource() {
67 * Resets the iterator so that one can start again from the beginning.
72 for (int i = 0; i < current.length; ++i) {
78 * Get the next canonically equivalent string.
79 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
80 * @return the next string that is canonically equivalent. The value null is returned when
81 * the iteration is done.
84 public String next() {
85 if (done) return null;
87 // construct return value
89 buffer.setLength(0); // delete old contents
90 for (int i = 0; i < pieces.length; ++i) {
91 buffer.append(pieces[i][current[i]]);
93 String result = buffer.toString();
95 // find next value for next time
97 for (int i = current.length - 1; ; --i) {
103 if (current[i] < pieces[i].length) break; // got sequence
110 * Set a new source for this iterator. Allows object reuse.
111 * @param newSource the source string to iterate against. This allows the same iterator to be used
112 * while changing the source string, saving object creation.
115 public void setSource(String newSource) {
116 source = nfd.normalize(newSource);
119 // catch degenerate case
120 if (newSource.length() == 0) {
121 pieces = new String[1][];
122 current = new int[1];
123 pieces[0] = new String[]{""};
128 List<String> segmentList = new ArrayList<String>();
132 // i should be the end of the first code point
133 // break up the string into segements
135 int i = UTF16.findOffsetFromCodePoint(source, 1);
137 for (; i < source.length(); i += Character.charCount(cp)) {
138 cp = source.codePointAt(i);
139 if (nfcImpl.isCanonSegmentStarter(cp)) {
140 segmentList.add(source.substring(start, i)); // add up to i
144 segmentList.add(source.substring(start, i)); // add last one
146 // allocate the arrays, and find the strings that are CE to each segment
147 pieces = new String[segmentList.size()][];
148 current = new int[segmentList.size()];
149 for (i = 0; i < pieces.length; ++i) {
150 if (PROGRESS) System.out.println("SEGMENT");
151 pieces[i] = getEquivalents(segmentList.get(i));
156 * Simple implementation of permutation.
157 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
158 * @param source the string to find permutations for
159 * @param skipZeros set to true to skip characters with canonical combining class zero
160 * @param output the set to add the results to
162 * @deprecated This API is ICU internal only.
164 public static void permute(String source, boolean skipZeros, Set<String> output) {
166 //if (PROGRESS) System.out.println("Permute: " + source);
169 // if zero or one character, just return a set with it
170 // we check for length < 2 to keep from counting code points all the time
171 if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
176 // otherwise iterate through the string, and recursively permute all the other characters
177 Set<String> subpermute = new HashSet<String>();
179 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
180 cp = UTF16.charAt(source, i);
183 // if the character is canonical combining class zero,
185 if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
186 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
190 // see what the permutations of the characters before and after this one are
192 permute(source.substring(0,i)
193 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
195 // prefix this character to all of them
196 String chStr = UTF16.valueOf(source, i);
197 for (String s : subpermute) {
198 String piece = chStr + s;
199 //if (PROGRESS) System.out.println(" Piece: " + piece);
208 *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
210 public static UnicodeSet getSafeStart() {
211 return (UnicodeSet) SAFE_START.clone();
215 *@return the set of characters whose decompositions start with the given character
217 public static UnicodeSet getStarts(int cp) {
218 UnicodeSet result = AT_START.get(cp);
219 if (result == null) result = EMPTY;
220 return (UnicodeSet) result.clone();
224 // ===================== PRIVATES ==============================
227 private static boolean PROGRESS = false; // debug progress
228 //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
229 private static boolean SKIP_ZEROS = true;
232 private final Normalizer2 nfd;
233 private final Normalizer2Impl nfcImpl;
234 private String source;
235 private boolean done;
236 private String[][] pieces;
237 private int[] current;
238 // Note: C will need two more fields, since arrays there don't have lengths
239 // int pieces_length;
240 // int[] pieces_lengths;
243 private transient StringBuilder buffer = new StringBuilder();
246 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
247 private String[] getEquivalents(String segment) {
248 Set<String> result = new HashSet<String>();
249 Set<String> basic = getEquivalents2(segment);
250 Set<String> permutations = new HashSet<String>();
252 // now get all the permutations
253 // add only the ones that are canonically equivalent
254 // TODO: optimize by not permuting any class zero.
255 Iterator<String> it = basic.iterator();
256 while (it.hasNext()) {
257 String item = it.next();
258 permutations.clear();
259 permute(item, SKIP_ZEROS, permutations);
260 Iterator<String> it2 = permutations.iterator();
261 while (it2.hasNext()) {
262 String possible = it2.next();
265 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
266 if (attempt.equals(segment)) {
268 if (Normalizer.compare(possible, segment,0)==0) {
270 if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
271 result.add(possible);
274 if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
279 // convert into a String[] to clean up storage
280 String[] finalResult = new String[result.size()];
281 result.toArray(finalResult);
286 private Set<String> getEquivalents2(String segment) {
288 Set<String> result = new HashSet<String>();
290 if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
293 StringBuffer workingBuffer = new StringBuffer();
294 UnicodeSet starts = new UnicodeSet();
296 // cycle through all the characters
298 for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
300 // see if any character is at the start of some decomposition
301 cp = segment.codePointAt(i);
302 if (!nfcImpl.getCanonStartSet(cp, starts)) {
305 // if so, see which decompositions match
306 for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
307 int cp2 = iter.codepoint;
308 Set<String> remainder = extract(cp2, segment, i, workingBuffer);
309 if (remainder == null) {
313 // there were some matches, so add all the possibilities to the set.
314 String prefix= segment.substring(0,i);
315 prefix += UTF16.valueOf(cp2);
316 for (String item : remainder) {
317 result.add(prefix + item);
323 Set result = new HashSet();
324 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
326 StringBuffer workingBuffer = new StringBuffer();
328 // cycle through all the characters
331 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
332 // see if any character is at the start of some decomposition
333 cp = UTF16.charAt(segment, i);
334 NormalizerImpl.getCanonStartSet(c,fillSet)
335 UnicodeSet starts = AT_START.get(cp);
336 if (starts == null) continue;
337 UnicodeSetIterator usi = new UnicodeSetIterator(starts);
338 // if so, see which decompositions match
340 int cp2 = usi.codepoint;
341 // we know that there are no strings in it
342 // so we don't have to check CharacterIterator.IS_STRING
343 Set remainder = extract(cp2, segment, i, workingBuffer);
344 if (remainder == null) continue;
346 // there were some matches, so add all the possibilities to the set.
347 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
348 Iterator it = remainder.iterator();
349 while (it.hasNext()) {
350 String item = (String) it.next();
351 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
352 result.add(prefix + item);
361 * See if the decomposition of cp2 is at segment starting at segmentPos
362 * (with canonical rearrangment!)
363 * If so, take the remainder, and return the equivalents
365 private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) {
366 if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
367 + ", " + Utility.hex(segment.substring(segmentPos)));
369 String decomp = nfcImpl.getDecomposition(comp);
370 if (decomp == null) {
371 decomp = UTF16.valueOf(comp);
374 // See if it matches the start of segment (at segmentPos)
378 int decompCp = UTF16.charAt(decomp,0);
379 decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
380 //int decompClass = getClass(decompCp);
381 buf.setLength(0); // initialize working buffer, shared among callees
383 for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
384 cp = UTF16.charAt(segment, i);
385 if (cp == decompCp) { // if equal, eat another cp from decomp
386 if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp)));
387 if (decompPos == decomp.length()) { // done, have all decomp characters!
388 buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
392 decompCp = UTF16.charAt(decomp, decompPos);
393 decompPos += UTF16.getCharCount(decompCp);
394 //decompClass = getClass(decompCp);
396 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp)));
397 // brute force approach
398 UTF16.append(buf, cp);
400 // since we know that the classes are monotonically increasing, after zero
402 // we can do an optimization
403 // there are only a few cases that work: zero, less, same, greater
404 // if both classes are the same, we fail
405 // if the decomp class < the segment class, we fail
407 segClass = getClass(cp);
408 if (decompClass <= segClass) return null;
412 if (!ok) return null; // we failed, characters left over
413 if (PROGRESS) System.out.println("Matches");
414 if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
415 String remainder = buf.toString();
417 // brute force approach
418 // to check to make sure result is canonically equivalent
420 String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
421 if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
424 if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
426 // get the remaining combinations
427 return getEquivalents2(remainder);
431 // TODO: fix once we have a codepoint interface to get the canonical combining class
432 // TODO: Need public access to canonical combining class in UCharacter!
433 private static int getClass(int cp) {
434 return Normalizer.getClass((char)cp);
438 // ================= BUILDER =========================
439 // TODO: Flatten this data so it doesn't have to be reconstructed each time!
441 //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
442 private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change
444 SET_WITH_NULL_STRING.add("");
447 // private static UnicodeSet SAFE_START = new UnicodeSet();
448 // private static CharMap AT_START = new CharMap();
450 // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
451 // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
452 // private static int LAST_UNICODE = 0x10FFFF;
459 private static void buildData() {
461 if (PROGRESS) System.out.println("Getting Safe Start");
462 for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
463 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
464 int cc = UCharacter.getCombiningClass(cp);
465 if (cc == 0) SAFE_START.add(cp);
466 // will fix to be really safe below
468 if (PROGRESS) System.out.println();
470 if (PROGRESS) System.out.println("Getting Containment");
471 for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
472 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
474 if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
476 //String istr = UTF16.valueOf(cp);
477 String decomp = Normalizer.normalize(cp, Normalizer.NFD);
478 //if (decomp.equals(istr)) continue;
480 // add each character in the decomposition to canBeIn
483 for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
484 component = UTF16.charAt(decomp, i);
486 AT_START.add(component, cp);
487 } else if (UCharacter.getCombiningClass(component) == 0) {
488 SAFE_START.remove(component);
492 if (PROGRESS) System.out.println();
494 // the following is just for a map from characters to a set of characters
496 private static class CharMap {
497 Map storage = new HashMap();
498 MutableInt probe = new MutableInt();
499 boolean converted = false;
501 public void add(int cp, int whatItIsIn) {
502 UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
503 if (result == null) {
504 result = new UnicodeSet();
505 storage.put(probe, result);
507 result.add(whatItIsIn);
510 public UnicodeSet get(int cp) {
511 return (UnicodeSet) storage.get(probe.set(cp));
515 private static class MutableInt {
517 public int hashCode() { return contents; }
518 public boolean equals(Object other) {
519 return ((MutableInt)other).contents == contents;
522 public MutableInt set(int contents) {
523 this.contents = contents;