2 *******************************************************************************
\r
3 * Copyright (C) 1996-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import com.ibm.icu.lang.*;
\r
11 import com.ibm.icu.impl.NormalizerImpl;
\r
12 import com.ibm.icu.impl.USerializedSet;
\r
13 import com.ibm.icu.impl.Utility;
\r
16 * This class allows one to iterate through all the strings that are canonically equivalent to a given
\r
17 * string. For example, here are some sample results:
\r
18 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
\r
20 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
\r
21 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
\r
22 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
\r
23 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
\r
24 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA}
\r
25 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE}
\r
26 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA}
\r
27 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE}
\r
28 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA}
\r
29 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE}
\r
30 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA}
\r
31 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE}
\r
33 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
\r
34 * since it has not been optimized for that situation.
\r
39 public final class CanonicalIterator {
\r
41 * Construct a CanonicalIterator object
\r
42 * @param source string to get results for
\r
45 public CanonicalIterator(String source) {
\r
50 * Gets the NFD form of the current source we are iterating over.
\r
51 * @return gets the source: NOTE: it is the NFD form of the source originally passed in
\r
54 public String getSource() {
\r
59 * Resets the iterator so that one can start again from the beginning.
\r
62 public void reset() {
\r
64 for (int i = 0; i < current.length; ++i) {
\r
70 * Get the next canonically equivalent string.
\r
71 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
\r
72 * @return the next string that is canonically equivalent. The value null is returned when
\r
73 * the iteration is done.
\r
76 public String next() {
\r
77 if (done) return null;
\r
79 // construct return value
\r
81 buffer.setLength(0); // delete old contents
\r
82 for (int i = 0; i < pieces.length; ++i) {
\r
83 buffer.append(pieces[i][current[i]]);
\r
85 String result = buffer.toString();
\r
87 // find next value for next time
\r
89 for (int i = current.length - 1; ; --i) {
\r
95 if (current[i] < pieces[i].length) break; // got sequence
\r
102 * Set a new source for this iterator. Allows object reuse.
\r
103 * @param newSource the source string to iterate against. This allows the same iterator to be used
\r
104 * while changing the source string, saving object creation.
\r
107 public void setSource(String newSource) {
\r
108 source = Normalizer.normalize(newSource, Normalizer.NFD);
\r
111 // catch degenerate case
\r
112 if (newSource.length() == 0) {
\r
113 pieces = new String[1][];
\r
114 current = new int[1];
\r
115 pieces[0] = new String[]{""};
\r
119 // find the segments
\r
120 List segmentList = new ArrayList();
\r
124 // i should be the end of the first code point
\r
125 // break up the string into segements
\r
127 int i = UTF16.findOffsetFromCodePoint(source, 1);
\r
129 for (; i < source.length(); i += UTF16.getCharCount(cp)) {
\r
130 cp = UTF16.charAt(source, i);
\r
131 if (NormalizerImpl.isCanonSafeStart(cp)) {
\r
132 segmentList.add(source.substring(start, i)); // add up to i
\r
136 segmentList.add(source.substring(start, i)); // add last one
\r
138 // allocate the arrays, and find the strings that are CE to each segment
\r
139 pieces = new String[segmentList.size()][];
\r
140 current = new int[segmentList.size()];
\r
141 for (i = 0; i < pieces.length; ++i) {
\r
142 if (PROGRESS) System.out.println("SEGMENT");
\r
143 pieces[i] = getEquivalents((String) segmentList.get(i));
\r
148 * Simple implementation of permutation.
\r
149 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
\r
150 * @param source the string to find permutations for
\r
151 * @param skipZeros set to true to skip characters with canonical combining class zero
\r
152 * @param output the set to add the results to
\r
154 * @deprecated This API is ICU internal only.
\r
156 public static void permute(String source, boolean skipZeros, Set output) {
\r
158 //if (PROGRESS) System.out.println("Permute: " + source);
\r
161 // if zero or one character, just return a set with it
\r
162 // we check for length < 2 to keep from counting code points all the time
\r
163 if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
\r
164 output.add(source);
\r
168 // otherwise iterate through the string, and recursively permute all the other characters
\r
169 Set subpermute = new HashSet();
\r
171 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
\r
172 cp = UTF16.charAt(source, i);
\r
175 // if the character is canonical combining class zero,
\r
176 // don't permute it
\r
177 if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) {
\r
178 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
\r
182 // see what the permutations of the characters before and after this one are
\r
183 subpermute.clear();
\r
184 permute(source.substring(0,i)
\r
185 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
\r
187 // prefix this character to all of them
\r
188 String chStr = UTF16.valueOf(source, i);
\r
189 Iterator it = subpermute.iterator();
\r
190 while (it.hasNext()) {
\r
191 String piece = chStr + (String) it.next();
\r
192 //if (PROGRESS) System.out.println(" Piece: " + piece);
\r
201 *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
\r
204 public static UnicodeSet getSafeStart() {
\r
205 return (UnicodeSet) SAFE_START.clone();
\r
209 *@return the set of characters whose decompositions start with the given character
\r
212 public static UnicodeSet getStarts(int cp) {
\r
213 UnicodeSet result = AT_START.get(cp);
\r
214 if (result == null) result = EMPTY;
\r
215 return (UnicodeSet) result.clone();
\r
219 // ===================== PRIVATES ==============================
\r
222 private static boolean PROGRESS = false; // debug progress
\r
223 //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null;
\r
224 private static boolean SKIP_ZEROS = true;
\r
227 private String source;
\r
228 private boolean done;
\r
229 private String[][] pieces;
\r
230 private int[] current;
\r
231 // Note: C will need two more fields, since arrays there don't have lengths
\r
232 // int pieces_length;
\r
233 // int[] pieces_lengths;
\r
235 // transient fields
\r
236 private transient StringBuffer buffer = new StringBuffer();
\r
239 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
\r
240 private String[] getEquivalents(String segment) {
\r
241 Set result = new HashSet();
\r
242 Set basic = getEquivalents2(segment);
\r
243 Set permutations = new HashSet();
\r
245 // now get all the permutations
\r
246 // add only the ones that are canonically equivalent
\r
247 // TODO: optimize by not permuting any class zero.
\r
248 Iterator it = basic.iterator();
\r
249 while (it.hasNext()) {
\r
250 String item = (String) it.next();
\r
251 permutations.clear();
\r
252 permute(item, SKIP_ZEROS, permutations);
\r
253 Iterator it2 = permutations.iterator();
\r
254 while (it2.hasNext()) {
\r
255 String possible = (String) it2.next();
\r
258 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
\r
259 if (attempt.equals(segment)) {
\r
261 if (Normalizer.compare(possible, segment,0)==0) {
\r
263 if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible));
\r
264 result.add(possible);
\r
267 if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible));
\r
272 // convert into a String[] to clean up storage
\r
273 String[] finalResult = new String[result.size()];
\r
274 result.toArray(finalResult);
\r
275 return finalResult;
\r
279 private Set getEquivalents2(String segment) {
\r
281 Set result = new HashSet();
\r
283 if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment));
\r
285 result.add(segment);
\r
286 StringBuffer workingBuffer = new StringBuffer();
\r
288 // cycle through all the characters
\r
290 int[] range = new int[2];
\r
291 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
\r
293 // see if any character is at the start of some decomposition
\r
294 cp = UTF16.charAt(segment, i);
\r
295 USerializedSet starts = new USerializedSet();
\r
297 if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
\r
301 // if so, see which decompositions match
\r
302 int rangeCount = starts.countRanges();
\r
303 for(j = 0; j < rangeCount; ++j) {
\r
304 starts.getRange(j, range);
\r
306 for (int cp2 = range[0]; cp2 <= end; ++cp2) {
\r
307 Set remainder = extract(cp2, segment, i, workingBuffer);
\r
308 if (remainder == null) continue;
\r
310 // there were some matches, so add all the possibilities to the set.
\r
311 String prefix= segment.substring(0,i);
\r
312 prefix += UTF16.valueOf(cp2);
\r
314 Iterator iter = remainder.iterator();
\r
315 while (iter.hasNext()) {
\r
316 String item = (String) iter.next();
\r
317 String toAdd = new String(prefix);
\r
320 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
\r
327 Set result = new HashSet();
\r
328 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
\r
329 result.add(segment);
\r
330 StringBuffer workingBuffer = new StringBuffer();
\r
332 // cycle through all the characters
\r
335 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
\r
336 // see if any character is at the start of some decomposition
\r
337 cp = UTF16.charAt(segment, i);
\r
338 NormalizerImpl.getCanonStartSet(c,fillSet)
\r
339 UnicodeSet starts = AT_START.get(cp);
\r
340 if (starts == null) continue;
\r
341 UnicodeSetIterator usi = new UnicodeSetIterator(starts);
\r
342 // if so, see which decompositions match
\r
343 while (usi.next()) {
\r
344 int cp2 = usi.codepoint;
\r
345 // we know that there are no strings in it
\r
346 // so we don't have to check CharacterIterator.IS_STRING
\r
347 Set remainder = extract(cp2, segment, i, workingBuffer);
\r
348 if (remainder == null) continue;
\r
350 // there were some matches, so add all the possibilities to the set.
\r
351 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
\r
352 Iterator it = remainder.iterator();
\r
353 while (it.hasNext()) {
\r
354 String item = (String) it.next();
\r
355 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item));
\r
356 result.add(prefix + item);
\r
365 * See if the decomposition of cp2 is at segment starting at segmentPos
\r
366 * (with canonical rearrangment!)
\r
367 * If so, take the remainder, and return the equivalents
\r
369 private Set extract(int comp, String segment, int segmentPos, StringBuffer buf) {
\r
370 if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
\r
371 + ", " + Utility.hex(segment.substring(segmentPos)));
\r
373 //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
\r
374 String decomp = Normalizer.normalize(comp, Normalizer.NFD);
\r
376 // See if it matches the start of segment (at segmentPos)
\r
377 boolean ok = false;
\r
380 int decompCp = UTF16.charAt(decomp,0);
\r
381 decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
\r
382 //int decompClass = getClass(decompCp);
\r
383 buf.setLength(0); // initialize working buffer, shared among callees
\r
385 for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
\r
386 cp = UTF16.charAt(segment, i);
\r
387 if (cp == decompCp) { // if equal, eat another cp from decomp
\r
388 if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp)));
\r
389 if (decompPos == decomp.length()) { // done, have all decomp characters!
\r
390 buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
\r
394 decompCp = UTF16.charAt(decomp, decompPos);
\r
395 decompPos += UTF16.getCharCount(decompCp);
\r
396 //decompClass = getClass(decompCp);
\r
398 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp)));
\r
399 // brute force approach
\r
400 UTF16.append(buf, cp);
\r
402 // since we know that the classes are monotonically increasing, after zero
\r
403 // e.g. 0 5 7 9 0 3
\r
404 // we can do an optimization
\r
405 // there are only a few cases that work: zero, less, same, greater
\r
406 // if both classes are the same, we fail
\r
407 // if the decomp class < the segment class, we fail
\r
409 segClass = getClass(cp);
\r
410 if (decompClass <= segClass) return null;
\r
414 if (!ok) return null; // we failed, characters left over
\r
415 if (PROGRESS) System.out.println("Matches");
\r
416 if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder
\r
417 String remainder = buf.toString();
\r
419 // brute force approach
\r
420 // to check to make sure result is canonically equivalent
\r
422 String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
\r
423 if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
\r
426 if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
\r
428 // get the remaining combinations
\r
429 return getEquivalents2(remainder);
\r
433 // TODO: fix once we have a codepoint interface to get the canonical combining class
\r
434 // TODO: Need public access to canonical combining class in UCharacter!
\r
435 private static int getClass(int cp) {
\r
436 return Normalizer.getClass((char)cp);
\r
440 // ================= BUILDER =========================
\r
441 // TODO: Flatten this data so it doesn't have to be reconstructed each time!
\r
443 //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change
\r
444 private static final Set SET_WITH_NULL_STRING = new HashSet(); // constant, don't change
\r
446 SET_WITH_NULL_STRING.add("");
\r
449 // private static UnicodeSet SAFE_START = new UnicodeSet();
\r
450 // private static CharMap AT_START = new CharMap();
\r
452 // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
\r
453 // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
\r
454 // private static int LAST_UNICODE = 0x10FFFF;
\r
461 private static void buildData() {
\r
463 if (PROGRESS) System.out.println("Getting Safe Start");
\r
464 for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
\r
465 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
\r
466 int cc = UCharacter.getCombiningClass(cp);
\r
467 if (cc == 0) SAFE_START.add(cp);
\r
468 // will fix to be really safe below
\r
470 if (PROGRESS) System.out.println();
\r
472 if (PROGRESS) System.out.println("Getting Containment");
\r
473 for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
\r
474 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
\r
476 if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
\r
478 //String istr = UTF16.valueOf(cp);
\r
479 String decomp = Normalizer.normalize(cp, Normalizer.NFD);
\r
480 //if (decomp.equals(istr)) continue;
\r
482 // add each character in the decomposition to canBeIn
\r
485 for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) {
\r
486 component = UTF16.charAt(decomp, i);
\r
488 AT_START.add(component, cp);
\r
489 } else if (UCharacter.getCombiningClass(component) == 0) {
\r
490 SAFE_START.remove(component);
\r
494 if (PROGRESS) System.out.println();
\r
496 // the following is just for a map from characters to a set of characters
\r
498 private static class CharMap {
\r
499 Map storage = new HashMap();
\r
500 MutableInt probe = new MutableInt();
\r
501 boolean converted = false;
\r
503 public void add(int cp, int whatItIsIn) {
\r
504 UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp));
\r
505 if (result == null) {
\r
506 result = new UnicodeSet();
\r
507 storage.put(probe, result);
\r
509 result.add(whatItIsIn);
\r
512 public UnicodeSet get(int cp) {
\r
513 return (UnicodeSet) storage.get(probe.set(cp));
\r
517 private static class MutableInt {
\r
518 public int contents;
\r
519 public int hashCode() { return contents; }
\r
520 public boolean equals(Object other) {
\r
521 return ((MutableInt)other).contents == contents;
\r
524 public MutableInt set(int contents) {
\r
525 this.contents = contents;
\r