2 *******************************************************************************
\r
3 * Copyright (C) 1996-2007, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
8 import com.ibm.icu.impl.NormalizerImpl;
\r
11 * <tt>ComposedCharIter</tt> is an iterator class that returns all
\r
12 * of the precomposed characters defined in the Unicode standard, along
\r
13 * with their decomposed forms. This is often useful when building
\r
14 * data tables (<i>e.g.</i> collation tables) which need to treat composed
\r
15 * and decomposed characters equivalently.
\r
17 * For example, imagine that you have built a collation table with ordering
\r
18 * rules for the {@link Normalizer#DECOMP canonically decomposed} forms of all
\r
19 * characters used in a particular language. When you process input text using
\r
20 * this table, the text must first be decomposed so that it matches the form
\r
21 * used in the table. This can impose a performance penalty that may be
\r
22 * unacceptable in some situations.
\r
24 * You can avoid this problem by ensuring that the collation table contains
\r
25 * rules for both the decomposed <i>and</i> composed versions of each character.
\r
26 * To do so, use a <tt>ComposedCharIter</tt> to iterate through all of the
\r
27 * composed characters in Unicode. If the decomposition for that character
\r
28 * consists solely of characters that are listed in your ruleset, you can
\r
29 * add a new rule for the composed character that makes it equivalent to
\r
30 * its decomposition sequence.
\r
32 * Note that <tt>ComposedCharIter</tt> iterates over a <em>static</em> table
\r
33 * of the composed characters in Unicode. If you want to iterate over the
\r
34 * composed characters in a particular string, use {@link Normalizer} instead.
\r
36 * When constructing a <tt>ComposedCharIter</tt> there is one
\r
37 * optional feature that you can enable or disable:
\r
39 * <li>{@link Normalizer#IGNORE_HANGUL} - Do not iterate over the Hangul
\r
40 * characters and their corresponding Jamo decompositions.
\r
41 * This option is off by default (<i>i.e.</i> Hangul processing is enabled)
\r
42 * since the Unicode standard specifies that Hangul to Jamo
\r
43 * is a canonical decomposition.
\r
46 * <tt>ComposedCharIter</tt> is currently based on version 2.1.8 of the
\r
47 * <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
\r
48 * It will be updated as later versions of Unicode are released.
\r
49 * @deprecated ICU 2.2
\r
52 public final class ComposedCharIter {
\r
55 * Constant that indicates the iteration has completed.
\r
56 * {@link #next} returns this value when there are no more composed characters
\r
57 * over which to iterate.
\r
58 * @deprecated ICU 2.2
\r
60 public static final char DONE = (char) Normalizer.DONE;
\r
63 * Constructs a new <tt>ComposedCharIter</tt>. The iterator will return
\r
64 * all Unicode characters with canonical decompositions, including Korean
\r
65 * Hangul characters.
\r
66 * @deprecated ICU 2.2
\r
68 public ComposedCharIter() {
\r
75 * Constructs a non-default <tt>ComposedCharIter</tt> with optional behavior.
\r
77 * @param compat <tt>false</tt> for canonical decompositions only;
\r
78 * <tt>true</tt> for both canonical and compatibility
\r
81 * @param options Optional decomposition features. Currently, the only
\r
82 * supported option is {@link Normalizer#IGNORE_HANGUL}, which
\r
83 * causes this <tt>ComposedCharIter</tt> not to iterate
\r
84 * over the Hangul characters and their corresponding
\r
85 * Jamo decompositions.
\r
86 * @deprecated ICU 2.2
\r
88 public ComposedCharIter(boolean compat, int options) {
\r
89 this.compat = compat;
\r
90 //this.options = options;
\r
94 * Determines whether there are any precomposed Unicode characters not yet returned
\r
96 * @deprecated ICU 2.2
\r
98 public boolean hasNext() {
\r
99 if (nextChar == Normalizer.DONE) {
\r
102 return nextChar != Normalizer.DONE;
\r
106 * Returns the next precomposed Unicode character.
\r
107 * Repeated calls to <tt>next</tt> return all of the precomposed characters defined
\r
108 * by Unicode, in ascending order. After all precomposed characters have
\r
109 * been returned, {@link #hasNext} will return <tt>false</tt> and further calls
\r
110 * to <tt>next</tt> will return {@link #DONE}.
\r
111 * @deprecated ICU 2.2
\r
113 public char next() {
\r
114 if (nextChar == Normalizer.DONE) {
\r
117 curChar = nextChar;
\r
118 nextChar = Normalizer.DONE;
\r
119 return (char) curChar;
\r
123 * Returns the Unicode decomposition of the current character.
\r
124 * This method returns the decomposition of the precomposed character most
\r
125 * recently returned by {@link #next}. The resulting decomposition is
\r
126 * affected by the settings of the options passed to the constructor.
\r
127 * @deprecated ICU 2.2
\r
129 public String decomposition() {
\r
130 // the decomposition buffer contains the decomposition of
\r
131 // current char so just return it
\r
132 return new String(decompBuf,0, bufLen);
\r
135 private void findNextChar() {
\r
139 bufLen = NormalizerImpl.getDecomposition(c,compat,
\r
143 // the curChar can be decomposed... so it is a composed char
\r
144 // cache the result
\r
156 //private int options;
\r
157 private boolean compat;
\r
158 private char[] decompBuf = new char[100];
\r
159 private int bufLen=0;
\r
160 private int curChar = 0;
\r
161 private int nextChar = Normalizer.DONE;
\r