2 *******************************************************************************
3 * Copyright (C) 2000-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.text;
8 import java.nio.CharBuffer;
9 import java.text.CharacterIterator;
11 import com.ibm.icu.impl.Norm2AllModes;
12 import com.ibm.icu.impl.Normalizer2Impl;
13 import com.ibm.icu.impl.UCaseProps;
14 import com.ibm.icu.lang.UCharacter;
17 * Unicode Normalization
19 * <h2>Unicode normalization API</h2>
21 * <code>normalize</code> transforms Unicode text into an equivalent composed or
22 * decomposed form, allowing for easier sorting and searching of text.
23 * <code>normalize</code> supports the standard normalization forms described in
24 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
25 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
27 * Characters with accents or other adornments can be encoded in
28 * several different ways in Unicode. For example, take the character A-acute.
29 * In Unicode, this can be encoded as a single character (the
33 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
36 * or as two separate characters (the "decomposed" form):
39 * 0041 LATIN CAPITAL LETTER A
40 * 0301 COMBINING ACUTE ACCENT
43 * To a user of your program, however, both of these sequences should be
44 * treated as the same "user-level" character "A with acute accent". When you
45 * are searching or comparing text, you must ensure that these two sequences are
46 * treated equivalently. In addition, you must handle characters with more than
47 * one accent. Sometimes the order of a character's combining accents is
48 * significant, while in other cases accent sequences in different orders are
51 * Similarly, the string "ffi" can be encoded as three separate letters:
54 * 0066 LATIN SMALL LETTER F
55 * 0066 LATIN SMALL LETTER F
56 * 0069 LATIN SMALL LETTER I
59 * or as the single character
62 * FB03 LATIN SMALL LIGATURE FFI
65 * The ffi ligature is not a distinct semantic character, and strictly speaking
66 * it shouldn't be in Unicode at all, but it was included for compatibility
67 * with existing character sets that already provided it. The Unicode standard
68 * identifies such characters by giving them "compatibility" decompositions
69 * into the corresponding semantic characters. When sorting and searching, you
70 * will often want to use these mappings.
72 * <code>normalize</code> helps solve these problems by transforming text into
73 * the canonical composed and decomposed forms as shown in the first example
74 * above. In addition, you can have it perform compatibility decompositions so
75 * that you can treat compatibility characters the same as their equivalents.
76 * Finally, <code>normalize</code> rearranges accents into the proper canonical
77 * order, so that you do not have to worry about accent rearrangement on your
80 * Form FCD, "Fast C or D", is also designed for collation.
81 * It allows working on strings that are not necessarily normalized
82 * with an algorithm (like in collation) that works under "canonical closure",
83 * i.e., it treats precomposed characters and their decomposed equivalents the
86 * It is not a normalization form because it does not provide for uniqueness of
87 * representation. Multiple strings may be canonically equivalent (their NFDs
88 * are identical) and may all conform to FCD without being identical themselves.
90 * The form is defined such that the "raw decomposition", the recursive
91 * canonical decomposition of each character, results in a string that is
92 * canonically ordered. This means that precomposed characters are allowed for
93 * as long as their decompositions do not need canonical reordering.
95 * Its advantage for a process like collation is that all NFD and most NFC texts
96 * - and many unnormalized texts - already conform to FCD and do not need to be
97 * normalized (NFD) for such a process. The FCD quick check will return YES for
98 * most strings in practice.
100 * normalize(FCD) may be implemented with NFD.
102 * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
103 * http://www.unicode.org/notes/tn5/#FCD
105 * ICU collation performs either NFD or FCD normalization automatically if
106 * normalization is turned on for the collator object. Beyond collation and
107 * string search, normalized strings may be useful for string equivalence
108 * comparisons, transliteration/transcription, unique representations, etc.
110 * The W3C generally recommends exchanging texts in NFC.
111 * Note also that most legacy character encodings use only precomposed forms and
112 * often do not encode any combining marks by themselves. For conversion to such
113 * character encodings the Unicode text needs to be normalized to NFC.
114 * For more usage examples, see the Unicode Standard Annex.
116 * Note: The Normalizer class also provides API for iterative normalization.
117 * While the setIndex() and getIndex() refer to indices in the
118 * underlying Unicode input text, the next() and previous() methods
119 * iterate through characters in the normalized output.
120 * This means that there is not necessarily a one-to-one correspondence
121 * between characters returned by next() and previous() and the indices
122 * passed to and returned from setIndex() and getIndex().
123 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
127 public final class Normalizer implements Cloneable {
128 // The input text and our position in it
129 private UCharacterIterator text;
130 private Normalizer2 norm2;
134 // The normalization buffer is the result of normalization
135 // of the source in [currentIndex..nextIndex[ .
136 private int currentIndex;
137 private int nextIndex;
139 // A buffer for holding intermediate results
140 private StringBuilder buffer;
141 private int bufferPos;
143 // Helper classes to defer loading of normalization data.
144 private static final class ModeImpl {
145 private ModeImpl(Normalizer2 n2) {
148 private final Normalizer2 normalizer2;
150 private static final class NFDModeImpl {
151 private static final ModeImpl INSTANCE =
152 new ModeImpl(Norm2AllModes.getNFCInstance().decomp);
154 private static final class NFKDModeImpl {
155 private static final ModeImpl INSTANCE =
156 new ModeImpl(Norm2AllModes.getNFKCInstance().decomp);
158 private static final class NFCModeImpl {
159 private static final ModeImpl INSTANCE =
160 new ModeImpl(Norm2AllModes.getNFCInstance().comp);
162 private static final class NFKCModeImpl {
163 private static final ModeImpl INSTANCE =
164 new ModeImpl(Norm2AllModes.getNFKCInstance().comp);
166 private static final class FCDModeImpl {
167 private static final ModeImpl INSTANCE =
168 new ModeImpl(Norm2AllModes.getFCDNormalizer2());
171 private static final class Unicode32 {
172 private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
174 private static final class NFD32ModeImpl {
175 private static final ModeImpl INSTANCE =
176 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().decomp,
177 Unicode32.INSTANCE));
179 private static final class NFKD32ModeImpl {
180 private static final ModeImpl INSTANCE =
181 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().decomp,
182 Unicode32.INSTANCE));
184 private static final class NFC32ModeImpl {
185 private static final ModeImpl INSTANCE =
186 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().comp,
187 Unicode32.INSTANCE));
189 private static final class NFKC32ModeImpl {
190 private static final ModeImpl INSTANCE =
191 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().comp,
192 Unicode32.INSTANCE));
194 private static final class FCD32ModeImpl {
195 private static final ModeImpl INSTANCE =
196 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
197 Unicode32.INSTANCE));
201 * Options bit set value to select Unicode 3.2 normalization
202 * (except NormalizationCorrections).
203 * At most one Unicode version can be selected at a time.
206 public static final int UNICODE_3_2=0x20;
209 * Constant indicating that the end of the iteration has been reached.
210 * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
213 public static final int DONE = UCharacterIterator.DONE;
216 * Constants for normalization modes.
218 * The Mode class is not intended for public subclassing.
219 * Only the Mode constants provided by the Normalizer class should be used,
220 * and any fields or methods should not be called or overridden by users.
223 public static abstract class Mode {
226 * @deprecated This API is ICU internal only.
228 protected abstract Normalizer2 getNormalizer2(int options);
231 private static final class NONEMode extends Mode {
232 protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
234 private static final class NFDMode extends Mode {
235 protected Normalizer2 getNormalizer2(int options) {
236 return (options&UNICODE_3_2) != 0 ?
237 NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
240 private static final class NFKDMode extends Mode {
241 protected Normalizer2 getNormalizer2(int options) {
242 return (options&UNICODE_3_2) != 0 ?
243 NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
246 private static final class NFCMode extends Mode {
247 protected Normalizer2 getNormalizer2(int options) {
248 return (options&UNICODE_3_2) != 0 ?
249 NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
252 private static final class NFKCMode extends Mode {
253 protected Normalizer2 getNormalizer2(int options) {
254 return (options&UNICODE_3_2) != 0 ?
255 NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
258 private static final class FCDMode extends Mode {
259 protected Normalizer2 getNormalizer2(int options) {
260 return (options&UNICODE_3_2) != 0 ?
261 FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
266 * No decomposition/composition.
269 public static final Mode NONE = new NONEMode();
272 * Canonical decomposition.
275 public static final Mode NFD = new NFDMode();
278 * Compatibility decomposition.
281 public static final Mode NFKD = new NFKDMode();
284 * Canonical decomposition followed by canonical composition.
287 public static final Mode NFC = new NFCMode();
290 * Default normalization.
293 public static final Mode DEFAULT = NFC;
296 * Compatibility decomposition followed by canonical composition.
299 public static final Mode NFKC =new NFKCMode();
302 * "Fast C or D" form.
305 public static final Mode FCD = new FCDMode();
308 * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
309 * and the static {@link #normalize normalize} method. This value tells
310 * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
311 * from the underlying String or CharacterIterator. If you have code which
312 * requires raw text at some times and normalized text at others, you can
313 * use <tt>NO_OP</tt> for the cases where you want raw text, rather
314 * than having a separate code path that bypasses <tt>Normalizer</tt>
318 * @deprecated ICU 2.8. Use Normalizer.NONE
321 public static final Mode NO_OP = NONE;
324 * Canonical decomposition followed by canonical composition. Used with the
325 * {@link com.ibm.icu.text.Normalizer constructors} and the static
326 * {@link #normalize normalize} method to determine the operation to be
329 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
330 * off, this operation produces output that is in
331 * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
336 * @deprecated ICU 2.8. Use Normalizer.NFC
339 public static final Mode COMPOSE = NFC;
342 * Compatibility decomposition followed by canonical composition.
343 * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
344 * {@link #normalize normalize} method to determine the operation to be
347 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
348 * off, this operation produces output that is in
349 * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
354 * @deprecated ICU 2.8. Use Normalizer.NFKC
357 public static final Mode COMPOSE_COMPAT = NFKC;
360 * Canonical decomposition. This value is passed to the
361 * {@link com.ibm.icu.text.Normalizer constructors} and the static
362 * {@link #normalize normalize}
363 * method to determine the operation to be performed.
365 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
366 * off, this operation produces output that is in
367 * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
372 * @deprecated ICU 2.8. Use Normalizer.NFD
375 public static final Mode DECOMP = NFD;
378 * Compatibility decomposition. This value is passed to the
379 * {@link com.ibm.icu.text.Normalizer constructors} and the static
380 * {@link #normalize normalize}
381 * method to determine the operation to be performed.
383 * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
384 * off, this operation produces output that is in
385 * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
390 * @deprecated ICU 2.8. Use Normalizer.NFKD
393 public static final Mode DECOMP_COMPAT = NFKD;
396 * Option to disable Hangul/Jamo composition and decomposition.
397 * This option applies to Korean text,
398 * which can be represented either in the Jamo alphabet or in Hangul
399 * characters, which are really just two or three Jamo combined
400 * into one visual glyph. Since Jamo takes up more storage space than
401 * Hangul, applications that process only Hangul text may wish to turn
402 * this option on when decomposing text.
404 * The Unicode standard treats Hangul to Jamo conversion as a
405 * canonical decomposition, so this option must be turned <b>off</b> if you
406 * wish to transform strings into one of the standard
407 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
408 * Unicode Normalization Forms</a>.
411 * @deprecated ICU 2.8. This option is no longer supported.
413 public static final int IGNORE_HANGUL = 0x0001;
416 * Result values for quickCheck().
417 * For details see Unicode Technical Report 15.
420 public static final class QuickCheckResult{
421 //private int resultValue;
422 private QuickCheckResult(int value) {
427 * Indicates that string is not in the normalized format
430 public static final QuickCheckResult NO = new QuickCheckResult(0);
433 * Indicates that string is in the normalized format
436 public static final QuickCheckResult YES = new QuickCheckResult(1);
439 * Indicates it cannot be determined if string is in the normalized
440 * format without further thorough checks.
443 public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
446 * Option bit for compare:
447 * Case sensitively compare the strings
450 public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT;
453 * Option bit for compare:
454 * Both input strings are assumed to fulfill FCD conditions.
457 public static final int INPUT_IS_FCD = 0x20000;
460 * Option bit for compare:
461 * Perform case-insensitive comparison.
464 public static final int COMPARE_IGNORE_CASE = 0x10000;
467 * Option bit for compare:
468 * Compare strings in code point order instead of code unit order.
471 public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
474 * Option value for case folding:
475 * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
476 * and dotless i appropriately for Turkic languages (tr, az).
477 * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
480 public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
483 * Lowest-order bit number of compare() options bits corresponding to
484 * normalization options bits.
486 * The options parameter for compare() uses most bits for
487 * itself and for various comparison and folding flags.
488 * The most significant bits, however, are shifted down and passed on
489 * to the normalization implementation.
490 * (That is, from compare(..., options, ...),
491 * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
492 * internal normalization functions.)
497 public static final int COMPARE_NORM_OPTIONS_SHIFT = 20;
499 //-------------------------------------------------------------------------
500 // Iterator constructors
501 //-------------------------------------------------------------------------
504 * Creates a new <tt>Normalizer</tt> object for iterating over the
505 * normalized form of a given string.
507 * The <tt>options</tt> parameter specifies which optional
508 * <tt>Normalizer</tt> features are to be enabled for this object.
510 * @param str The string to be normalized. The normalization
511 * will start at the beginning of the string.
513 * @param mode The normalization mode.
515 * @param opt Any optional features to be enabled.
516 * Currently the only available option is {@link #UNICODE_3_2}.
517 * If you want the default behavior corresponding to one of the
518 * standard Unicode Normalization Forms, use 0 for this argument.
521 public Normalizer(String str, Mode mode, int opt) {
522 this.text = UCharacterIterator.getInstance(str);
525 norm2 = mode.getNormalizer2(opt);
526 buffer = new StringBuilder();
530 * Creates a new <tt>Normalizer</tt> object for iterating over the
531 * normalized form of the given text.
533 * @param iter The input text to be normalized. The normalization
534 * will start at the beginning of the string.
536 * @param mode The normalization mode.
538 * @param opt Any optional features to be enabled.
539 * Currently the only available option is {@link #UNICODE_3_2}.
540 * If you want the default behavior corresponding to one of the
541 * standard Unicode Normalization Forms, use 0 for this argument.
544 public Normalizer(CharacterIterator iter, Mode mode, int opt) {
545 this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
548 norm2 = mode.getNormalizer2(opt);
549 buffer = new StringBuilder();
553 * Creates a new <tt>Normalizer</tt> object for iterating over the
554 * normalized form of the given text.
556 * @param iter The input text to be normalized. The normalization
557 * will start at the beginning of the string.
559 * @param mode The normalization mode.
560 * @param options The normalization options, ORed together (0 for no options).
563 public Normalizer(UCharacterIterator iter, Mode mode, int options) {
565 this.text = (UCharacterIterator)iter.clone();
567 this.options = options;
568 norm2 = mode.getNormalizer2(options);
569 buffer = new StringBuilder();
570 } catch (CloneNotSupportedException e) {
571 throw new IllegalStateException(e.toString());
576 * Clones this <tt>Normalizer</tt> object. All properties of this
577 * object are duplicated in the new object, including the cloning of any
578 * {@link CharacterIterator} that was passed in to the constructor
579 * or to {@link #setText(CharacterIterator) setText}.
580 * However, the text storage underlying
581 * the <tt>CharacterIterator</tt> is not duplicated unless the
582 * iterator's <tt>clone</tt> method does so.
585 public Object clone() {
587 Normalizer copy = (Normalizer) super.clone();
588 copy.text = (UCharacterIterator) text.clone();
590 copy.options = options;
592 copy.buffer = new StringBuilder(buffer);
593 copy.bufferPos = bufferPos;
594 copy.currentIndex = currentIndex;
595 copy.nextIndex = nextIndex;
598 catch (CloneNotSupportedException e) {
599 throw new IllegalStateException(e);
603 //--------------------------------------------------------------------------
604 // Static Utility methods
605 //--------------------------------------------------------------------------
607 private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
608 return (compat ? NFKC : NFC).getNormalizer2(options);
610 private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
611 return (compat ? NFKD : NFD).getNormalizer2(options);
616 * The string will be composed to according to the specified mode.
617 * @param str The string to compose.
618 * @param compat If true the string will be composed according to
619 * NFKC rules and if false will be composed according to
621 * @return String The composed string
624 public static String compose(String str, boolean compat) {
625 return compose(str,compat,0);
630 * The string will be composed to according to the specified mode.
631 * @param str The string to compose.
632 * @param compat If true the string will be composed according to
633 * NFKC rules and if false will be composed according to
635 * @param options The only recognized option is UNICODE_3_2
636 * @return String The composed string
639 public static String compose(String str, boolean compat, int options) {
640 return getComposeNormalizer2(compat, options).normalize(str);
645 * The string will be composed to according to the specified mode.
646 * @param source The char array to compose.
647 * @param target A char buffer to receive the normalized text.
648 * @param compat If true the char array will be composed according to
649 * NFKC rules and if false will be composed according to
651 * @param options The normalization options, ORed together (0 for no options).
652 * @return int The total buffer size needed;if greater than length of
653 * result, the output was truncated.
654 * @exception IndexOutOfBoundsException if target.length is less than the
658 public static int compose(char[] source,char[] target, boolean compat, int options) {
659 return compose(source, 0, source.length, target, 0, target.length, compat, options);
664 * The string will be composed according to the specified mode.
665 * @param src The char array to compose.
666 * @param srcStart Start index of the source
667 * @param srcLimit Limit index of the source
668 * @param dest The char buffer to fill in
669 * @param destStart Start index of the destination buffer
670 * @param destLimit End index of the destination buffer
671 * @param compat If true the char array will be composed according to
672 * NFKC rules and if false will be composed according to
674 * @param options The normalization options, ORed together (0 for no options).
675 * @return int The total buffer size needed;if greater than length of
676 * result, the output was truncated.
677 * @exception IndexOutOfBoundsException if target.length is less than the
681 public static int compose(char[] src,int srcStart, int srcLimit,
682 char[] dest,int destStart, int destLimit,
683 boolean compat, int options) {
684 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
685 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
686 getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
691 * Decompose a string.
692 * The string will be decomposed to according to the specified mode.
693 * @param str The string to decompose.
694 * @param compat If true the string will be decomposed according to NFKD
695 * rules and if false will be decomposed according to NFD
697 * @return String The decomposed string
700 public static String decompose(String str, boolean compat) {
701 return decompose(str,compat,0);
705 * Decompose a string.
706 * The string will be decomposed to according to the specified mode.
707 * @param str The string to decompose.
708 * @param compat If true the string will be decomposed according to NFKD
709 * rules and if false will be decomposed according to NFD
711 * @param options The normalization options, ORed together (0 for no options).
712 * @return String The decomposed string
715 public static String decompose(String str, boolean compat, int options) {
716 return getDecomposeNormalizer2(compat, options).normalize(str);
720 * Decompose a string.
721 * The string will be decomposed to according to the specified mode.
722 * @param source The char array to decompose.
723 * @param target A char buffer to receive the normalized text.
724 * @param compat If true the char array will be decomposed according to NFKD
725 * rules and if false will be decomposed according to
727 * @return int The total buffer size needed;if greater than length of
728 * result,the output was truncated.
729 * @param options The normalization options, ORed together (0 for no options).
730 * @exception IndexOutOfBoundsException if the target capacity is less than
731 * the required length
734 public static int decompose(char[] source,char[] target, boolean compat, int options) {
735 return decompose(source, 0, source.length, target, 0, target.length, compat, options);
739 * Decompose a string.
740 * The string will be decomposed according to the specified mode.
741 * @param src The char array to decompose.
742 * @param srcStart Start index of the source
743 * @param srcLimit Limit index of the source
744 * @param dest The char buffer to fill in
745 * @param destStart Start index of the destination buffer
746 * @param destLimit End index of the destination buffer
747 * @param compat If true the char array will be decomposed according to NFKD
748 * rules and if false will be decomposed according to
750 * @param options The normalization options, ORed together (0 for no options).
751 * @return int The total buffer size needed;if greater than length of
752 * result,the output was truncated.
753 * @exception IndexOutOfBoundsException if the target capacity is less than
754 * the required length
757 public static int decompose(char[] src,int srcStart, int srcLimit,
758 char[] dest,int destStart, int destLimit,
759 boolean compat, int options) {
760 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
761 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
762 getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
767 * Normalizes a <tt>String</tt> using the given normalization operation.
769 * The <tt>options</tt> parameter specifies which optional
770 * <tt>Normalizer</tt> features are to be enabled for this operation.
771 * Currently the only available option is {@link #UNICODE_3_2}.
772 * If you want the default behavior corresponding to one of the standard
773 * Unicode Normalization Forms, use 0 for this argument.
775 * @param str the input string to be normalized.
776 * @param mode the normalization mode
777 * @param options the optional features to be enabled.
778 * @return String the normalized string
781 public static String normalize(String str, Mode mode, int options) {
782 return mode.getNormalizer2(options).normalize(str);
786 * Normalize a string.
787 * The string will be normalized according to the specified normalization
789 * @param src The string to normalize.
790 * @param mode The normalization mode; one of Normalizer.NONE,
791 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
792 * Normalizer.NFKD, Normalizer.DEFAULT
793 * @return the normalized string
797 public static String normalize(String src,Mode mode) {
798 return normalize(src, mode, 0);
801 * Normalize a string.
802 * The string will be normalized according to the specified normalization
804 * @param source The char array to normalize.
805 * @param target A char buffer to receive the normalized text.
806 * @param mode The normalization mode; one of Normalizer.NONE,
807 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
808 * Normalizer.NFKD, Normalizer.DEFAULT
809 * @param options The normalization options, ORed together (0 for no options).
810 * @return int The total buffer size needed;if greater than length of
811 * result, the output was truncated.
812 * @exception IndexOutOfBoundsException if the target capacity is less
813 * than the required length
816 public static int normalize(char[] source,char[] target, Mode mode, int options) {
817 return normalize(source,0,source.length,target,0,target.length,mode, options);
821 * Normalize a string.
822 * The string will be normalized according to the specified normalization
824 * @param src The char array to normalize.
825 * @param srcStart Start index of the source
826 * @param srcLimit Limit index of the source
827 * @param dest The char buffer to fill in
828 * @param destStart Start index of the destination buffer
829 * @param destLimit End index of the destination buffer
830 * @param mode The normalization mode; one of Normalizer.NONE,
831 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
832 * Normalizer.NFKD, Normalizer.DEFAULT
833 * @param options The normalization options, ORed together (0 for no options).
834 * @return int The total buffer size needed;if greater than length of
835 * result, the output was truncated.
836 * @exception IndexOutOfBoundsException if the target capacity is
837 * less than the required length
840 public static int normalize(char[] src,int srcStart, int srcLimit,
841 char[] dest,int destStart, int destLimit,
842 Mode mode, int options) {
843 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
844 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
845 mode.getNormalizer2(options).normalize(srcBuffer, app);
850 * Normalize a codepoint according to the given mode
851 * @param char32 The input string to be normalized.
852 * @param mode The normalization mode
853 * @param options Options for use with exclusion set and tailored Normalization
854 * The only option that is currently recognized is UNICODE_3_2
855 * @return String The normalized string
859 public static String normalize(int char32, Mode mode, int options) {
860 if(mode == NFD && options == 0) {
861 String decomposition =
862 Norm2AllModes.getNFCInstance().impl.getDecomposition(char32);
863 if(decomposition == null) {
864 decomposition = UTF16.valueOf(char32);
866 return decomposition;
868 return normalize(UTF16.valueOf(char32), mode, options);
872 * Convenience method to normalize a codepoint according to the given mode
873 * @param char32 The input string to be normalized.
874 * @param mode The normalization mode
875 * @return String The normalized string
878 public static String normalize(int char32, Mode mode) {
879 return normalize(char32, mode, 0);
883 * Convenience method.
885 * @param source string for determining if it is in a normalized format
886 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
887 * Normalizer.NFKC,Normalizer.NFKD)
888 * @return Return code to specify if the text is normalized or not
889 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
892 public static QuickCheckResult quickCheck(String source, Mode mode) {
893 return quickCheck(source, mode, 0);
897 * Performing quick check on a string, to quickly determine if the string is
898 * in a particular normalization format.
899 * Three types of result can be returned Normalizer.YES, Normalizer.NO or
900 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
901 * string is in the desired normalized format, Normalizer.NO determines that
902 * argument string is not in the desired normalized format. A
903 * Normalizer.MAYBE result indicates that a more thorough check is required,
904 * the user may have to put the string in its normalized form and compare
907 * @param source string for determining if it is in a normalized format
908 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
909 * Normalizer.NFKC,Normalizer.NFKD)
910 * @param options Options for use with exclusion set and tailored Normalization
911 * The only option that is currently recognized is UNICODE_3_2
912 * @return Return code to specify if the text is normalized or not
913 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
916 public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
917 return mode.getNormalizer2(options).quickCheck(source);
921 * Convenience method.
923 * @param source Array of characters for determining if it is in a
925 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
926 * Normalizer.NFKC,Normalizer.NFKD)
927 * @param options Options for use with exclusion set and tailored Normalization
928 * The only option that is currently recognized is UNICODE_3_2
929 * @return Return code to specify if the text is normalized or not
930 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
933 public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
934 return quickCheck(source, 0, source.length, mode, options);
938 * Performing quick check on a string, to quickly determine if the string is
939 * in a particular normalization format.
940 * Three types of result can be returned Normalizer.YES, Normalizer.NO or
941 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
942 * string is in the desired normalized format, Normalizer.NO determines that
943 * argument string is not in the desired normalized format. A
944 * Normalizer.MAYBE result indicates that a more thorough check is required,
945 * the user may have to put the string in its normalized form and compare
948 * @param source string for determining if it is in a normalized format
949 * @param start the start index of the source
950 * @param limit the limit index of the source it is equal to the length
951 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
952 * Normalizer.NFKC,Normalizer.NFKD)
953 * @param options Options for use with exclusion set and tailored Normalization
954 * The only option that is currently recognized is UNICODE_3_2
955 * @return Return code to specify if the text is normalized or not
956 * (Normalizer.YES, Normalizer.NO or
961 public static QuickCheckResult quickCheck(char[] source,int start,
962 int limit, Mode mode,int options) {
963 CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
964 return mode.getNormalizer2(options).quickCheck(srcBuffer);
968 * Test if a string is in a given normalization form.
969 * This is semantically equivalent to source.equals(normalize(source, mode)).
971 * Unlike quickCheck(), this function returns a definitive result,
973 * For NFD, NFKD, and FCD, both functions work exactly the same.
974 * For NFC and NFKC where quickCheck may return "maybe", this function will
975 * perform further tests to arrive at a true/false result.
976 * @param src The input array of characters to be checked to see if
978 * @param start The start index in the source
979 * @param limit The limit index in the source
980 * @param mode the normalization mode
981 * @param options Options for use with exclusion set and tailored Normalization
982 * The only option that is currently recognized is UNICODE_3_2
983 * @return Boolean value indicating whether the source string is in the
984 * "mode" normalization form
987 public static boolean isNormalized(char[] src,int start,
988 int limit, Mode mode,
990 CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
991 return mode.getNormalizer2(options).isNormalized(srcBuffer);
995 * Test if a string is in a given normalization form.
996 * This is semantically equivalent to source.equals(normalize(source, mode)).
998 * Unlike quickCheck(), this function returns a definitive result,
1000 * For NFD, NFKD, and FCD, both functions work exactly the same.
1001 * For NFC and NFKC where quickCheck may return "maybe", this function will
1002 * perform further tests to arrive at a true/false result.
1003 * @param str the input string to be checked to see if it is
1005 * @param mode the normalization mode
1006 * @param options Options for use with exclusion set and tailored Normalization
1007 * The only option that is currently recognized is UNICODE_3_2
1008 * @see #isNormalized
1011 public static boolean isNormalized(String str, Mode mode, int options) {
1012 return mode.getNormalizer2(options).isNormalized(str);
1016 * Convenience Method
1017 * @param char32 the input code point to be checked to see if it is
1019 * @param mode the normalization mode
1020 * @param options Options for use with exclusion set and tailored Normalization
1021 * The only option that is currently recognized is UNICODE_3_2
1023 * @see #isNormalized
1026 public static boolean isNormalized(int char32, Mode mode,int options) {
1027 return isNormalized(UTF16.valueOf(char32), mode, options);
1031 * Compare two strings for canonical equivalence.
1032 * Further options include case-insensitive comparison and
1033 * code point order (as opposed to code unit order).
1035 * Canonical equivalence between two strings is defined as their normalized
1036 * forms (NFD or NFC) being identical.
1037 * This function compares strings incrementally instead of normalizing
1038 * (and optionally case-folding) both strings entirely,
1039 * improving performance significantly.
1041 * Bulk normalization is only necessary if the strings do not fulfill the
1042 * FCD conditions. Only in this case, and only if the strings are relatively
1043 * long, is memory allocated temporarily.
1044 * For FCD strings and short non-FCD strings there is no memory allocation.
1046 * Semantically, this is equivalent to
1047 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1048 * where code point order and foldCase are all optional.
1050 * @param s1 First source character array.
1051 * @param s1Start start index of source
1052 * @param s1Limit limit of the source
1054 * @param s2 Second source character array.
1055 * @param s2Start start index of the source
1056 * @param s2Limit limit of the source
1058 * @param options A bit set of options:
1059 * - FOLD_CASE_DEFAULT or 0 is used for default options:
1060 * Case-sensitive comparison in code unit order, and the input strings
1061 * are quick-checked for FCD.
1064 * Set if the caller knows that both s1 and s2 fulfill the FCD
1065 * conditions. If not set, the function will quickCheck for FCD
1066 * and normalize if necessary.
1068 * - COMPARE_CODE_POINT_ORDER
1069 * Set to choose code point order instead of code unit order
1071 * - COMPARE_IGNORE_CASE
1072 * Set to compare strings case-insensitively using case folding,
1073 * instead of case-sensitively.
1074 * If set, then the following case folding options are used.
1077 * @return <0 or 0 or >0 as usual for string comparisons
1083 public static int compare(char[] s1, int s1Start, int s1Limit,
1084 char[] s2, int s2Start, int s2Limit,
1086 if( s1==null || s1Start<0 || s1Limit<0 ||
1087 s2==null || s2Start<0 || s2Limit<0 ||
1088 s1Limit<s1Start || s2Limit<s2Start
1090 throw new IllegalArgumentException();
1092 return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1093 CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1098 * Compare two strings for canonical equivalence.
1099 * Further options include case-insensitive comparison and
1100 * code point order (as opposed to code unit order).
1102 * Canonical equivalence between two strings is defined as their normalized
1103 * forms (NFD or NFC) being identical.
1104 * This function compares strings incrementally instead of normalizing
1105 * (and optionally case-folding) both strings entirely,
1106 * improving performance significantly.
1108 * Bulk normalization is only necessary if the strings do not fulfill the
1109 * FCD conditions. Only in this case, and only if the strings are relatively
1110 * long, is memory allocated temporarily.
1111 * For FCD strings and short non-FCD strings there is no memory allocation.
1113 * Semantically, this is equivalent to
1114 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1115 * where code point order and foldCase are all optional.
1117 * @param s1 First source string.
1118 * @param s2 Second source string.
1120 * @param options A bit set of options:
1121 * - FOLD_CASE_DEFAULT or 0 is used for default options:
1122 * Case-sensitive comparison in code unit order, and the input strings
1123 * are quick-checked for FCD.
1126 * Set if the caller knows that both s1 and s2 fulfill the FCD
1127 * conditions. If not set, the function will quickCheck for FCD
1128 * and normalize if necessary.
1130 * - COMPARE_CODE_POINT_ORDER
1131 * Set to choose code point order instead of code unit order
1133 * - COMPARE_IGNORE_CASE
1134 * Set to compare strings case-insensitively using case folding,
1135 * instead of case-sensitively.
1136 * If set, then the following case folding options are used.
1138 * @return <0 or 0 or >0 as usual for string comparisons
1144 public static int compare(String s1, String s2, int options) {
1145 return internalCompare(s1, s2, options);
1149 * Compare two strings for canonical equivalence.
1150 * Further options include case-insensitive comparison and
1151 * code point order (as opposed to code unit order).
1152 * Convenience method.
1154 * @param s1 First source string.
1155 * @param s2 Second source string.
1157 * @param options A bit set of options:
1158 * - FOLD_CASE_DEFAULT or 0 is used for default options:
1159 * Case-sensitive comparison in code unit order, and the input strings
1160 * are quick-checked for FCD.
1163 * Set if the caller knows that both s1 and s2 fulfill the FCD
1164 * conditions. If not set, the function will quickCheck for FCD
1165 * and normalize if necessary.
1167 * - COMPARE_CODE_POINT_ORDER
1168 * Set to choose code point order instead of code unit order
1170 * - COMPARE_IGNORE_CASE
1171 * Set to compare strings case-insensitively using case folding,
1172 * instead of case-sensitively.
1173 * If set, then the following case folding options are used.
1175 * @return <0 or 0 or >0 as usual for string comparisons
1181 public static int compare(char[] s1, char[] s2, int options) {
1182 return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1186 * Convenience method that can have faster implementation
1187 * by not allocating buffers.
1188 * @param char32a the first code point to be checked against the
1189 * @param char32b the second code point
1190 * @param options A bit set of options
1193 public static int compare(int char32a, int char32b, int options) {
1194 return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1198 * Convenience method that can have faster implementation
1199 * by not allocating buffers.
1200 * @param char32a the first code point to be checked against
1201 * @param str2 the second string
1202 * @param options A bit set of options
1205 public static int compare(int char32a, String str2, int options) {
1206 return internalCompare(UTF16.valueOf(char32a), str2, options);
1209 /* Concatenation of normalized strings --------------------------------- */
1211 * Concatenate normalized strings, making sure that the result is normalized
1214 * If both the left and the right strings are in
1215 * the normalization form according to "mode",
1216 * then the result will be
1219 * dest=normalize(left+right, mode)
1222 * With the input strings already being normalized,
1223 * this function will use next() and previous()
1224 * to find the adjacent end pieces of the input strings.
1225 * Only the concatenation of these end pieces will be normalized and
1226 * then concatenated with the remaining parts of the input strings.
1228 * It is allowed to have dest==left to avoid copying the entire left string.
1230 * @param left Left source array, may be same as dest.
1231 * @param leftStart start in the left array.
1232 * @param leftLimit limit in the left array (==length)
1233 * @param right Right source array.
1234 * @param rightStart start in the right array.
1235 * @param rightLimit limit in the right array (==length)
1236 * @param dest The output buffer; can be null if destStart==destLimit==0
1237 * for pure preflighting.
1238 * @param destStart start in the destination array
1239 * @param destLimit limit in the destination array (==length)
1240 * @param mode The normalization mode.
1241 * @param options The normalization options, ORed together (0 for no options).
1242 * @return Length of output (number of chars) when successful or
1243 * IndexOutOfBoundsException
1244 * @exception IndexOutOfBoundsException whose message has the string
1245 * representation of destination capacity required.
1249 * @exception IndexOutOfBoundsException if target capacity is less than the
1253 public static int concatenate(char[] left, int leftStart, int leftLimit,
1254 char[] right, int rightStart, int rightLimit,
1255 char[] dest, int destStart, int destLimit,
1256 Normalizer.Mode mode, int options) {
1258 throw new IllegalArgumentException();
1261 /* check for overlapping right and destination */
1262 if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1263 throw new IllegalArgumentException("overlapping right and dst ranges");
1266 /* allow left==dest */
// The left range is copied into a temporary builder before anything is written to dest.
// The builder is pre-sized to both range lengths plus 16 chars of slack.
1267 StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1268 destBuilder.append(left, leftStart, leftLimit-leftStart);
1269 CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
// The mode's normalizer appends the right range onto the builder.
1270 mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1271 int destLength=destBuilder.length();
// Copy out only when the result fits into [destStart, destLimit).
1272 if(destLength<=(destLimit-destStart)) {
1273 destBuilder.getChars(0, destLength, dest, destStart);
// The exception message carries the required capacity (destLength) as a decimal string.
1276 throw new IndexOutOfBoundsException(Integer.toString(destLength));
1281 * Concatenate normalized strings, making sure that the result is normalized
1284 * If both the left and the right strings are in
1285 * the normalization form according to "mode",
1286 * then the result will be
1289 * dest=normalize(left+right, mode)
1292 * For details see concatenate
1294 * @param left Left source string.
1295 * @param right Right source string.
1296 * @param mode The normalization mode.
1297 * @param options The normalization options, ORed together (0 for no options).
1307 public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1308 StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1309 return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1313 * Concatenate normalized strings, making sure that the result is normalized
1316 * If both the left and the right strings are in
1317 * the normalization form according to "mode",
1318 * then the result will be
1321 * dest=normalize(left+right, mode)
1324 * With the input strings already being normalized,
1325 * this function will use next() and previous()
1326 * to find the adjacent end pieces of the input strings.
1327 * Only the concatenation of these end pieces will be normalized and
1328 * then concatenated with the remaining parts of the input strings.
1330 * @param left Left source string.
1331 * @param right Right source string.
1332 * @param mode The normalization mode.
1333 * @param options The normalization options, ORed together (0 for no options).
1343 public static String concatenate(String left, String right, Mode mode, int options) {
1344 StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1345 return mode.getNormalizer2(options).append(dest, right).toString();
1349 * Gets the FC_NFKC closure value.
1350 * @param c The code point whose closure value is to be retrieved
1351 * @param dest The char array to receive the closure value
1352 * @return the length of the closure value; 0 if there is none
1355 public static int getFC_NFKC_Closure(int c,char[] dest) {
1356 String closure=getFC_NFKC_Closure(c);
1357 int length=closure.length();
1358 if(length!=0 && dest!=null && length<=dest.length) {
1359 closure.getChars(0, length, dest, 0);
1364 * Gets the FC_NFKC closure value.
1365 * @param c The code point whose closure value is to be retrieved
1366 * @return String representation of the closure value; "" if there is none
1369 public static String getFC_NFKC_Closure(int c) {
1370 // Compute the FC_NFKC_Closure on the fly:
1371 // We have the API for complete coverage of Unicode properties, although
1372 // this value by itself is not useful via API.
1373 // (What could be useful is a custom normalization table that combines
1374 // case folding and NFKC.)
1375 // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1376 Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1377 UCaseProps csp=UCaseProps.INSTANCE;
1378 // first: b = NFKC(Fold(a))
1379 StringBuilder folded=new StringBuilder();
// The return value of toFullFolding is examined below in two ways:
// negative, or greater than UCaseProps.MAX_STRING_LENGTH.
1380 int folded1Length=csp.toFullFolding(c, folded, 0);
1381 if(folded1Length<0) {
// Negative result: consult the NFKC data for c directly. A non-zero composition
// quick-check value means there is no closure for c.
1382 Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1383 if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1384 return ""; // c does not change at all under CaseFolding+NFKC
1386 folded.appendCodePoint(c);
// NOTE(review): a result above MAX_STRING_LENGTH is appended as a code point, so
// presumably toFullFolding returns the folded code point itself in that case.
// Confirm against UCaseProps.
1388 if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1389 folded.appendCodePoint(folded1Length);
1392 String kc1=nfkc.normalize(folded);
1393 // second: c = NFKC(Fold(b))
1394 String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1395 // if (c != b) add the mapping from a to c
1396 if(kc1.equals(kc2)) {
1403 //-------------------------------------------------------------------------
1405 //-------------------------------------------------------------------------
1408 * Return the current character in the normalized text.
1409 * @return The codepoint as an int
1412 public int current() {
// Use the buffered normalized text if any remains; otherwise try to normalize the next segment.
1413 if(bufferPos<buffer.length() || nextNormalize()) {
// Read the code point at bufferPos; bufferPos is not changed on this path.
1414 return buffer.codePointAt(bufferPos);
1421 * Return the next character in the normalized text and advance
1422 * the iteration position by one. If the end
1423 * of the text has already been reached, {@link #DONE} is returned.
1424 * @return The codepoint as an int
1428 if(bufferPos<buffer.length() || nextNormalize()) {
1429 int c=buffer.codePointAt(bufferPos);
1430 bufferPos+=Character.charCount(c);
1439 * Return the previous character in the normalized text and decrement
1440 * the iteration position by one. If the beginning
1441 * of the text has already been reached, {@link #DONE} is returned.
1442 * @return The codepoint as an int
1445 public int previous() {
// Read backwards from the buffer if anything precedes bufferPos; otherwise try to
// normalize the preceding segment of the input.
1446 if(bufferPos>0 || previousNormalize()) {
1447 int c=buffer.codePointBefore(bufferPos);
// Step back by the UTF-16 length of that code point (1 or 2 chars).
1448 bufferPos-=Character.charCount(c);
1456 * Reset the index to the beginning of the text.
1457 * This is equivalent to setIndexOnly(startIndex()).
1460 public void reset() {
// Both input-text indices go back to 0.
1462 currentIndex=nextIndex=0;
1467 * Set the iteration position in the input text that is being normalized,
1468 * without any immediate normalization.
1469 * After setIndexOnly(), getIndex() will return the same index that is
1472 * @param index the desired index in the input text.
1475 public void setIndexOnly(int index) {
// Let the underlying iterator validate the index before it is recorded here.
1476 text.setIndex(index); // validates index
// The current and next segment positions both start at the requested index.
1477 currentIndex=nextIndex=index;
1482 * Set the iteration position in the input text that is being normalized
1483 * and return the first normalized character at that position.
1485 * <b>Note:</b> This method sets the position in the <em>input</em> text,
1486 * while {@link #next} and {@link #previous} iterate through characters
1487 * in the normalized <em>output</em>. This means that there is not
1488 * necessarily a one-to-one correspondence between characters returned
1489 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1490 * returned from <tt>setIndex</tt> and {@link #getIndex}.
1492 * @param index the desired index in the input text.
1494 * @return the first normalized character that is the result of iterating
1495 * forward starting at the given index.
1497 * @throws IllegalArgumentException if the given index is less than
1498 * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1499 * @deprecated ICU 3.2
1503 public int setIndex(int index) {
// Positioning is delegated to setIndexOnly, which also validates the index.
1504 setIndexOnly(index);
1509 * Retrieve the index of the start of the input text. This is the begin
1510 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1511 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1512 * @deprecated ICU 2.2. Use startIndex() instead.
1513 * @return The begin index of the input text
1516 public int getBeginIndex() {
1521 * Retrieve the index of the end of the input text. This is the end index
1522 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1523 * over which this <tt>Normalizer</tt> is iterating
1524 * @deprecated ICU 2.2. Use endIndex() instead.
1525 * @return The end index of the input text
1528 public int getEndIndex() {
1532 * Return the first character in the normalized text. This resets
1533 * the <tt>Normalizer's</tt> position to the beginning of the text.
1534 * @return The codepoint as an int
1537 public int first() {
1543 * Return the last character in the normalized text. This resets
1544 * the <tt>Normalizer's</tt> position to be just before the
1545 * input text corresponding to that normalized character.
1546 * @return The codepoint as an int
1551 currentIndex=nextIndex=text.getIndex();
1557 * Retrieve the current iteration position in the input text that is
1558 * being normalized. This method is useful in applications such as
1559 * searching, where you need to be able to determine the position in
1560 * the input text that corresponds to a given normalized output character.
1562 * <b>Note:</b> This method returns the position in the <em>input</em>, while
1563 * {@link #next} and {@link #previous} iterate through characters in the
1564 * <em>output</em>. This means that there is not necessarily a one-to-one
1565 * correspondence between characters returned by <tt>next</tt> and
1566 * <tt>previous</tt> and the indices passed to and returned from
1567 * <tt>setIndex</tt> and {@link #getIndex}.
1568 * @return The current iteration position
1571 public int getIndex() {
// While unread normalized characters remain in the buffer, report currentIndex.
1572 if(bufferPos<buffer.length()) {
1573 return currentIndex;
1580 * Retrieve the index of the start of the input text. This is the begin
1581 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1582 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1583 * @return The start index of the input text
1586 public int startIndex() {
1591 * Retrieve the index of the end of the input text. This is the end index
1592 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1593 * over which this <tt>Normalizer</tt> is iterating
1594 * @return The end index of the input text
1597 public int endIndex() {
1598 return text.getLength();
1601 //-------------------------------------------------------------------------
1602 // Iterator attributes
1603 //-------------------------------------------------------------------------
1605 * Set the normalization mode for this object.
1607 * <b>Note:</b>If the normalization mode is changed while iterating
1608 * over a string, calls to {@link #next} and {@link #previous} may
1609 * return previously buffered characters in the old normalization mode
1610 * until the iteration is able to re-sync at the next base character.
1611 * It is safest to call {@link #setText setText()}, {@link #first},
1612 * {@link #last}, etc. after calling <tt>setMode</tt>.
1614 * @param newMode the new mode for this <tt>Normalizer</tt>.
1615 * The supported modes are:
1617 * <li>{@link #NFC} - Unicode canonical decomposition
1618 * followed by canonical composition.
1619 * <li>{@link #NFKC} - Unicode compatibility decomposition
1620 * followed by canonical composition.
1621 * <li>{@link #NFD} - Unicode canonical decomposition
1622 * <li>{@link #NFKD} - Unicode compatibility decomposition.
1623 * <li>{@link #NONE} - Do nothing but return characters
1624 * from the underlying input text.
1630 public void setMode(Mode newMode) {
// Refresh the cached normalizer from the mode field and the current options.
1632 norm2 = mode.getNormalizer2(options);
1635 * Return the basic operation performed by this <tt>Normalizer</tt>
1640 public Mode getMode() {
1644 * Set options that affect this <tt>Normalizer</tt>'s operation.
1645 * Options do not change the basic composition or decomposition operation
1646 * that is being performed, but they control whether
1647 * certain optional portions of the operation are done.
1648 * Currently the only available option is:
1651 * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1654 * @param option the option whose value is to be set.
1655 * @param value the new setting for the option. Use <tt>true</tt> to
1656 * turn the option on and <tt>false</tt> to turn it off.
1661 public void setOption(int option,boolean value) {
// Clear the bits of 'option' in the option set.
1665 options &= (~option);
// Re-derive the normalizer so it reflects the updated option set.
1667 norm2 = mode.getNormalizer2(options);
1671 * Determine whether an option is turned on or off.
1676 public int getOption(int option) {
// True when at least one bit of 'option' is set in the current options.
1677 if((options & option)!=0) {
1685 * Gets the underlying text storage
1686 * @param fillIn the char buffer to fill the UTF-16 units.
1687 * The length of the buffer should be equal to the length of the
1688 * underlying text storage
1689 * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1693 public int getText(char[] fillIn) {
1694 return text.getText(fillIn);
1698 * Gets the length of underlying text storage
1699 * @return the length
1702 public int getLength() {
1703 return text.getLength();
1707 * Returns the text under iteration as a string
1708 * @return a copy of the text under iteration.
1711 public String getText() {
1712 return text.getText();
1716 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1717 * The iteration position is set to the beginning of the input text.
1718 * @param newText The new string to be normalized.
1721 public void setText(StringBuffer newText) {
// Wrap the new text in a UCharacterIterator; a null result is reported as an illegal state.
1722 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1723 if (newIter == null) {
1724 throw new IllegalStateException("Could not create a new UCharacterIterator");
1731 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1732 * The iteration position is set to the beginning of the input text.
1733 * @param newText The new string to be normalized.
1736 public void setText(char[] newText) {
// Wrap the new text in a UCharacterIterator; a null result is reported as an illegal state.
1737 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1738 if (newIter == null) {
1739 throw new IllegalStateException("Could not create a new UCharacterIterator");
1746 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1747 * The iteration position is set to the beginning of the input text.
1748 * @param newText The new string to be normalized.
1751 public void setText(String newText) {
// Wrap the new text in a UCharacterIterator; a null result is reported as an illegal state.
1752 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1753 if (newIter == null) {
1754 throw new IllegalStateException("Could not create a new UCharacterIterator");
1761 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1762 * The iteration position is set to the beginning of the input text.
1763 * @param newText The new string to be normalized.
1766 public void setText(CharacterIterator newText) {
// Adapt the java.text.CharacterIterator to a UCharacterIterator; a null result is
// reported as an illegal state.
1767 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1768 if (newIter == null) {
1769 throw new IllegalStateException("Could not create a new UCharacterIterator");
1776 * Set the input text over which this <tt>Normalizer</tt> will iterate.
1777 * The iteration position is set to the beginning of the string.
1778 * @param newText The new string to be normalized.
1781 public void setText(UCharacterIterator newText) {
// Work on a clone of the caller's iterator rather than on the caller's instance.
1783 UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1784 if (newIter == null) {
1785 throw new IllegalStateException("Could not create a new UCharacterIterator");
// A failed clone is reported as an illegal state; the original exception is not chained.
1789 }catch(CloneNotSupportedException e) {
1790 throw new IllegalStateException("Could not clone the UCharacterIterator");
1794 private void clearBuffer() {
// Truncate the normalized-text buffer to empty.
1795 buffer.setLength(0);
// Normalizes the next segment of the input text into 'buffer'.
// The visible return reports whether the buffer ended up non-empty.
1799 private boolean nextNormalize() {
// The new segment starts where the previous one ended.
1801 currentIndex=nextIndex;
1802 text.setIndex(nextIndex);
1803 // Skip at least one character so we make progress.
1804 int c=text.nextCodePoint();
1808 StringBuilder segment=new StringBuilder().appendCodePoint(c);
// Keep reading code points; one that has a normalization boundary before it is
// handled specially by moving the iterator back one code point.
1809 while((c=text.nextCodePoint())>=0) {
1810 if(norm2.hasBoundaryBefore(c)) {
1811 text.moveCodePointIndex(-1);
1814 segment.appendCodePoint(c);
// Remember where reading stopped in the input, then normalize the collected segment.
1816 nextIndex=text.getIndex();
1817 norm2.normalize(segment, buffer);
1818 return buffer.length()!=0;
// Normalizes the segment of the input text that precedes currentIndex into 'buffer'.
// The visible return reports whether the buffer ended up non-empty.
1821 private boolean previousNormalize() {
// The segment being built ends where the current one starts.
1823 nextIndex=currentIndex;
1824 text.setIndex(currentIndex);
1825 StringBuilder segment=new StringBuilder();
// Walk backwards, inserting each code point at the front so 'segment' stays in text order.
1827 while((c=text.previousCodePoint())>=0) {
1829 segment.insert(0, (char)c);
1831 segment.insert(0, Character.toChars(c));
1833 if(norm2.hasBoundaryBefore(c)) {
// Record where the backward scan stopped, then normalize the collected segment.
1837 currentIndex=text.getIndex();
1838 norm2.normalize(segment, buffer);
// Position at the end of the buffer so the caller can read backwards from it.
1839 bufferPos=buffer.length();
1840 return buffer.length()!=0;
1843 /* compare canonically equivalent ------------------------------------------- */
1845 // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
1846 private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
// The normalization options are the bits of 'options' at and above COMPARE_NORM_OPTIONS_SHIFT.
1847 int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
// COMPARE_EQUIV is always set in the options that are passed on to cmpEquivFold.
1848 options|= COMPARE_EQUIV;
1851 * UAX #21 Case Mappings, as fixed for Unicode version 4
1852 * (see Jitterbug 2021), defines a canonical caseless match as
1854 * A string X is a canonical caseless match
1855 * for a string Y if and only if
1856 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1858 * For better performance, we check for FCD (or let the caller tell us that
1859 * both strings are in FCD) for the inner normalization.
1860 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
1861 * case-folding preserves the FCD-ness of a string.
1862 * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
1863 * when there is a difference.
1865 * Exception: When using the Turkic case-folding option, we do perform
1866 * full NFD first. This is because in the Turkic case precomposed characters
1867 * with 0049 capital I or 0069 small i fold differently whether they
1868 * are first decomposed or not, so an FCD check - a check only for
1869 * canonical order - is not sufficient.
1871 if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1873 if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1874 n2=NFD.getNormalizer2(normOptions);
1876 n2=FCD.getNormalizer2(normOptions);
1879 // check if s1 and/or s2 fulfill the FCD conditions
1880 int spanQCYes1=n2.spanQuickCheckYes(s1);
1881 int spanQCYes2=n2.spanQuickCheckYes(s2);
1884 * ICU 2.4 had a further optimization:
1885 * If both strings were not in FCD, then they were both NFD'ed,
1886 * and the COMPARE_EQUIV option was turned off.
1887 * It is not entirely clear that this is valid with the current
1888 * definition of the canonical caseless match.
1889 * Therefore, ICU 2.6 removes that optimization.
// For each string that does not pass the quick check in full: keep the prefix that
// passed and normalize only the remainder onto it.
1892 if(spanQCYes1<s1.length()) {
1893 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
1894 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
1896 if(spanQCYes2<s2.length()) {
1897 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
1898 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
1902 return cmpEquivFold(s1, s2, options);
1906 * Compare two strings for canonical equivalence.
1907 * Further options include case-insensitive comparison and
1908 * code point order (as opposed to code unit order).
1910 * In this function, canonical equivalence is optional as well.
1911 * If canonical equivalence is tested, then both strings must fulfill
1914 * Semantically, this is equivalent to
1915 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
1916 * where code point order, NFD and foldCase are all optional.
1918 * String comparisons almost always yield results before processing both strings
1920 * They are generally more efficient working incrementally instead of
1921 * performing the sub-processing (strlen, normalization, case-folding)
1922 * on the entire strings first.
1924 * It is also unnecessary to normalize identical characters.
1926 * This function works in principle as follows:
1929 * get one code unit c1 from s1 (-1 if end of source)
1930 * get one code unit c2 from s2 (-1 if end of source)
1932 * if(either string finished) {
1940 * try to decompose/case-fold c1/c2, and continue if one does;
1942 * // still c1!=c2 and neither decomposes/case-folds, return result
1946 * When a character decomposes, then the pointer for that source changes to
1947 * the decomposition, pushing the previous pointer onto a stack.
1948 * When the end of the decomposition is reached, then the code unit reader
1949 * pops the previous source from the stack.
1950 * (Same for case-folding.)
1952 * This is complicated further by operating on variable-width UTF-16.
1953 * The top part of the loop works on code units, while lookups for decomposition
1954 * and case-folding need code points.
1955 * Code points are assembled after the equality/end-of-source part.
1956 * The source pointer is only advanced beyond all code units when the code point
1957 * actually decomposes/case-folds.
1959 * If we were on a trail surrogate unit when assembling a code point,
1960 * and the code point decomposes/case-folds, then the decomposition/folding
1961 * result must be compared with the part of the other string that corresponds to
1962 * this string's lead surrogate.
1963 * Since we only assemble a code point when hitting a trail unit when the
1964 * preceding lead units were identical, we back up the other string by one unit in such a case.
1967 * The optional code point order comparison at the end works with
1968 * the same fix-up as the other code point order comparison functions.
1969 * See ustring.c and the comment near the end of this function.
1971 * Assumption: A decomposition or case-folding result string never contains
1972 * a single surrogate. This is a safe assumption in the Unicode Standard.
1973 * Therefore, we do not need to check for surrogate pairs across
1974 * decomposition/case-folding boundaries.
1976 * Further assumptions (see verifications tstnorm.cpp):
1977 * The API function checks for FCD first, while the core function
1978 * first case-folds and then decomposes. This requires that case-folding does not
1979 * un-FCD any strings.
1981 * The API function may also NFD the input and turn off decomposition.
1982 * This requires that case-folding does not un-NFD strings either.
1984 * TODO If any of the above two assumptions is violated,
1985 * then this entire code must be re-thought.
1986 * If this happens, then a simple solution is to case-fold both strings up front
1987 * and to turn off UNORM_INPUT_IS_FCD.
1988 * We already do this when not both strings are in FCD because makeFCD
1989 * would be a partial NFD before the case folding, which does not work.
1990 * Note that all of this is only a problem when case-folding _and_
1991 * canonical equivalence come together.
1992 * (Comments in unorm_compare() are more up to date than this TODO.)
1995 /* stack element for previous-level source/decomposition pointers: cmpEquivFold() saves the source it was reading in cs and the read index in s before switching to a decomposition, and restores both when popping */
1996 private static final class CmpEquivLevel {
/*
 * Allocates the per-string level stack used by cmpEquivFold(): an array of two
 * freshly constructed CmpEquivLevel elements.
 * Two elements suffice because cmpEquivFold() only case-folds at level 0 and
 * only decomposes while the level is below 2, so it saves at most levels 0 and 1.
 */
2000 private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2001 return new CmpEquivLevel[] {
2002 new CmpEquivLevel(), new CmpEquivLevel()
2007 * Internal option for unorm_cmpEquivFold() for decomposing.
2008 * If not set, just do strcasecmp().
/* Option bit 0x80000: cmpEquivFold() fetches the NFC implementation and tries getDecomposition() on differing code points only when this bit is set in its options argument. */
2010 private static final int COMPARE_EQUIV=0x80000;
2012 /* internal function; package visibility for use by UTF16.StringComparator */
/*
 * Compares cs1 and cs2 one UTF-16 code unit at a time and only case-folds
 * and/or decomposes the code points at which the two sources differ.
 *
 * cs1, cs2: the texts to compare. While a decomposition is being read, the
 *           source that was current is saved on stack1/stack2 and restored
 *           once the replacement text is exhausted.
 * options:  bit set. This function tests COMPARE_EQUIV (try decompositions),
 *           COMPARE_IGNORE_CASE (try case folding) and
 *           COMPARE_CODE_POINT_ORDER (surrogate fix-up of the final result),
 *           and also passes the whole value to UCaseProps.toFullFolding().
 *
 * Returns 0 when both sources end together, -1 when string 1 ends first and
 * 1 when string 2 ends first. For code units that still differ after folding
 * and decomposition, see the "return difference result" note near the end.
 */
2013 /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2014 Normalizer2Impl nfcImpl;
2017 /* current-level start/limit - s1/s2 as current */
2018 int s1, s2, limit1, limit2;
2020 /* decomposition and case folding variables */
2023 /* stacks of previous-level source (cs) and read index (s); null until the first push allocates them */
2024 CmpEquivLevel[] stack1=null, stack2=null;
2026 /* decomposition strings returned by nfcImpl.getDecomposition(); null means the code point does not decompose */
2027 String decomp1, decomp2;
2029 /* case folding buffers, only use current-level start/limit */
2030 StringBuilder fold1, fold2;
2032 /* track which is the current level per string */
2035 /* current code units, and code points for lookups */
2036 int c1, c2, cp1, cp2;
2038 /* no argument error checking because this itself is not an API */
2041 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2042 * otherwise this function must behave exactly as uprv_strCompare()
2043 * not checking for that here makes testing this function easier
2046 /* normalization/properties data loaded? */
2047 if((options&COMPARE_EQUIV)!=0) {
2048 nfcImpl=Norm2AllModes.getNFCInstance().impl;
2052 if((options&COMPARE_IGNORE_CASE)!=0) {
2053 csp=UCaseProps.INSTANCE;
2054 fold1=new StringBuilder();
2055 fold2=new StringBuilder();
2063 limit1=cs1.length();
2065 limit2=cs2.length();
2070 /* comparison loop */
2073 * here a code unit value of -1 means "get another code unit"
2074 * below it will mean "this source is finished"
2078 /* get next code unit from string 1, post-increment */
2086 c1=cs1.charAt(s1++);
2090 /* reached end of level buffer, pop one level */
2093 cs1=stack1[level1].cs;
2095 s1=stack1[level1].s;
2096 limit1=cs1.length();
2101 /* get next code unit from string 2, post-increment */
2109 c2=cs2.charAt(s2++);
2113 /* reached end of level buffer, pop one level */
2116 cs2=stack2[level2].cs;
2118 s2=stack2[level2].s;
2119 limit2=cs2.length();
2125 * either variable c1, c2 is -1 only if the corresponding string is finished
2129 return 0; /* c1==c2==-1 indicating end of strings */
2131 c1=c2=-1; /* make us fetch new code units */
2134 return -1; /* string 1 ends before string 2 */
2136 return 1; /* string 2 ends before string 1 */
2138 /* c1!=c2 && c1>=0 && c2>=0 */
2140 /* get complete code points for c1, c2 for lookups if either is a surrogate */
2142 if(UTF16.isSurrogate((char)c1)) {
2145 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2146 if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2147 /* advance ++s1; only below if cp1 decomposes/case-folds */
2148 cp1=Character.toCodePoint((char)c1, c);
2150 } else /* isTrail(c1) */ {
2151 if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2152 cp1=Character.toCodePoint(c, (char)c1);
2158 if(UTF16.isSurrogate((char)c2)) {
2161 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2162 if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2163 /* advance ++s2; only below if cp2 decomposes/case-folds */
2164 cp2=Character.toCodePoint((char)c2, c);
2166 } else /* isTrail(c2) */ {
2167 if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2168 cp2=Character.toCodePoint(c, (char)c2);
2174 * go down one level for each string
2175 * continue with the main loop as soon as there is a real change
/* case folding is attempted only while string 1 is at level 0, so text that is itself a folding or decomposition result is never folded again */
2178 if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2179 (length=csp.toFullFolding(cp1, fold1, options))>=0
2181 /* cp1 case-folds to the code point "length" or to the last "length" chars of fold1 */
2182 if(UTF16.isSurrogate((char)c1)) {
2183 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2184 /* advance beyond source surrogate pair if it case-folds */
2186 } else /* isTrail(c1) */ {
2188 * we got a supplementary code point when hitting its trail surrogate,
2189 * therefore the lead surrogate must have been the same as in the other string;
2190 * compare this decomposition with the lead surrogate in the other string
2191 * remember that this simulates bulk text replacement:
2192 * the decomposition would replace the entire code point
2195 c2=cs2.charAt(s2-1);
2199 /* push current level pointers */
2201 stack1=createCmpEquivLevelStack();
2207 /* copy the folding result to fold1[] */
2208 /* Java: the buffer was probably not empty, remove the old contents */
2209 if(length<=UCaseProps.MAX_STRING_LENGTH) {
/* keeps only the last "length" chars of fold1; presumably toFullFolding() appended the folding string after any older contents - TODO confirm in UCaseProps */
2210 fold1.delete(0, fold1.length()-length);
2213 fold1.appendCodePoint(length);
2216 /* set next level pointers to case folding */
2219 limit1=fold1.length();
2221 /* get ready to read from the case folding result, continue with loop */
/* same as above, for string 2 */
2226 if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2227 (length=csp.toFullFolding(cp2, fold2, options))>=0
2229 /* cp2 case-folds to the code point "length" or to the last "length" chars of fold2 */
2230 if(UTF16.isSurrogate((char)c2)) {
2231 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2232 /* advance beyond source surrogate pair if it case-folds */
2234 } else /* isTrail(c2) */ {
2236 * we got a supplementary code point when hitting its trail surrogate,
2237 * therefore the lead surrogate must have been the same as in the other string;
2238 * compare this decomposition with the lead surrogate in the other string
2239 * remember that this simulates bulk text replacement:
2240 * the decomposition would replace the entire code point
2243 c1=cs1.charAt(s1-1);
2247 /* push current level pointers */
2249 stack2=createCmpEquivLevelStack();
2255 /* copy the folding result to fold2[] */
2256 /* Java: the buffer was probably not empty, remove the old contents */
2257 if(length<=UCaseProps.MAX_STRING_LENGTH) {
/* keeps only the last "length" chars of fold2; presumably toFullFolding() appended the folding string after any older contents - TODO confirm in UCaseProps */
2258 fold2.delete(0, fold2.length()-length);
2261 fold2.appendCodePoint(length);
2264 /* set next level pointers to case folding */
2267 limit2=fold2.length();
2269 /* get ready to read from the case folding result, continue with loop */
/* decomposition is attempted while the level is below 2, so both original text (level 0) and a case folding result (level 1) can be decomposed, but a decomposition is not decomposed again */
2274 if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2275 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2277 /* cp1 decomposes into the string decomp1 */
2278 if(UTF16.isSurrogate((char)c1)) {
2279 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2280 /* advance beyond source surrogate pair if it decomposes */
2282 } else /* isTrail(c1) */ {
2284 * we got a supplementary code point when hitting its trail surrogate,
2285 * therefore the lead surrogate must have been the same as in the other string;
2286 * compare this decomposition with the lead surrogate in the other string
2287 * remember that this simulates bulk text replacement:
2288 * the decomposition would replace the entire code point
2291 c2=cs2.charAt(s2-1);
2295 /* push current level pointers */
2297 stack1=createCmpEquivLevelStack();
2299 stack1[level1].cs=cs1;
2300 stack1[level1].s=s1;
2303 /* set empty intermediate level if skipped */
2305 stack1[level1++].cs=null;
2308 /* set next level pointers to decomposition */
2311 limit1=decomp1.length();
2313 /* get ready to read from decomposition, continue with loop */
/* same as above, for string 2 */
2318 if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2319 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2321 /* cp2 decomposes into the string decomp2 */
2322 if(UTF16.isSurrogate((char)c2)) {
2323 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2324 /* advance beyond source surrogate pair if it decomposes */
2326 } else /* isTrail(c2) */ {
2328 * we got a supplementary code point when hitting its trail surrogate,
2329 * therefore the lead surrogate must have been the same as in the other string;
2330 * compare this decomposition with the lead surrogate in the other string
2331 * remember that this simulates bulk text replacement:
2332 * the decomposition would replace the entire code point
2335 c1=cs1.charAt(s1-1);
2339 /* push current level pointers */
2341 stack2=createCmpEquivLevelStack();
2343 stack2[level2].cs=cs2;
2344 stack2[level2].s=s2;
2347 /* set empty intermediate level if skipped */
2349 stack2[level2++].cs=null;
2352 /* set next level pointers to decomposition */
2355 limit2=decomp2.length();
2357 /* get ready to read from decomposition, continue with loop */
2363 * no decomposition/case folding, max level for both sides:
2364 * return difference result
2366 * code point order comparison must not just return cp1-cp2
2367 * because when single surrogates are present then the surrogate pairs
2368 * that formed cp1 and cp2 may be from different string indexes
2370 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2371 * c1=d800 cp1=10001 c2=dc00 cp2=10000
2372 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2374 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2375 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
2376 * so we have slightly different pointer/start/limit comparisons here
2379 if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2380 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
/* s1 was post-incremented when c1 was read, so c1 sits at index s1-1: the next unit is at s1, and 0!=(s1-1) guarantees that a preceding unit exists at s1-2 */
2382 (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2383 (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2385 /* part of a surrogate pair, leave >=d800 */
2387 /* BMP code point - may be surrogate code point - make <d800 */
/* same pair test for c2, which sits at index s2-1 */
2392 (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2393 (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2395 /* part of a surrogate pair, leave >=d800 */
2397 /* BMP code point - may be surrogate code point - make <d800 */
2407 * An Appendable that writes into a char array with a capacity that may be
2408 * less than array.length.
2409 * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2411 * An overflow is only reported at the end, for the old Normalizer API functions that write to char arrays.
2414 private static final class CharsAppendable implements Appendable {
2415 public CharsAppendable(char[] dest, int destStart, int destLimit) {
2417 start=offset=destStart;
2420 public int length() {
2421 int len=offset-start;
2425 throw new IndexOutOfBoundsException(Integer.toString(len));
2428 public Appendable append(char c) {
2435 public Appendable append(CharSequence s) {
2436 return append(s, 0, s.length());
2438 public Appendable append(CharSequence s, int sStart, int sLimit) {
2439 int len=sLimit-sStart;
2440 if(len<=(limit-offset)) {
2441 while(sStart<sLimit) { // TODO: Is there a better way to copy the characters?
2442 chars[offset++]=s.charAt(sStart++);
2450 private final char[] chars;
2451 private final int start, limit;