2 *******************************************************************************
\r
3 * Copyright (C) 2000-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
8 import java.io.IOException;
\r
9 import java.nio.CharBuffer;
\r
10 import java.text.CharacterIterator;
\r
12 import com.ibm.icu.impl.Norm2AllModes;
\r
13 import com.ibm.icu.impl.Normalizer2Impl;
\r
14 import com.ibm.icu.impl.UCaseProps;
\r
15 import com.ibm.icu.lang.UCharacter;
\r
18 * Unicode Normalization
\r
20 * <h2>Unicode normalization API</h2>
\r
22 * <code>normalize</code> transforms Unicode text into an equivalent composed or
\r
23 * decomposed form, allowing for easier sorting and searching of text.
\r
24 * <code>normalize</code> supports the standard normalization forms described in
\r
25 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
\r
26 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
\r
28 * Characters with accents or other adornments can be encoded in
\r
29 * several different ways in Unicode. For example, take the character A-acute.
\r
30 * In Unicode, this can be encoded as a single character (the
\r
34 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE
\r
37 * or as two separate characters (the "decomposed" form):
\r
40 * 0041 LATIN CAPITAL LETTER A
\r
41 * 0301 COMBINING ACUTE ACCENT
\r
44 * To a user of your program, however, both of these sequences should be
\r
45 * treated as the same "user-level" character "A with acute accent". When you
\r
46 * are searching or comparing text, you must ensure that these two sequences are
\r
47 * treated equivalently. In addition, you must handle characters with more than
\r
48 * one accent. Sometimes the order of a character's combining accents is
\r
49 * significant, while in other cases accent sequences in different orders are
\r
50 * really equivalent.
\r
52 * Similarly, the string "ffi" can be encoded as three separate letters:
\r
55 * 0066 LATIN SMALL LETTER F
\r
56 * 0066 LATIN SMALL LETTER F
\r
57 * 0069 LATIN SMALL LETTER I
\r
60 * or as the single character
\r
63 * FB03 LATIN SMALL LIGATURE FFI
\r
66 * The ffi ligature is not a distinct semantic character, and strictly speaking
\r
67 * it shouldn't be in Unicode at all, but it was included for compatibility
\r
68 * with existing character sets that already provided it. The Unicode standard
\r
69 * identifies such characters by giving them "compatibility" decompositions
\r
70 * into the corresponding semantic characters. When sorting and searching, you
\r
71 * will often want to use these mappings.
\r
73 * <code>normalize</code> helps solve these problems by transforming text into
\r
74 * the canonical composed and decomposed forms as shown in the first example
\r
75 * above. In addition, you can have it perform compatibility decompositions so
\r
76 * that you can treat compatibility characters the same as their equivalents.
\r
77 * Finally, <code>normalize</code> rearranges accents into the proper canonical
\r
78 * order, so that you do not have to worry about accent rearrangement on your
\r
81 * Form FCD, "Fast C or D", is also designed for collation.
\r
82 * It allows one to work on strings that are not necessarily normalized
\r
83 * with an algorithm (like in collation) that works under "canonical closure",
\r
84 * i.e., it treats precomposed characters and their decomposed equivalents the
\r
87 * It is not a normalization form because it does not provide for uniqueness of
\r
88 * representation. Multiple strings may be canonically equivalent (their NFDs
\r
89 * are identical) and may all conform to FCD without being identical themselves.
\r
91 * The form is defined such that the "raw decomposition", the recursive
\r
92 * canonical decomposition of each character, results in a string that is
\r
93 * canonically ordered. This means that precomposed characters are allowed for
\r
94 * as long as their decompositions do not need canonical reordering.
\r
96 * Its advantage for a process like collation is that all NFD and most NFC texts
\r
97 * - and many unnormalized texts - already conform to FCD and do not need to be
\r
98 * normalized (NFD) for such a process. The FCD quick check will return YES for
\r
99 * most strings in practice.
\r
101 * normalize(FCD) may be implemented with NFD.
\r
103 * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
\r
104 * http://www.unicode.org/notes/tn5/#FCD
\r
106 * ICU collation performs either NFD or FCD normalization automatically if
\r
107 * normalization is turned on for the collator object. Beyond collation and
\r
108 * string search, normalized strings may be useful for string equivalence
\r
109 * comparisons, transliteration/transcription, unique representations, etc.
\r
111 * The W3C generally recommends exchanging texts in NFC.
\r
112 * Note also that most legacy character encodings use only precomposed forms and
\r
113 * often do not encode any combining marks by themselves. For conversion to such
\r
114 * character encodings the Unicode text needs to be normalized to NFC.
\r
115 * For more usage examples, see the Unicode Standard Annex.
\r
117 * Note: The Normalizer class also provides API for iterative normalization.
\r
118 * While the setIndex() and getIndex() refer to indices in the
\r
119 * underlying Unicode input text, the next() and previous() methods
\r
120 * iterate through characters in the normalized output.
\r
121 * This means that there is not necessarily a one-to-one correspondence
\r
122 * between characters returned by next() and previous() and the indices
\r
123 * passed to and returned from setIndex() and getIndex().
\r
124 * It is for this reason that Normalizer does not implement the CharacterIterator interface.
\r
public final class Normalizer implements Cloneable {
    // The input text and our position in it
    private UCharacterIterator text;
    // Delegate that performs the actual normalization for the selected Mode/options.
    private Normalizer2 norm2;
    // Normalization options (e.g. UNICODE_3_2) as passed to the constructor.
    private int options;

    // The normalization buffer is the result of normalization
    // of the source in [currentIndex..nextIndex[ .
    private int currentIndex;
    private int nextIndex;

    // A buffer for holding intermediate results
    private StringBuilder buffer;
    // Current iteration position within 'buffer'.
    private int bufferPos;
\r
144 // Helper classes to defer loading of normalization data.
\r
145 private static final class ModeImpl {
\r
146 private ModeImpl(Normalizer2 n2) {
\r
149 private final Normalizer2 normalizer2;
\r
151 private static final class NFDModeImpl {
\r
152 private static final ModeImpl INSTANCE =
\r
153 new ModeImpl(Norm2AllModes.getNFCInstance().decomp);
\r
155 private static final class NFKDModeImpl {
\r
156 private static final ModeImpl INSTANCE =
\r
157 new ModeImpl(Norm2AllModes.getNFKCInstance().decomp);
\r
159 private static final class NFCModeImpl {
\r
160 private static final ModeImpl INSTANCE =
\r
161 new ModeImpl(Norm2AllModes.getNFCInstance().comp);
\r
163 private static final class NFKCModeImpl {
\r
164 private static final ModeImpl INSTANCE =
\r
165 new ModeImpl(Norm2AllModes.getNFKCInstance().comp);
\r
167 private static final class FCDModeImpl {
\r
168 private static final ModeImpl INSTANCE =
\r
169 new ModeImpl(Norm2AllModes.getFCDNormalizer2());
\r
172 private static final class Unicode32 {
\r
173 private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
\r
175 private static final class NFD32ModeImpl {
\r
176 private static final ModeImpl INSTANCE =
\r
177 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().decomp,
\r
178 Unicode32.INSTANCE));
\r
180 private static final class NFKD32ModeImpl {
\r
181 private static final ModeImpl INSTANCE =
\r
182 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().decomp,
\r
183 Unicode32.INSTANCE));
\r
185 private static final class NFC32ModeImpl {
\r
186 private static final ModeImpl INSTANCE =
\r
187 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFCInstance().comp,
\r
188 Unicode32.INSTANCE));
\r
190 private static final class NFKC32ModeImpl {
\r
191 private static final ModeImpl INSTANCE =
\r
192 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getNFKCInstance().comp,
\r
193 Unicode32.INSTANCE));
\r
195 private static final class FCD32ModeImpl {
\r
196 private static final ModeImpl INSTANCE =
\r
197 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
\r
198 Unicode32.INSTANCE));
\r
    /**
     * Options bit set value to select Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * At most one Unicode version can be selected at a time.
     */
    public static final int UNICODE_3_2=0x20;

    /**
     * Constant indicating that the end of the iteration has been reached.
     * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
     */
    public static final int DONE = UCharacterIterator.DONE;
\r
217 * Constants for normalization modes.
\r
219 * The Mode class is not intended for public subclassing.
\r
220 * Only the Mode constants provided by the Normalizer class should be used,
\r
221 * and any fields or methods should not be called or overridden by users.
\r
224 public static abstract class Mode {
\r
227 * @deprecated This API is ICU internal only.
\r
229 protected abstract Normalizer2 getNormalizer2(int options);
\r
232 private static final class NONEMode extends Mode {
\r
233 protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
\r
235 private static final class NFDMode extends Mode {
\r
236 protected Normalizer2 getNormalizer2(int options) {
\r
237 return (options&UNICODE_3_2) != 0 ?
\r
238 NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
\r
241 private static final class NFKDMode extends Mode {
\r
242 protected Normalizer2 getNormalizer2(int options) {
\r
243 return (options&UNICODE_3_2) != 0 ?
\r
244 NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
\r
247 private static final class NFCMode extends Mode {
\r
248 protected Normalizer2 getNormalizer2(int options) {
\r
249 return (options&UNICODE_3_2) != 0 ?
\r
250 NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
\r
253 private static final class NFKCMode extends Mode {
\r
254 protected Normalizer2 getNormalizer2(int options) {
\r
255 return (options&UNICODE_3_2) != 0 ?
\r
256 NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
\r
259 private static final class FCDMode extends Mode {
\r
260 protected Normalizer2 getNormalizer2(int options) {
\r
261 return (options&UNICODE_3_2) != 0 ?
\r
262 FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
\r
    /**
     * No decomposition/composition.
     */
    public static final Mode NONE = new NONEMode();

    /**
     * Canonical decomposition.
     */
    public static final Mode NFD = new NFDMode();

    /**
     * Compatibility decomposition.
     */
    public static final Mode NFKD = new NFKDMode();

    /**
     * Canonical decomposition followed by canonical composition.
     */
    public static final Mode NFC = new NFCMode();

    /**
     * Default normalization.
     */
    public static final Mode DEFAULT = NFC;

    /**
     * Compatibility decomposition followed by canonical composition.
     */
    public static final Mode NFKC =new NFKCMode();

    /**
     * "Fast C or D" form.
     */
    public static final Mode FCD = new FCDMode();
\r
    /**
     * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
     * and the static {@link #normalize normalize} method. This value tells
     * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
     * from the underlying String or CharacterIterator. If you have code which
     * requires raw text at some times and normalized text at others, you can
     * use <tt>NO_OP</tt> for the cases where you want raw text, rather
     * than having a separate code path that bypasses <tt>Normalizer</tt>
     * altogether.
     * @deprecated ICU 2.8. Use Normalizer.NONE
     */
    public static final Mode NO_OP = NONE;

    /**
     * Canonical decomposition followed by canonical composition. Used with the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be
     * performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>C</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFC
     */
    public static final Mode COMPOSE = NFC;

    /**
     * Compatibility decomposition followed by canonical composition.
     * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be
     * performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>KC</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFKC
     */
    public static final Mode COMPOSE_COMPAT = NFKC;

    /**
     * Canonical decomposition. This value is passed to the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>D</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFD
     */
    public static final Mode DECOMP = NFD;

    /**
     * Compatibility decomposition. This value is passed to the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
     * Form</a> <b>KD</b>.
     * @deprecated ICU 2.8. Use Normalizer.NFKD
     */
    public static final Mode DECOMP_COMPAT = NFKD;
\r
    /**
     * Option to disable Hangul/Jamo composition and decomposition.
     * This option applies to Korean text,
     * which can be represented either in the Jamo alphabet or in Hangul
     * characters, which are really just two or three Jamo combined
     * into one visual glyph. Since Jamo takes up more storage space than
     * Hangul, applications that process only Hangul text may wish to turn
     * this option on when decomposing text.
     * <p>
     * The Unicode standard treats Hangul to Jamo conversion as a
     * canonical decomposition, so this option must be turned <b>off</b> if you
     * wish to transform strings into one of the standard
     * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
     * Unicode Normalization Forms</a>.
     * @deprecated ICU 2.8. This option is no longer supported.
     */
    public static final int IGNORE_HANGUL = 0x0001;
\r
417 * Result values for quickCheck().
\r
418 * For details see Unicode Technical Report 15.
\r
421 public static final class QuickCheckResult{
\r
422 //private int resultValue;
\r
423 private QuickCheckResult(int value) {
\r
424 //resultValue=value;
\r
428 * Indicates that string is not in the normalized format
\r
431 public static final QuickCheckResult NO = new QuickCheckResult(0);
\r
434 * Indicates that string is in the normalized format
\r
437 public static final QuickCheckResult YES = new QuickCheckResult(1);
\r
440 * Indicates it cannot be determined if string is in the normalized
\r
441 * format without further thorough checks.
\r
444 public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
\r
    /**
     * Option bit for compare:
     * Case sensitively compare the strings
     */
    public static final int FOLD_CASE_DEFAULT = UCharacter.FOLD_CASE_DEFAULT;

    /**
     * Option bit for compare:
     * Both input strings are assumed to fulfill FCD conditions.
     */
    public static final int INPUT_IS_FCD = 0x20000;

    /**
     * Option bit for compare:
     * Perform case-insensitive comparison.
     */
    public static final int COMPARE_IGNORE_CASE = 0x10000;

    /**
     * Option bit for compare:
     * Compare strings in code point order instead of code unit order.
     */
    public static final int COMPARE_CODE_POINT_ORDER = 0x8000;

    /**
     * Option value for case folding: exclude the mappings for dotted I
     * and dotless i marked with 'I' in CaseFolding.txt.
     */
    public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;

    /**
     * Lowest-order bit number of compare() options bits corresponding to
     * normalization options bits.
     * <p>
     * The options parameter for compare() uses most bits for
     * itself and for various comparison and folding flags.
     * The most significant bits, however, are shifted down and passed on
     * to the normalization implementation.
     * (That is, from compare(..., options, ...),
     * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
     * internal normalization functions.)
     */
    public static final int COMPARE_NORM_OPTIONS_SHIFT = 20;
\r
498 //-------------------------------------------------------------------------
\r
499 // Iterator constructors
\r
500 //-------------------------------------------------------------------------
\r
503 * Creates a new <tt>Normalizer</tt> object for iterating over the
\r
504 * normalized form of a given string.
\r
506 * The <tt>options</tt> parameter specifies which optional
\r
507 * <tt>Normalizer</tt> features are to be enabled for this object.
\r
509 * @param str The string to be normalized. The normalization
\r
510 * will start at the beginning of the string.
\r
512 * @param mode The normalization mode.
\r
514 * @param opt Any optional features to be enabled.
\r
515 * Currently the only available option is {@link #UNICODE_3_2}.
\r
516 * If you want the default behavior corresponding to one of the
\r
517 * standard Unicode Normalization Forms, use 0 for this argument.
\r
520 public Normalizer(String str, Mode mode, int opt) {
\r
521 this.text = UCharacterIterator.getInstance(str);
\r
524 norm2 = mode.getNormalizer2(opt);
\r
525 buffer = new StringBuilder();
\r
529 * Creates a new <tt>Normalizer</tt> object for iterating over the
\r
530 * normalized form of the given text.
\r
532 * @param iter The input text to be normalized. The normalization
\r
533 * will start at the beginning of the string.
\r
535 * @param mode The normalization mode.
\r
537 * @param opt Any optional features to be enabled.
\r
538 * Currently the only available option is {@link #UNICODE_3_2}.
\r
539 * If you want the default behavior corresponding to one of the
\r
540 * standard Unicode Normalization Forms, use 0 for this argument.
\r
543 public Normalizer(CharacterIterator iter, Mode mode, int opt) {
\r
544 this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
\r
546 this.options = opt;
\r
547 norm2 = mode.getNormalizer2(opt);
\r
548 buffer = new StringBuilder();
\r
552 * Creates a new <tt>Normalizer</tt> object for iterating over the
\r
553 * normalized form of the given text.
\r
555 * @param iter The input text to be normalized. The normalization
\r
556 * will start at the beginning of the string.
\r
558 * @param mode The normalization mode.
\r
559 * @param options The normalization options, ORed together (0 for no options).
\r
562 public Normalizer(UCharacterIterator iter, Mode mode, int options) {
\r
564 this.text = (UCharacterIterator)iter.clone();
\r
566 this.options = options;
\r
567 norm2 = mode.getNormalizer2(options);
\r
568 buffer = new StringBuilder();
\r
569 } catch (CloneNotSupportedException e) {
\r
570 throw new IllegalStateException(e.toString());
\r
575 * Clones this <tt>Normalizer</tt> object. All properties of this
\r
576 * object are duplicated in the new object, including the cloning of any
\r
577 * {@link CharacterIterator} that was passed in to the constructor
\r
578 * or to {@link #setText(CharacterIterator) setText}.
\r
579 * However, the text storage underlying
\r
580 * the <tt>CharacterIterator</tt> is not duplicated unless the
\r
581 * iterator's <tt>clone</tt> method does so.
\r
584 public Object clone() {
\r
586 Normalizer copy = (Normalizer) super.clone();
\r
587 copy.text = (UCharacterIterator) text.clone();
\r
589 copy.options = options;
\r
590 copy.norm2 = norm2;
\r
591 copy.buffer = new StringBuilder(buffer);
\r
592 copy.bufferPos = bufferPos;
\r
593 copy.currentIndex = currentIndex;
\r
594 copy.nextIndex = nextIndex;
\r
597 catch (CloneNotSupportedException e) {
\r
598 throw new IllegalStateException(e);
\r
602 //--------------------------------------------------------------------------
\r
603 // Static Utility methods
\r
604 //--------------------------------------------------------------------------
\r
606 private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
\r
607 return (compat ? NFKC : NFC).getNormalizer2(options);
\r
609 private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
\r
610 return (compat ? NFKD : NFD).getNormalizer2(options);
\r
614 * Compose a string.
\r
615 * The string will be composed to according to the specified mode.
\r
616 * @param str The string to compose.
\r
617 * @param compat If true the string will be composed according to
\r
618 * NFKC rules and if false will be composed according to
\r
620 * @return String The composed string
\r
623 public static String compose(String str, boolean compat) {
\r
624 return compose(str,compat,0);
\r
628 * Compose a string.
\r
629 * The string will be composed to according to the specified mode.
\r
630 * @param str The string to compose.
\r
631 * @param compat If true the string will be composed according to
\r
632 * NFKC rules and if false will be composed according to
\r
634 * @param options The only recognized option is UNICODE_3_2
\r
635 * @return String The composed string
\r
638 public static String compose(String str, boolean compat, int options) {
\r
639 return getComposeNormalizer2(compat, options).normalize(str);
\r
643 * Compose a string.
\r
644 * The string will be composed to according to the specified mode.
\r
645 * @param source The char array to compose.
\r
646 * @param target A char buffer to receive the normalized text.
\r
647 * @param compat If true the char array will be composed according to
\r
648 * NFKC rules and if false will be composed according to
\r
650 * @param options The normalization options, ORed together (0 for no options).
\r
651 * @return int The total buffer size needed;if greater than length of
\r
652 * result, the output was truncated.
\r
653 * @exception IndexOutOfBoundsException if target.length is less than the
\r
657 public static int compose(char[] source,char[] target, boolean compat, int options) {
\r
658 return compose(source, 0, source.length, target, 0, target.length, compat, options);
\r
662 * Compose a string.
\r
663 * The string will be composed to according to the specified mode.
\r
664 * @param src The char array to compose.
\r
665 * @param srcStart Start index of the source
\r
666 * @param srcLimit Limit index of the source
\r
667 * @param dest The char buffer to fill in
\r
668 * @param destStart Start index of the destination buffer
\r
669 * @param destLimit End index of the destination buffer
\r
670 * @param compat If true the char array will be composed according to
\r
671 * NFKC rules and if false will be composed according to
\r
673 * @param options The normalization options, ORed together (0 for no options).
\r
674 * @return int The total buffer size needed;if greater than length of
\r
675 * result, the output was truncated.
\r
676 * @exception IndexOutOfBoundsException if target.length is less than the
\r
680 public static int compose(char[] src,int srcStart, int srcLimit,
\r
681 char[] dest,int destStart, int destLimit,
\r
682 boolean compat, int options) {
\r
683 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
\r
684 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
\r
685 getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
\r
686 return app.length();
\r
690 * Decompose a string.
\r
691 * The string will be decomposed to according to the specified mode.
\r
692 * @param str The string to decompose.
\r
693 * @param compat If true the string will be decomposed according to NFKD
\r
694 * rules and if false will be decomposed according to NFD
\r
696 * @return String The decomposed string
\r
699 public static String decompose(String str, boolean compat) {
\r
700 return decompose(str,compat,0);
\r
704 * Decompose a string.
\r
705 * The string will be decomposed to according to the specified mode.
\r
706 * @param str The string to decompose.
\r
707 * @param compat If true the string will be decomposed according to NFKD
\r
708 * rules and if false will be decomposed according to NFD
\r
710 * @param options The normalization options, ORed together (0 for no options).
\r
711 * @return String The decomposed string
\r
714 public static String decompose(String str, boolean compat, int options) {
\r
715 return getDecomposeNormalizer2(compat, options).normalize(str);
\r
719 * Decompose a string.
\r
720 * The string will be decomposed to according to the specified mode.
\r
721 * @param source The char array to decompose.
\r
722 * @param target A char buffer to receive the normalized text.
\r
723 * @param compat If true the char array will be decomposed according to NFKD
\r
724 * rules and if false will be decomposed according to
\r
726 * @return int The total buffer size needed;if greater than length of
\r
727 * result,the output was truncated.
\r
728 * @param options The normalization options, ORed together (0 for no options).
\r
729 * @exception IndexOutOfBoundsException if the target capacity is less than
\r
730 * the required length
\r
733 public static int decompose(char[] source,char[] target, boolean compat, int options) {
\r
734 return decompose(source, 0, source.length, target, 0, target.length, compat, options);
\r
738 * Decompose a string.
\r
739 * The string will be decomposed to according to the specified mode.
\r
740 * @param src The char array to compose.
\r
741 * @param srcStart Start index of the source
\r
742 * @param srcLimit Limit index of the source
\r
743 * @param dest The char buffer to fill in
\r
744 * @param destStart Start index of the destination buffer
\r
745 * @param destLimit End index of the destination buffer
\r
746 * @param compat If true the char array will be decomposed according to NFKD
\r
747 * rules and if false will be decomposed according to
\r
749 * @param options The normalization options, ORed together (0 for no options).
\r
750 * @return int The total buffer size needed;if greater than length of
\r
751 * result,the output was truncated.
\r
752 * @exception IndexOutOfBoundsException if the target capacity is less than
\r
753 * the required length
\r
756 public static int decompose(char[] src,int srcStart, int srcLimit,
\r
757 char[] dest,int destStart, int destLimit,
\r
758 boolean compat, int options) {
\r
759 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
\r
760 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
\r
761 getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
\r
762 return app.length();
\r
766 * Normalizes a <tt>String</tt> using the given normalization operation.
\r
768 * The <tt>options</tt> parameter specifies which optional
\r
769 * <tt>Normalizer</tt> features are to be enabled for this operation.
\r
770 * Currently the only available option is {@link #UNICODE_3_2}.
\r
771 * If you want the default behavior corresponding to one of the standard
\r
772 * Unicode Normalization Forms, use 0 for this argument.
\r
774 * @param str the input string to be normalized.
\r
775 * @param mode the normalization mode
\r
776 * @param options the optional features to be enabled.
\r
777 * @return String the normalized string
\r
780 public static String normalize(String str, Mode mode, int options) {
\r
781 return mode.getNormalizer2(options).normalize(str);
\r
785 * Normalize a string.
\r
786 * The string will be normalized according to the specified normalization
\r
787 * mode and options.
\r
788 * @param src The string to normalize.
\r
789 * @param mode The normalization mode; one of Normalizer.NONE,
\r
790 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
\r
791 * Normalizer.NFKD, Normalizer.DEFAULT
\r
792 * @return the normalized string
\r
796 public static String normalize(String src,Mode mode) {
\r
797 return normalize(src, mode, 0);
\r
800 * Normalize a string.
\r
801 * The string will be normalized according to the specified normalization
\r
802 * mode and options.
\r
803 * @param source The char array to normalize.
\r
804 * @param target A char buffer to receive the normalized text.
\r
805 * @param mode The normalization mode; one of Normalizer.NONE,
\r
806 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
\r
807 * Normalizer.NFKD, Normalizer.DEFAULT
\r
808 * @param options The normalization options, ORed together (0 for no options).
\r
809 * @return int The total buffer size needed;if greater than length of
\r
810 * result, the output was truncated.
\r
811 * @exception IndexOutOfBoundsException if the target capacity is less
\r
812 * than the required length
\r
815 public static int normalize(char[] source,char[] target, Mode mode, int options) {
\r
816 return normalize(source,0,source.length,target,0,target.length,mode, options);
\r
820 * Normalize a string.
\r
821 * The string will be normalized according to the specified normalization
\r
822 * mode and options.
\r
823 * @param src The char array to compose.
\r
824 * @param srcStart Start index of the source
\r
825 * @param srcLimit Limit index of the source
\r
826 * @param dest The char buffer to fill in
\r
827 * @param destStart Start index of the destination buffer
\r
828 * @param destLimit End index of the destination buffer
\r
829 * @param mode The normalization mode; one of Normalizer.NONE,
\r
830 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
\r
831 * Normalizer.NFKD, Normalizer.DEFAULT
\r
832 * @param options The normalization options, ORed together (0 for no options).
\r
833 * @return int The total buffer size needed;if greater than length of
\r
834 * result, the output was truncated.
\r
835 * @exception IndexOutOfBoundsException if the target capacity is
\r
836 * less than the required length
\r
839 public static int normalize(char[] src,int srcStart, int srcLimit,
\r
840 char[] dest,int destStart, int destLimit,
\r
841 Mode mode, int options) {
\r
842 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
\r
843 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
\r
844 mode.getNormalizer2(options).normalize(srcBuffer, app);
\r
845 return app.length();
\r
849 * Normalize a codepoint according to the given mode
\r
850 * @param char32 The input string to be normalized.
\r
851 * @param mode The normalization mode
\r
852 * @param options Options for use with exclusion set and tailored Normalization
\r
853 * The only option that is currently recognized is UNICODE_3_2
\r
854 * @return String The normalized string
\r
856 * @see #UNICODE_3_2
\r
858 public static String normalize(int char32, Mode mode, int options) {
\r
859 if(mode == NFD && options == 0) {
\r
860 String decomposition =
\r
861 Norm2AllModes.getNFCInstance().impl.getDecomposition(char32);
\r
862 if(decomposition == null) {
\r
863 decomposition = UTF16.valueOf(char32);
\r
865 return decomposition;
\r
867 return normalize(UTF16.valueOf(char32), mode, options);
\r
871 * Convenience method to normalize a codepoint according to the given mode
\r
872 * @param char32 The input string to be normalized.
\r
873 * @param mode The normalization mode
\r
874 * @return String The normalized string
\r
877 public static String normalize(int char32, Mode mode) {
\r
878 return normalize(char32, mode, 0);
\r
882 * Convenience method.
\r
884 * @param source string for determining if it is in a normalized format
\r
885 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
886 * Normalizer.NFKC,Normalizer.NFKD)
\r
887 * @return Return code to specify if the text is normalized or not
\r
888 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
\r
891 public static QuickCheckResult quickCheck(String source, Mode mode) {
\r
892 return quickCheck(source, mode, 0);
\r
896 * Performing quick check on a string, to quickly determine if the string is
\r
897 * in a particular normalization format.
\r
898 * Three types of result can be returned Normalizer.YES, Normalizer.NO or
\r
899 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
\r
900 * string is in the desired normalized format, Normalizer.NO determines that
\r
901 * argument string is not in the desired normalized format. A
\r
902 * Normalizer.MAYBE result indicates that a more thorough check is required,
\r
903 * the user may have to put the string in its normalized form and compare
\r
906 * @param source string for determining if it is in a normalized format
\r
907 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
908 * Normalizer.NFKC,Normalizer.NFKD)
\r
909 * @param options Options for use with exclusion set and tailored Normalization
\r
910 * The only option that is currently recognized is UNICODE_3_2
\r
911 * @return Return code to specify if the text is normalized or not
\r
912 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
\r
915 public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
\r
916 return mode.getNormalizer2(options).quickCheck(source);
\r
920 * Convenience method.
\r
922 * @param source Array of characters for determining if it is in a
\r
923 * normalized format
\r
924 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
925 * Normalizer.NFKC,Normalizer.NFKD)
\r
926 * @param options Options for use with exclusion set and tailored Normalization
\r
927 * The only option that is currently recognized is UNICODE_3_2
\r
928 * @return Return code to specify if the text is normalized or not
\r
929 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
\r
932 public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
\r
933 return quickCheck(source, 0, source.length, mode, options);
\r
937 * Performing quick check on a string, to quickly determine if the string is
\r
938 * in a particular normalization format.
\r
939 * Three types of result can be returned Normalizer.YES, Normalizer.NO or
\r
940 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
\r
941 * string is in the desired normalized format, Normalizer.NO determines that
\r
942 * argument string is not in the desired normalized format. A
\r
943 * Normalizer.MAYBE result indicates that a more thorough check is required,
\r
944 * the user may have to put the string in its normalized form and compare
\r
947 * @param source string for determining if it is in a normalized format
\r
948 * @param start the start index of the source
\r
949 * @param limit the limit index of the source it is equal to the length
\r
950 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD,
\r
951 * Normalizer.NFKC,Normalizer.NFKD)
\r
952 * @param options Options for use with exclusion set and tailored Normalization
\r
953 * The only option that is currently recognized is UNICODE_3_2
\r
954 * @return Return code to specify if the text is normalized or not
\r
955 * (Normalizer.YES, Normalizer.NO or
\r
956 * Normalizer.MAYBE)
\r
960 public static QuickCheckResult quickCheck(char[] source,int start,
\r
961 int limit, Mode mode,int options) {
\r
962 CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
\r
963 return mode.getNormalizer2(options).quickCheck(srcBuffer);
\r
967 * Test if a string is in a given normalization form.
\r
968 * This is semantically equivalent to source.equals(normalize(source, mode)).
\r
970 * Unlike quickCheck(), this function returns a definitive result,
\r
972 * For NFD, NFKD, and FCD, both functions work exactly the same.
\r
973 * For NFC and NFKC where quickCheck may return "maybe", this function will
\r
974 * perform further tests to arrive at a true/false result.
\r
975 * @param src The input array of characters to be checked to see if
\r
977 * @param start The strart index in the source
\r
978 * @param limit The limit index in the source
\r
979 * @param mode the normalization mode
\r
980 * @param options Options for use with exclusion set and tailored Normalization
\r
981 * The only option that is currently recognized is UNICODE_3_2
\r
982 * @return Boolean value indicating whether the source string is in the
\r
983 * "mode" normalization form
\r
986 public static boolean isNormalized(char[] src,int start,
\r
987 int limit, Mode mode,
\r
989 CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
\r
990 return mode.getNormalizer2(options).isNormalized(srcBuffer);
\r
994 * Test if a string is in a given normalization form.
\r
995 * This is semantically equivalent to source.equals(normalize(source, mode)).
\r
997 * Unlike quickCheck(), this function returns a definitive result,
\r
999 * For NFD, NFKD, and FCD, both functions work exactly the same.
\r
1000 * For NFC and NFKC where quickCheck may return "maybe", this function will
\r
1001 * perform further tests to arrive at a true/false result.
\r
1002 * @param str the input string to be checked to see if it is
\r
1004 * @param mode the normalization mode
\r
1005 * @param options Options for use with exclusion set and tailored Normalization
\r
1006 * The only option that is currently recognized is UNICODE_3_2
\r
1007 * @see #isNormalized
\r
1010 public static boolean isNormalized(String str, Mode mode, int options) {
\r
1011 return mode.getNormalizer2(options).isNormalized(str);
\r
1015 * Convenience Method
\r
1016 * @param char32 the input code point to be checked to see if it is
\r
1018 * @param mode the normalization mode
\r
1019 * @param options Options for use with exclusion set and tailored Normalization
\r
1020 * The only option that is currently recognized is UNICODE_3_2
\r
1022 * @see #isNormalized
\r
1025 public static boolean isNormalized(int char32, Mode mode,int options) {
\r
1026 return isNormalized(UTF16.valueOf(char32), mode, options);
\r
1030 * Compare two strings for canonical equivalence.
\r
1031 * Further options include case-insensitive comparison and
\r
1032 * code point order (as opposed to code unit order).
\r
1034 * Canonical equivalence between two strings is defined as their normalized
\r
1035 * forms (NFD or NFC) being identical.
\r
1036 * This function compares strings incrementally instead of normalizing
\r
1037 * (and optionally case-folding) both strings entirely,
\r
1038 * improving performance significantly.
\r
1040 * Bulk normalization is only necessary if the strings do not fulfill the
\r
1041 * FCD conditions. Only in this case, and only if the strings are relatively
\r
1042 * long, is memory allocated temporarily.
\r
1043 * For FCD strings and short non-FCD strings there is no memory allocation.
\r
1045 * Semantically, this is equivalent to
\r
1046 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
\r
1047 * where code point order and foldCase are all optional.
\r
1049 * @param s1 First source character array.
\r
1050 * @param s1Start start index of source
\r
1051 * @param s1Limit limit of the source
\r
1053 * @param s2 Second source character array.
\r
1054 * @param s2Start start index of the source
\r
1055 * @param s2Limit limit of the source
\r
1057 * @param options A bit set of options:
\r
1058 * - FOLD_CASE_DEFAULT or 0 is used for default options:
\r
1059 * Case-sensitive comparison in code unit order, and the input strings
\r
1060 * are quick-checked for FCD.
\r
1063 * Set if the caller knows that both s1 and s2 fulfill the FCD
\r
1064 * conditions.If not set, the function will quickCheck for FCD
\r
1065 * and normalize if necessary.
\r
1067 * - COMPARE_CODE_POINT_ORDER
\r
1068 * Set to choose code point order instead of code unit order
\r
1070 * - COMPARE_IGNORE_CASE
\r
1071 * Set to compare strings case-insensitively using case folding,
\r
1072 * instead of case-sensitively.
\r
1073 * If set, then the following case folding options are used.
\r
1076 * @return <0 or 0 or >0 as usual for string comparisons
\r
1082 public static int compare(char[] s1, int s1Start, int s1Limit,
\r
1083 char[] s2, int s2Start, int s2Limit,
\r
1085 if( s1==null || s1Start<0 || s1Limit<0 ||
\r
1086 s2==null || s2Start<0 || s2Limit<0 ||
\r
1087 s1Limit<s1Start || s2Limit<s2Start
\r
1089 throw new IllegalArgumentException();
\r
1091 return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
\r
1092 CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
\r
1097 * Compare two strings for canonical equivalence.
\r
1098 * Further options include case-insensitive comparison and
\r
1099 * code point order (as opposed to code unit order).
\r
1101 * Canonical equivalence between two strings is defined as their normalized
\r
1102 * forms (NFD or NFC) being identical.
\r
1103 * This function compares strings incrementally instead of normalizing
\r
1104 * (and optionally case-folding) both strings entirely,
\r
1105 * improving performance significantly.
\r
1107 * Bulk normalization is only necessary if the strings do not fulfill the
\r
1108 * FCD conditions. Only in this case, and only if the strings are relatively
\r
1109 * long, is memory allocated temporarily.
\r
1110 * For FCD strings and short non-FCD strings there is no memory allocation.
\r
1112 * Semantically, this is equivalent to
\r
1113 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
\r
1114 * where code point order and foldCase are all optional.
\r
1116 * @param s1 First source string.
\r
1117 * @param s2 Second source string.
\r
1119 * @param options A bit set of options:
\r
1120 * - FOLD_CASE_DEFAULT or 0 is used for default options:
\r
1121 * Case-sensitive comparison in code unit order, and the input strings
\r
1122 * are quick-checked for FCD.
\r
1125 * Set if the caller knows that both s1 and s2 fulfill the FCD
\r
1126 * conditions. If not set, the function will quickCheck for FCD
\r
1127 * and normalize if necessary.
\r
1129 * - COMPARE_CODE_POINT_ORDER
\r
1130 * Set to choose code point order instead of code unit order
\r
1132 * - COMPARE_IGNORE_CASE
\r
1133 * Set to compare strings case-insensitively using case folding,
\r
1134 * instead of case-sensitively.
\r
1135 * If set, then the following case folding options are used.
\r
1137 * @return <0 or 0 or >0 as usual for string comparisons
\r
1143 public static int compare(String s1, String s2, int options) {
\r
1144 return internalCompare(s1, s2, options);
\r
1148 * Compare two strings for canonical equivalence.
\r
1149 * Further options include case-insensitive comparison and
\r
1150 * code point order (as opposed to code unit order).
\r
1151 * Convenience method.
\r
1153 * @param s1 First source string.
\r
1154 * @param s2 Second source string.
\r
1156 * @param options A bit set of options:
\r
1157 * - FOLD_CASE_DEFAULT or 0 is used for default options:
\r
1158 * Case-sensitive comparison in code unit order, and the input strings
\r
1159 * are quick-checked for FCD.
\r
1162 * Set if the caller knows that both s1 and s2 fulfill the FCD
\r
1163 * conditions. If not set, the function will quickCheck for FCD
\r
1164 * and normalize if necessary.
\r
1166 * - COMPARE_CODE_POINT_ORDER
\r
1167 * Set to choose code point order instead of code unit order
\r
1169 * - COMPARE_IGNORE_CASE
\r
1170 * Set to compare strings case-insensitively using case folding,
\r
1171 * instead of case-sensitively.
\r
1172 * If set, then the following case folding options are used.
\r
1174 * @return <0 or 0 or >0 as usual for string comparisons
\r
1180 public static int compare(char[] s1, char[] s2, int options) {
\r
1181 return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
\r
1185 * Convenience method that can have faster implementation
\r
1186 * by not allocating buffers.
\r
1187 * @param char32a the first code point to be checked against the
\r
1188 * @param char32b the second code point
\r
1189 * @param options A bit set of options
\r
1192 public static int compare(int char32a, int char32b, int options) {
\r
1193 return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
\r
1197 * Convenience method that can have faster implementation
\r
1198 * by not allocating buffers.
\r
1199 * @param char32a the first code point to be checked against
\r
1200 * @param str2 the second string
\r
1201 * @param options A bit set of options
\r
1204 public static int compare(int char32a, String str2, int options) {
\r
1205 return internalCompare(UTF16.valueOf(char32a), str2, options);
\r
1208 /* Concatenation of normalized strings --------------------------------- */
\r
1210 * Concatenate normalized strings, making sure that the result is normalized
\r
1213 * If both the left and the right strings are in
\r
1214 * the normalization form according to "mode",
\r
1215 * then the result will be
\r
1218 * dest=normalize(left+right, mode)
\r
1221 * With the input strings already being normalized,
\r
1222 * this function will use next() and previous()
\r
1223 * to find the adjacent end pieces of the input strings.
\r
1224 * Only the concatenation of these end pieces will be normalized and
\r
1225 * then concatenated with the remaining parts of the input strings.
\r
1227 * It is allowed to have dest==left to avoid copying the entire left string.
\r
1229 * @param left Left source array, may be same as dest.
\r
1230 * @param leftStart start in the left array.
\r
1231 * @param leftLimit limit in the left array (==length)
\r
1232 * @param right Right source array.
\r
1233 * @param rightStart start in the right array.
\r
1234 * @param rightLimit limit in the right array (==length)
\r
1235 * @param dest The output buffer; can be null if destStart==destLimit==0
\r
1236 * for pure preflighting.
\r
1237 * @param destStart start in the destination array
\r
1238 * @param destLimit limit in the destination array (==length)
\r
1239 * @param mode The normalization mode.
\r
1240 * @param options The normalization options, ORed together (0 for no options).
\r
1241 * @return Length of output (number of chars) when successful or
\r
1242 * IndexOutOfBoundsException
\r
1243 * @exception IndexOutOfBoundsException whose message has the string
\r
1244 * representation of destination capacity required.
\r
1248 * @exception IndexOutOfBoundsException if target capacity is less than the
\r
1252 public static int concatenate(char[] left, int leftStart, int leftLimit,
\r
1253 char[] right, int rightStart, int rightLimit,
\r
1254 char[] dest, int destStart, int destLimit,
\r
1255 Normalizer.Mode mode, int options) {
\r
1256 if(dest == null) {
\r
1257 throw new IllegalArgumentException();
\r
1260 /* check for overlapping right and destination */
\r
1261 if (right == dest && rightStart < destLimit && destStart < rightLimit) {
\r
1262 throw new IllegalArgumentException("overlapping right and dst ranges");
\r
1265 /* allow left==dest */
\r
1266 StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
\r
1267 destBuilder.append(left, leftStart, leftLimit-leftStart);
\r
1268 CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
\r
1269 mode.getNormalizer2(options).append(destBuilder, rightBuffer);
\r
1270 int destLength=destBuilder.length();
\r
1271 if(destLength<=(destLimit-destStart)) {
\r
1272 destBuilder.getChars(0, destLength, dest, destStart);
\r
1273 return destLength;
\r
1275 throw new IndexOutOfBoundsException(Integer.toString(destLength));
\r
1280 * Concatenate normalized strings, making sure that the result is normalized
\r
1283 * If both the left and the right strings are in
\r
1284 * the normalization form according to "mode",
\r
1285 * then the result will be
\r
1288 * dest=normalize(left+right, mode)
\r
1291 * For details see concatenate
\r
1293 * @param left Left source string.
\r
1294 * @param right Right source string.
\r
1295 * @param mode The normalization mode.
\r
1296 * @param options The normalization options, ORed together (0 for no options).
\r
1299 * @see #concatenate
\r
1303 * @see #concatenate
\r
1306 public static String concatenate(char[] left, char[] right,Mode mode, int options) {
\r
1307 StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
\r
1308 return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
\r
1312 * Concatenate normalized strings, making sure that the result is normalized
\r
1315 * If both the left and the right strings are in
\r
1316 * the normalization form according to "mode",
\r
1317 * then the result will be
\r
1320 * dest=normalize(left+right, mode)
\r
1323 * With the input strings already being normalized,
\r
1324 * this function will use next() and previous()
\r
1325 * to find the adjacent end pieces of the input strings.
\r
1326 * Only the concatenation of these end pieces will be normalized and
\r
1327 * then concatenated with the remaining parts of the input strings.
\r
1329 * @param left Left source string.
\r
1330 * @param right Right source string.
\r
1331 * @param mode The normalization mode.
\r
1332 * @param options The normalization options, ORed together (0 for no options).
\r
1335 * @see #concatenate
\r
1339 * @see #concatenate
\r
1342 public static String concatenate(String left, String right, Mode mode, int options) {
\r
1343 StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
\r
1344 return mode.getNormalizer2(options).append(dest, right).toString();
\r
1348 * Gets the FC_NFKC closure value.
\r
1349 * @param c The code point whose closure value is to be retrieved
\r
1350 * @param dest The char array to receive the closure value
\r
1351 * @return the length of the closure value; 0 if there is none
\r
1354 public static int getFC_NFKC_Closure(int c,char[] dest) {
\r
1355 String closure=getFC_NFKC_Closure(c);
\r
1356 int length=closure.length();
\r
1357 if(length!=0 && dest!=null && length<=dest.length) {
\r
1358 closure.getChars(0, length, dest, 0);
\r
1363 * Gets the FC_NFKC closure value.
\r
1364 * @param c The code point whose closure value is to be retrieved
\r
1365 * @return String representation of the closure value; "" if there is none
\r
1368 public static String getFC_NFKC_Closure(int c) {
\r
1369 // Compute the FC_NFKC_Closure on the fly:
\r
1370 // We have the API for complete coverage of Unicode properties, although
\r
1371 // this value by itself is not useful via API.
\r
1372 // (What could be useful is a custom normalization table that combines
\r
1373 // case folding and NFKC.)
\r
1374 // For the derivation, see Unicode's DerivedNormalizationProps.txt.
\r
1375 Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
\r
1378 csp=UCaseProps.getSingleton();
\r
1379 } catch(IOException e) {
\r
1380 throw new RuntimeException(e);
\r
1382 // first: b = NFKC(Fold(a))
\r
1383 StringBuffer folded=new StringBuffer();
\r
1384 int folded1Length=csp.toFullFolding(c, folded, 0);
\r
1385 if(folded1Length<0) {
\r
1386 Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
\r
1387 if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
\r
1388 return ""; // c does not change at all under CaseFolding+NFKC
\r
1390 folded.appendCodePoint(c);
\r
1392 if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
\r
1393 folded.appendCodePoint(folded1Length);
\r
1396 String kc1=nfkc.normalize(folded);
\r
1397 // second: c = NFKC(Fold(b))
\r
1398 String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
\r
1399 // if (c != b) add the mapping from a to c
\r
1400 if(kc1.equals(kc2)) {
\r
1407 //-------------------------------------------------------------------------
\r
1409 //-------------------------------------------------------------------------
\r
1412 * Return the current character in the normalized text.
\r
1413 * @return The codepoint as an int
\r
1416 public int current() {
\r
1417 if(bufferPos<buffer.length() || nextNormalize()) {
\r
1418 return buffer.codePointAt(bufferPos);
\r
1425 * Return the next character in the normalized text and advance
\r
1426 * the iteration position by one. If the end
\r
1427 * of the text has already been reached, {@link #DONE} is returned.
\r
1428 * @return The codepoint as an int
\r
1431 public int next() {
\r
1432 if(bufferPos<buffer.length() || nextNormalize()) {
\r
1433 int c=buffer.codePointAt(bufferPos);
\r
1434 bufferPos+=Character.charCount(c);
\r
1443 * Return the previous character in the normalized text and decrement
\r
1444 * the iteration position by one. If the beginning
\r
1445 * of the text has already been reached, {@link #DONE} is returned.
\r
1446 * @return The codepoint as an int
\r
1449 public int previous() {
\r
1450 if(bufferPos>0 || previousNormalize()) {
\r
1451 int c=buffer.codePointBefore(bufferPos);
\r
1452 bufferPos-=Character.charCount(c);
\r
1460 * Reset the index to the beginning of the text.
\r
1461 * This is equivalent to setIndexOnly(startIndex)).
\r
1464 public void reset() {
\r
1465 text.setToStart();
\r
1466 currentIndex=nextIndex=0;
\r
1471 * Set the iteration position in the input text that is being normalized,
\r
1472 * without any immediate normalization.
\r
1473 * After setIndexOnly(), getIndex() will return the same index that is
\r
1476 * @param index the desired index in the input text.
\r
1479 public void setIndexOnly(int index) {
\r
1480 text.setIndex(index); // validates index
\r
1481 currentIndex=nextIndex=index;
\r
1486 * Set the iteration position in the input text that is being normalized
\r
1487 * and return the first normalized character at that position.
\r
1489 * <b>Note:</b> This method sets the position in the <em>input</em> text,
\r
1490 * while {@link #next} and {@link #previous} iterate through characters
\r
1491 * in the normalized <em>output</em>. This means that there is not
\r
1492 * necessarily a one-to-one correspondence between characters returned
\r
1493 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
\r
1494 * returned from <tt>setIndex</tt> and {@link #getIndex}.
\r
1496 * @param index the desired index in the input text.
\r
1498 * @return the first normalized character that is the result of iterating
\r
1499 * forward starting at the given index.
\r
1501 * @throws IllegalArgumentException if the given index is less than
\r
1502 * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
\r
1503 * @deprecated ICU 3.2
\r
1504 * @obsolete ICU 3.2
\r
1507 public int setIndex(int index) {
\r
1508 setIndexOnly(index);
\r
1513 * Retrieve the index of the start of the input text. This is the begin
\r
1514 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
\r
1515 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
\r
1516 * @deprecated ICU 2.2. Use startIndex() instead.
\r
1517 * @return The codepoint as an int
\r
1518 * @see #startIndex
\r
1520 public int getBeginIndex() {
\r
1525 * Retrieve the index of the end of the input text. This is the end index
\r
1526 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
\r
1527 * over which this <tt>Normalizer</tt> is iterating
\r
1528 * @deprecated ICU 2.2. Use endIndex() instead.
\r
1529 * @return The codepoint as an int
\r
1532 public int getEndIndex() {
\r
1533 return endIndex();
\r
1536 * Return the first character in the normalized text. This resets
\r
1537 * the <tt>Normalizer's</tt> position to the beginning of the text.
\r
1538 * @return The codepoint as an int
\r
1541 public int first() {
\r
1547 * Return the last character in the normalized text. This resets
\r
1548 * the <tt>Normalizer's</tt> position to be just before the
\r
1549 * the input text corresponding to that normalized character.
\r
1550 * @return The codepoint as an int
\r
1553 public int last() {
\r
1554 text.setToLimit();
\r
1555 currentIndex=nextIndex=text.getIndex();
\r
1557 return previous();
\r
1561 * Retrieve the current iteration position in the input text that is
\r
1562 * being normalized. This method is useful in applications such as
\r
1563 * searching, where you need to be able to determine the position in
\r
1564 * the input text that corresponds to a given normalized output character.
\r
1566 * <b>Note:</b> This method sets the position in the <em>input</em>, while
\r
1567 * {@link #next} and {@link #previous} iterate through characters in the
\r
1568 * <em>output</em>. This means that there is not necessarily a one-to-one
\r
1569 * correspondence between characters returned by <tt>next</tt> and
\r
1570 * <tt>previous</tt> and the indices passed to and returned from
\r
1571 * <tt>setIndex</tt> and {@link #getIndex}.
\r
1572 * @return The current iteration position
\r
1575 public int getIndex() {
\r
1576 if(bufferPos<buffer.length()) {
\r
1577 return currentIndex;
\r
1584 * Retrieve the index of the start of the input text. This is the begin
\r
1585 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
\r
1586 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
\r
1587 * @return The current iteration position
\r
1590 public int startIndex() {
\r
1595 * Retrieve the index of the end of the input text. This is the end index
\r
1596 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
\r
1597 * over which this <tt>Normalizer</tt> is iterating
\r
1598 * @return The current iteration position
\r
1601 public int endIndex() {
\r
1602 return text.getLength();
\r
1605 //-------------------------------------------------------------------------
\r
1606 // Iterator attributes
\r
1607 //-------------------------------------------------------------------------
\r
1609 * Set the normalization mode for this object.
\r
1611 * <b>Note:</b>If the normalization mode is changed while iterating
\r
1612 * over a string, calls to {@link #next} and {@link #previous} may
\r
1613 * return previously buffers characters in the old normalization mode
\r
1614 * until the iteration is able to re-sync at the next base character.
\r
1615 * It is safest to call {@link #setText setText()}, {@link #first},
\r
1616 * {@link #last}, etc. after calling <tt>setMode</tt>.
\r
1618 * @param newMode the new mode for this <tt>Normalizer</tt>.
\r
1619 * The supported modes are:
\r
1621 * <li>{@link #NFC} - Unicode canonical decompositiion
\r
1622 * followed by canonical composition.
\r
1623 * <li>{@link #NFKC} - Unicode compatibility decompositiion
\r
1624 * follwed by canonical composition.
\r
1625 * <li>{@link #NFD} - Unicode canonical decomposition
\r
1626 * <li>{@link #NFKD} - Unicode compatibility decomposition.
\r
1627 * <li>{@link #NONE} - Do nothing but return characters
\r
1628 * from the underlying input text.
\r
1634 public void setMode(Mode newMode) {
\r
1636 norm2 = mode.getNormalizer2(options);
\r
1639 * Return the basic operation performed by this <tt>Normalizer</tt>
\r
1644 public Mode getMode() {
\r
1648 * Set options that affect this <tt>Normalizer</tt>'s operation.
\r
1649 * Options do not change the basic composition or decomposition operation
\r
1650 * that is being performed , but they control whether
\r
1651 * certain optional portions of the operation are done.
\r
1652 * Currently the only available option is:
\r
1655 * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
\r
1658 * @param option the option whose value is to be set.
\r
1659 * @param value the new setting for the option. Use <tt>true</tt> to
\r
1660 * turn the option on and <tt>false</tt> to turn it off.
\r
1665 public void setOption(int option,boolean value) {
\r
1667 options |= option;
\r
1669 options &= (~option);
\r
1671 norm2 = mode.getNormalizer2(options);
\r
1675 * Determine whether an option is turned on or off.
\r
1680 public int getOption(int option) {
\r
1681 if((options & option)!=0) {
\r
1689 * Gets the underlying text storage
\r
1690 * @param fillIn the char buffer to fill the UTF-16 units.
\r
1691 * The length of the buffer should be equal to the length of the
\r
1692 * underlying text storage
\r
1693 * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
\r
1697 public int getText(char[] fillIn) {
\r
1698 return text.getText(fillIn);
\r
1702 * Gets the length of underlying text storage
\r
1703 * @return the length
\r
1706 public int getLength() {
\r
1707 return text.getLength();
\r
1711 * Returns the text under iteration as a string
\r
1712 * @return a copy of the text under iteration.
\r
1715 public String getText() {
\r
1716 return text.getText();
\r
1720 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1721 * The iteration position is set to the beginning of the input text.
\r
1722 * @param newText The new string to be normalized.
\r
1725 public void setText(StringBuffer newText) {
\r
1726 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1727 if (newIter == null) {
\r
1728 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1735 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1736 * The iteration position is set to the beginning of the input text.
\r
1737 * @param newText The new string to be normalized.
\r
1740 public void setText(char[] newText) {
\r
1741 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1742 if (newIter == null) {
\r
1743 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1750 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1751 * The iteration position is set to the beginning of the input text.
\r
1752 * @param newText The new string to be normalized.
\r
1755 public void setText(String newText) {
\r
1756 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1757 if (newIter == null) {
\r
1758 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1765 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1766 * The iteration position is set to the beginning of the input text.
\r
1767 * @param newText The new string to be normalized.
\r
1770 public void setText(CharacterIterator newText) {
\r
1771 UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
\r
1772 if (newIter == null) {
\r
1773 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1780 * Set the input text over which this <tt>Normalizer</tt> will iterate.
\r
1781 * The iteration position is set to the beginning of the string.
\r
1782 * @param newText The new string to be normalized.
\r
1785 public void setText(UCharacterIterator newText) {
\r
1787 UCharacterIterator newIter = (UCharacterIterator)newText.clone();
\r
1788 if (newIter == null) {
\r
1789 throw new IllegalStateException("Could not create a new UCharacterIterator");
\r
1793 }catch(CloneNotSupportedException e) {
\r
1794 throw new IllegalStateException("Could not clone the UCharacterIterator");
\r
1798 private void clearBuffer() {
\r
1799 buffer.setLength(0);
\r
1803 private boolean nextNormalize() {
\r
1805 currentIndex=nextIndex;
\r
1806 text.setIndex(nextIndex);
\r
1807 // Skip at least one character so we make progress.
\r
1808 int c=text.nextCodePoint();
\r
1812 StringBuilder segment=new StringBuilder().appendCodePoint(c);
\r
1813 while((c=text.nextCodePoint())>=0) {
\r
1814 if(norm2.hasBoundaryBefore(c)) {
\r
1815 text.moveCodePointIndex(-1);
\r
1818 segment.appendCodePoint(c);
\r
1820 nextIndex=text.getIndex();
\r
1821 norm2.normalize(segment, buffer);
\r
1822 return buffer.length()!=0;
\r
1825 private boolean previousNormalize() {
\r
1827 nextIndex=currentIndex;
\r
1828 text.setIndex(currentIndex);
\r
1829 StringBuilder segment=new StringBuilder();
\r
1831 while((c=text.previousCodePoint())>=0) {
\r
1833 segment.insert(0, (char)c);
\r
1835 segment.insert(0, Character.toChars(c));
\r
1837 if(norm2.hasBoundaryBefore(c)) {
\r
1841 currentIndex=text.getIndex();
\r
1842 norm2.normalize(segment, buffer);
\r
1843 bufferPos=buffer.length();
\r
1844 return buffer.length()!=0;
\r
1847 /* compare canonically equivalent ------------------------------------------- */
\r
1849 // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
\r
1850 private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
\r
1851 int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
\r
1852 options|= COMPARE_EQUIV;
\r
1855 * UAX #21 Case Mappings, as fixed for Unicode version 4
\r
1856 * (see Jitterbug 2021), defines a canonical caseless match as
\r
1858 * A string X is a canonical caseless match
\r
1859 * for a string Y if and only if
\r
1860 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
\r
1862 * For better performance, we check for FCD (or let the caller tell us that
\r
1863 * both strings are in FCD) for the inner normalization.
\r
1864 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
\r
1865 * case-folding preserves the FCD-ness of a string.
\r
1866 * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
\r
1867 * when there is a difference.
\r
1869 * Exception: When using the Turkic case-folding option, we do perform
\r
1870 * full NFD first. This is because in the Turkic case precomposed characters
\r
1871 * with 0049 capital I or 0069 small i fold differently whether they
\r
1872 * are first decomposed or not, so an FCD check - a check only for
\r
1873 * canonical order - is not sufficient.
\r
1875 if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
\r
1877 if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
\r
1878 n2=NFD.getNormalizer2(normOptions);
\r
1880 n2=FCD.getNormalizer2(normOptions);
\r
1883 // check if s1 and/or s2 fulfill the FCD conditions
\r
1884 int spanQCYes1=n2.spanQuickCheckYes(s1);
\r
1885 int spanQCYes2=n2.spanQuickCheckYes(s2);
\r
1888 * ICU 2.4 had a further optimization:
\r
1889 * If both strings were not in FCD, then they were both NFD'ed,
\r
1890 * and the COMPARE_EQUIV option was turned off.
\r
1891 * It is not entirely clear that this is valid with the current
\r
1892 * definition of the canonical caseless match.
\r
1893 * Therefore, ICU 2.6 removes that optimization.
\r
1896 if(spanQCYes1<s1.length()) {
\r
1897 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
\r
1898 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
\r
1900 if(spanQCYes2<s2.length()) {
\r
1901 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
\r
1902 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
\r
1906 return cmpEquivFold(s1, s2, options);
\r
/*
 * Compare two strings for canonical equivalence.
 * Further options include case-insensitive comparison and
 * code point order (as opposed to code unit order).
 *
 * In this function, canonical equivalence is optional as well.
 * If canonical equivalence is tested, then both strings must fulfill
 * the FCD check.
 *
 * Semantically, this is equivalent to
 *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
 * where code point order, NFD and foldCase are all optional.
 *
 * String comparisons almost always yield results before processing both strings
 * completely.
 * They are generally more efficient working incrementally instead of
 * performing the sub-processing (strlen, normalization, case-folding)
 * on the entire strings first.
 *
 * It is also unnecessary to not normalize identical characters.
 *
 * This function works in principle as follows:
 *
 * loop {
 *   get one code unit c1 from s1 (-1 if end of source)
 *   get one code unit c2 from s2 (-1 if end of source)
 *
 *   if(either string finished) {
 *     return result;
 *   }
 *   if(c1==c2) {
 *     continue;
 *   }
 *
 *   // c1!=c2
 *   try to decompose/case-fold c1/c2, and continue if one does;
 *
 *   // still c1!=c2 and neither decomposes/case-folds, return result
 *   return c1-c2;
 * }
 *
 * When a character decomposes, then the pointer for that source changes to
 * the decomposition, pushing the previous pointer onto a stack.
 * When the end of the decomposition is reached, then the code unit reader
 * pops the previous source from the stack.
 * (Same for case-folding.)
 *
 * This is complicated further by operating on variable-width UTF-16.
 * The top part of the loop works on code units, while lookups for decomposition
 * and case-folding need code points.
 * Code points are assembled after the equality/end-of-source part.
 * The source pointer is only advanced beyond all code units when the code point
 * actually decomposes/case-folds.
 *
 * If we were on a trail surrogate unit when assembling a code point,
 * and the code point decomposes/case-folds, then the decomposition/folding
 * result must be compared with the part of the other string that corresponds to
 * this string's lead surrogate.
 * Since we only assemble a code point when hitting a trail unit when the
 * preceding lead units were identical, we back up the other string by one unit
 * in such a case.
 *
 * The optional code point order comparison at the end works with
 * the same fix-up as the other code point order comparison functions.
 * See ustring.c and the comment near the end of this function.
 *
 * Assumption: A decomposition or case-folding result string never contains
 * a single surrogate. This is a safe assumption in the Unicode Standard.
 * Therefore, we do not need to check for surrogate pairs across
 * decomposition/case-folding boundaries.
 *
 * Further assumptions (see verifications tstnorm.cpp):
 * The API function checks for FCD first, while the core function
 * first case-folds and then decomposes. This requires that case-folding does not
 * un-FCD any strings.
 *
 * The API function may also NFD the input and turn off decomposition.
 * This requires that case-folding does not un-NFD strings either.
 *
 * TODO If any of the above two assumptions is violated,
 * then this entire code must be re-thought.
 * If this happens, then a simple solution is to case-fold both strings up front
 * and to turn off UNORM_INPUT_IS_FCD.
 * We already do this when not both strings are in FCD because makeFCD
 * would be a partial NFD before the case folding, which does not work.
 * Note that all of this is only a problem when case-folding _and_
 * canonical equivalence come together.
 * (Comments in unorm_compare() are more up to date than this TODO.)
 */
1999 /* stack element for previous-level source/decomposition pointers */
\r
2000 private static final class CmpEquivLevel {
\r
2004 private static final CmpEquivLevel[] createCmpEquivLevelStack() {
\r
2005 return new CmpEquivLevel[] {
\r
2006 new CmpEquivLevel(), new CmpEquivLevel()
\r
2011 * Internal option for unorm_cmpEquivFold() for decomposing.
\r
2012 * If not set, just do strcasecmp().
\r
2014 private static final int COMPARE_EQUIV=0x80000;
\r
2016 /* internal function; package visibility for use by UTF16.StringComparator */
\r
2017 /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
\r
2018 Normalizer2Impl nfcImpl;
\r
2021 /* current-level start/limit - s1/s2 as current */
\r
2022 int s1, s2, limit1, limit2;
\r
2024 /* decomposition and case folding variables */
\r
2027 /* stacks of previous-level start/current/limit */
\r
2028 CmpEquivLevel[] stack1=null, stack2=null;
\r
2030 /* buffers for algorithmic decompositions */
\r
2031 String decomp1, decomp2;
\r
2033 /* case folding buffers, only use current-level start/limit */
\r
2034 StringBuffer fold1, fold2;
\r
2036 /* track which is the current level per string */
\r
2037 int level1, level2;
\r
2039 /* current code units, and code points for lookups */
\r
2040 int c1, c2, cp1, cp2;
\r
2042 /* no argument error checking because this itself is not an API */
\r
2045 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
\r
2046 * otherwise this function must behave exactly as uprv_strCompare()
\r
2047 * not checking for that here makes testing this function easier
\r
2050 /* normalization/properties data loaded? */
\r
2051 if((options&COMPARE_EQUIV)!=0) {
\r
2052 nfcImpl=Norm2AllModes.getNFCInstance().impl;
\r
2056 if((options&COMPARE_IGNORE_CASE)!=0) {
\r
2058 csp=UCaseProps.getSingleton();
\r
2059 } catch(IOException e) {
\r
2060 throw new RuntimeException(e);
\r
2062 fold1=new StringBuffer();
\r
2063 fold2=new StringBuffer();
\r
2071 limit1=cs1.length();
\r
2073 limit2=cs2.length();
\r
2078 /* comparison loop */
\r
2081 * here a code unit value of -1 means "get another code unit"
\r
2082 * below it will mean "this source is finished"
\r
2086 /* get next code unit from string 1, post-increment */
\r
2094 c1=cs1.charAt(s1++);
\r
2098 /* reached end of level buffer, pop one level */
\r
2101 cs1=stack1[level1].cs;
\r
2102 } while(cs1==null);
\r
2103 s1=stack1[level1].s;
\r
2104 limit1=cs1.length();
\r
2109 /* get next code unit from string 2, post-increment */
\r
2117 c2=cs2.charAt(s2++);
\r
2121 /* reached end of level buffer, pop one level */
\r
2124 cs2=stack2[level2].cs;
\r
2125 } while(cs2==null);
\r
2126 s2=stack2[level2].s;
\r
2127 limit2=cs2.length();
\r
2132 * compare c1 and c2
\r
2133 * either variable c1, c2 is -1 only if the corresponding string is finished
\r
2137 return 0; /* c1==c2==-1 indicating end of strings */
\r
2139 c1=c2=-1; /* make us fetch new code units */
\r
2142 return -1; /* string 1 ends before string 2 */
\r
2144 return 1; /* string 2 ends before string 1 */
\r
2146 /* c1!=c2 && c1>=0 && c2>=0 */
\r
2148 /* get complete code points for c1, c2 for lookups if either is a surrogate */
\r
2150 if(UTF16.isSurrogate((char)c1)) {
\r
2153 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
\r
2154 if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
\r
2155 /* advance ++s1; only below if cp1 decomposes/case-folds */
\r
2156 cp1=Character.toCodePoint((char)c1, c);
\r
2158 } else /* isTrail(c1) */ {
\r
2159 if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
\r
2160 cp1=Character.toCodePoint(c, (char)c1);
\r
2166 if(UTF16.isSurrogate((char)c2)) {
\r
2169 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
\r
2170 if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
\r
2171 /* advance ++s2; only below if cp2 decomposes/case-folds */
\r
2172 cp2=Character.toCodePoint((char)c2, c);
\r
2174 } else /* isTrail(c2) */ {
\r
2175 if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
\r
2176 cp2=Character.toCodePoint(c, (char)c2);
\r
2182 * go down one level for each string
\r
2183 * continue with the main loop as soon as there is a real change
\r
2186 if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
\r
2187 (length=csp.toFullFolding(cp1, fold1, options))>=0
\r
2189 /* cp1 case-folds to the code point "length" or to p[length] */
\r
2190 if(UTF16.isSurrogate((char)c1)) {
\r
2191 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
\r
2192 /* advance beyond source surrogate pair if it case-folds */
\r
2194 } else /* isTrail(c1) */ {
\r
2196 * we got a supplementary code point when hitting its trail surrogate,
\r
2197 * therefore the lead surrogate must have been the same as in the other string;
\r
2198 * compare this decomposition with the lead surrogate in the other string
\r
2199 * remember that this simulates bulk text replacement:
\r
2200 * the decomposition would replace the entire code point
\r
2203 c2=cs2.charAt(s2-1);
\r
2207 /* push current level pointers */
\r
2208 if(stack1==null) {
\r
2209 stack1=createCmpEquivLevelStack();
\r
2215 /* copy the folding result to fold1[] */
\r
2216 /* Java: the buffer was probably not empty, remove the old contents */
\r
2217 if(length<=UCaseProps.MAX_STRING_LENGTH) {
\r
2218 fold1.delete(0, fold1.length()-length);
\r
2220 fold1.setLength(0);
\r
2221 fold1.appendCodePoint(length);
\r
2224 /* set next level pointers to case folding */
\r
2227 limit1=fold1.length();
\r
2229 /* get ready to read from decomposition, continue with loop */
\r
2234 if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
\r
2235 (length=csp.toFullFolding(cp2, fold2, options))>=0
\r
2237 /* cp2 case-folds to the code point "length" or to p[length] */
\r
2238 if(UTF16.isSurrogate((char)c2)) {
\r
2239 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
\r
2240 /* advance beyond source surrogate pair if it case-folds */
\r
2242 } else /* isTrail(c2) */ {
\r
2244 * we got a supplementary code point when hitting its trail surrogate,
\r
2245 * therefore the lead surrogate must have been the same as in the other string;
\r
2246 * compare this decomposition with the lead surrogate in the other string
\r
2247 * remember that this simulates bulk text replacement:
\r
2248 * the decomposition would replace the entire code point
\r
2251 c1=cs1.charAt(s1-1);
\r
2255 /* push current level pointers */
\r
2256 if(stack2==null) {
\r
2257 stack2=createCmpEquivLevelStack();
\r
2263 /* copy the folding result to fold2[] */
\r
2264 /* Java: the buffer was probably not empty, remove the old contents */
\r
2265 if(length<=UCaseProps.MAX_STRING_LENGTH) {
\r
2266 fold2.delete(0, fold2.length()-length);
\r
2268 fold2.setLength(0);
\r
2269 fold2.appendCodePoint(length);
\r
2272 /* set next level pointers to case folding */
\r
2275 limit2=fold2.length();
\r
2277 /* get ready to read from decomposition, continue with loop */
\r
2282 if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
\r
2283 (decomp1=nfcImpl.getDecomposition(cp1))!=null
\r
2285 /* cp1 decomposes into p[length] */
\r
2286 if(UTF16.isSurrogate((char)c1)) {
\r
2287 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
\r
2288 /* advance beyond source surrogate pair if it decomposes */
\r
2290 } else /* isTrail(c1) */ {
\r
2292 * we got a supplementary code point when hitting its trail surrogate,
\r
2293 * therefore the lead surrogate must have been the same as in the other string;
\r
2294 * compare this decomposition with the lead surrogate in the other string
\r
2295 * remember that this simulates bulk text replacement:
\r
2296 * the decomposition would replace the entire code point
\r
2299 c2=cs2.charAt(s2-1);
\r
2303 /* push current level pointers */
\r
2304 if(stack1==null) {
\r
2305 stack1=createCmpEquivLevelStack();
\r
2307 stack1[level1].cs=cs1;
\r
2308 stack1[level1].s=s1;
\r
2311 /* set empty intermediate level if skipped */
\r
2313 stack1[level1++].cs=null;
\r
2316 /* set next level pointers to decomposition */
\r
2319 limit1=decomp1.length();
\r
2321 /* get ready to read from decomposition, continue with loop */
\r
2326 if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
\r
2327 (decomp2=nfcImpl.getDecomposition(cp2))!=null
\r
2329 /* cp2 decomposes into p[length] */
\r
2330 if(UTF16.isSurrogate((char)c2)) {
\r
2331 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
\r
2332 /* advance beyond source surrogate pair if it decomposes */
\r
2334 } else /* isTrail(c2) */ {
\r
2336 * we got a supplementary code point when hitting its trail surrogate,
\r
2337 * therefore the lead surrogate must have been the same as in the other string;
\r
2338 * compare this decomposition with the lead surrogate in the other string
\r
2339 * remember that this simulates bulk text replacement:
\r
2340 * the decomposition would replace the entire code point
\r
2343 c1=cs1.charAt(s1-1);
\r
2347 /* push current level pointers */
\r
2348 if(stack2==null) {
\r
2349 stack2=createCmpEquivLevelStack();
\r
2351 stack2[level2].cs=cs2;
\r
2352 stack2[level2].s=s2;
\r
2355 /* set empty intermediate level if skipped */
\r
2357 stack2[level2++].cs=null;
\r
2360 /* set next level pointers to decomposition */
\r
2363 limit2=decomp2.length();
\r
2365 /* get ready to read from decomposition, continue with loop */
\r
2371 * no decomposition/case folding, max level for both sides:
\r
2372 * return difference result
\r
2374 * code point order comparison must not just return cp1-cp2
\r
2375 * because when single surrogates are present then the surrogate pairs
\r
2376 * that formed cp1 and cp2 may be from different string indexes
\r
2378 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
\r
2379 * c1=d800 cp1=10001 c2=dc00 cp2=10000
\r
2380 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
\r
2382 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
\r
2383 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
\r
2384 * so we have slightly different pointer/start/limit comparisons here
\r
2387 if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
\r
2388 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
\r
2390 (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
\r
2391 (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
\r
2393 /* part of a surrogate pair, leave >=d800 */
\r
2395 /* BMP code point - may be surrogate code point - make <d800 */
\r
2400 (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
\r
2401 (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
\r
2403 /* part of a surrogate pair, leave >=d800 */
\r
2405 /* BMP code point - may be surrogate code point - make <d800 */
\r
2415 * An Appendable that writes into a char array with a capacity that may be
\r
2416 * less than array.length.
\r
2417 * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
\r
2419 * An overflow is only reported at the end, for the old Normalizer API functions that write
\r
2422 private static final class CharsAppendable implements Appendable {
\r
2423 public CharsAppendable(char[] dest, int destStart, int destLimit) {
\r
2425 start=offset=destStart;
\r
2428 public int length() {
\r
2429 int len=offset-start;
\r
2430 if(offset<=limit) {
\r
2433 throw new IndexOutOfBoundsException(Integer.toString(len));
\r
2436 public Appendable append(char c) {
\r
2437 if(offset<limit) {
\r
2443 public Appendable append(CharSequence s) {
\r
2444 return append(s, 0, s.length());
\r
2446 public Appendable append(CharSequence s, int sStart, int sLimit) {
\r
2447 int len=sLimit-sStart;
\r
2448 if(len<=(limit-offset)) {
\r
2449 while(sStart<sLimit) { // TODO: Is there a better way to copy the characters?
\r
2450 chars[offset++]=s.charAt(sStart++);
\r
2458 private final char[] chars;
\r
2459 private final int start, limit;
\r
2460 private int offset;
\r