jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/text/Normalizer2.java

   1 /*\r
   2 *******************************************************************************\r
   3 *   Copyright (C) 2009-2010, International Business Machines\r
   4 *   Corporation and others.  All Rights Reserved.\r
   5 *******************************************************************************\r
   6 */\r
   7 package com.ibm.icu.text;\r
   8 \r
   9 import java.io.InputStream;\r
  10 \r
  11 import com.ibm.icu.impl.Norm2AllModes;\r
  12 \r
  13 /**\r
  14  * Unicode normalization functionality for standard Unicode normalization or\r
  15  * for using custom mapping tables.\r
  16  * All instances of this class are unmodifiable/immutable.\r
  17  * <p>\r
  18  * The primary functions are to produce a normalized string and to detect whether\r
  19  * a string is already normalized.\r
  20  * The most commonly used normalization forms are those defined in\r
  21  * http://www.unicode.org/unicode/reports/tr15/\r
  22  * However, this API supports additional normalization forms for specialized purposes.\r
  23  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)\r
  24  * and can be used in implementations of UTS #46.\r
  25  * <p>\r
  26  * Not only are the standard compose and decompose modes supplied,\r
  27  * but additional modes are provided as documented in the Mode enum.\r
  28  * <p>\r
  29  * Some of the functions in this class identify normalization boundaries.\r
  30  * At a normalization boundary, the portions of the string\r
  31  * before it and starting from it do not interact and can be handled independently.\r
  32  * <p>\r
  33  * The spanQuickCheckYes() stops at a normalization boundary.\r
  34  * When the goal is a normalized string, then the text before the boundary\r
  35  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().\r
  36  * <p>\r
  37  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether\r
  38  * a character is guaranteed to be at a normalization boundary,\r
  39  * regardless of context.\r
  40  * This is used for moving from one normalization boundary to the next\r
  41  * or preceding boundary, and for performing iterative normalization.\r
  42  * <p>\r
  43  * Iterative normalization is useful when only a small portion of a\r
  44  * longer string needs to be processed.\r
  45  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator\r
  46  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()\r
  47  * (to process only the substring for which sort key bytes are computed).\r
  48  * <p>\r
  49  * The set of normalization boundaries returned by these functions may not be\r
  50  * complete: There may be more boundaries that could be returned.\r
  51  * Different functions may return different boundaries.\r
  52  * @draft ICU 4.4\r
  53  * @provisional This API might change or be removed in a future release.\r
  54  * @author Markus W. Scherer\r
  55  */\r
  56 public abstract class Normalizer2 {\r
  57     /**\r
  58      * Constants for normalization modes.\r
  59      * For details about standard Unicode normalization forms\r
  60      * and about the algorithms which are also used with custom mapping tables\r
  61      * see http://www.unicode.org/unicode/reports/tr15/\r
  62      * @draft ICU 4.4\r
  63      * @provisional This API might change or be removed in a future release.\r
  64      */\r
  65     public enum Mode {\r
  66         /**\r
  67          * Decomposition followed by composition.\r
  68          * Same as standard NFC when using an "nfc" instance.\r
  69          * Same as standard NFKC when using an "nfkc" instance.\r
  70          * For details about standard Unicode normalization forms\r
  71          * see http://www.unicode.org/unicode/reports/tr15/\r
  72          * @draft ICU 4.4\r
  73          * @provisional This API might change or be removed in a future release.\r
  74          */\r
  75         COMPOSE,\r
  76         /**\r
  77          * Map, and reorder canonically.\r
  78          * Same as standard NFD when using an "nfc" instance.\r
  79          * Same as standard NFKD when using an "nfkc" instance.\r
  80          * For details about standard Unicode normalization forms\r
  81          * see http://www.unicode.org/unicode/reports/tr15/\r
  82          * @draft ICU 4.4\r
  83          * @provisional This API might change or be removed in a future release.\r
  84          */\r
  85         DECOMPOSE,\r
  86         /**\r
  87          * "Fast C or D" form.\r
  88          * If a string is in this form, then further decomposition <i>without reordering</i>\r
  89          * would yield the same form as DECOMPOSE.\r
  90          * Text in "Fast C or D" form can be processed efficiently with data tables\r
  91          * that are "canonically closed", that is, that provide equivalent data for\r
  92          * equivalent text, without having to be fully normalized.<br>\r
  93          * Not a standard Unicode normalization form.<br>\r
  94          * Not a unique form: Different FCD strings can be canonically equivalent.<br>\r
  95          * For details see http://www.unicode.org/notes/tn5/#FCD\r
  96          * @draft ICU 4.4\r
  97          * @provisional This API might change or be removed in a future release.\r
  98          */\r
  99         FCD,\r
 100         /**\r
 101          * Compose only contiguously.\r
 102          * Also known as "FCC" or "Fast C Contiguous".\r
 103          * The result will often but not always be in NFC.\r
 104          * The result will conform to FCD which is useful for processing.<br>\r
 105          * Not a standard Unicode normalization form.<br>\r
 106          * For details see http://www.unicode.org/notes/tn5/#FCC\r
 107          * @draft ICU 4.4\r
 108          * @provisional This API might change or be removed in a future release.\r
 109          */\r
 110         COMPOSE_CONTIGUOUS\r
 111     };\r
 112 \r
 113     /**\r
 114      * Returns a Normalizer2 instance which uses the specified data file\r
 115      * (an ICU data file if data=null, or else custom binary data)\r
 116      * and which composes or decomposes text according to the specified mode.\r
 117      * Returns an unmodifiable singleton instance.\r
 118      * <ul>\r
 119      * <li>Use data=null for data files that are part of ICU's own data.\r
 120      * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD.\r
 121      * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD.\r
 122      * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.\r
 123      * </ul>\r
 124      * If data!=null, then the binary data is read once and cached using the provided\r
 125      * name as the key.\r
 126      * If you know or expect the data to be cached already, you can use data!=null\r
 127      * for non-ICU data as well.\r
 128      * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data\r
 129      * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file\r
 130      * @param mode normalization mode (compose or decompose etc.)\r
 131      * @return the requested Normalizer2, if successful\r
 132      * @draft ICU 4.4\r
 133      * @provisional This API might change or be removed in a future release.\r
 134      */\r
 135     public static Normalizer2 getInstance(InputStream data, String name, Mode mode) {\r
 136         Norm2AllModes all2Modes=Norm2AllModes.getInstance(data, name);\r
 137         switch(mode) {\r
 138         case COMPOSE: return all2Modes.comp;\r
 139         case DECOMPOSE: return all2Modes.decomp;\r
 140         case FCD: return all2Modes.fcd;\r
 141         case COMPOSE_CONTIGUOUS: return all2Modes.fcc;\r
 142         default: return null;  // will not occur\r
 143         }\r
 144     }\r
 145 \r
 146     /**\r
 147      * Returns the normalized form of the source string.\r
 148      * @param src source string\r
 149      * @return normalized src\r
 150      * @draft ICU 4.4\r
 151      * @provisional This API might change or be removed in a future release.\r
 152      */\r
 153     public String normalize(CharSequence src) {\r
 154         return normalize(src, new StringBuilder()).toString();\r
 155     }\r
 156     /**\r
 157      * Writes the normalized form of the source string to the destination string\r
 158      * (replacing its contents) and returns the destination string.\r
 159      * The source and destination strings must be different objects.\r
 160      * @param src source string\r
 161      * @param dest destination string; its contents is replaced with normalized src\r
 162      * @return dest\r
 163      * @draft ICU 4.4\r
 164      * @provisional This API might change or be removed in a future release.\r
 165      */\r
 166     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);\r
 167     /**\r
 168      * Writes the normalized form of the source string to the destination Appendable\r
 169      * and returns the destination Appendable.\r
 170      * The source and destination strings must be different objects.\r
 171      * @param src source string\r
 172      * @param dest destination Appendable; gets normalized src appended\r
 173      * @return dest\r
 174      * @internal ICU 4.4 TODO: propose for 4.6\r
 175      * @provisional This API might change or be removed in a future release.\r
 176      */\r
 177     public abstract Appendable normalize(CharSequence src, Appendable dest);\r
 178     /**\r
 179      * Appends the normalized form of the second string to the first string\r
 180      * (merging them at the boundary) and returns the first string.\r
 181      * The result is normalized if the first string was normalized.\r
 182      * The first and second strings must be different objects.\r
 183      * @param first string, should be normalized\r
 184      * @param second string, will be normalized\r
 185      * @return first\r
 186      * @draft ICU 4.4\r
 187      * @provisional This API might change or be removed in a future release.\r
 188      */\r
 189     public abstract StringBuilder normalizeSecondAndAppend(\r
 190             StringBuilder first, CharSequence second);\r
 191     /**\r
 192      * Appends the second string to the first string\r
 193      * (merging them at the boundary) and returns the first string.\r
 194      * The result is normalized if both the strings were normalized.\r
 195      * The first and second strings must be different objects.\r
 196      * @param first string, should be normalized\r
 197      * @param second string, should be normalized\r
 198      * @return first\r
 199      * @draft ICU 4.4\r
 200      * @provisional This API might change or be removed in a future release.\r
 201      */\r
 202     public abstract StringBuilder append(StringBuilder first, CharSequence second);\r
 203 \r
 204     /**\r
 205      * Tests if the string is normalized.\r
 206      * Internally, in cases where the quickCheck() method would return "maybe"\r
 207      * (which is only possible for the two COMPOSE modes) this method\r
 208      * resolves to "yes" or "no" to provide a definitive result,\r
 209      * at the cost of doing more work in those cases.\r
 210      * @param s input string\r
 211      * @return true if s is normalized\r
 212      * @draft ICU 4.4\r
 213      * @provisional This API might change or be removed in a future release.\r
 214      */\r
 215     public abstract boolean isNormalized(CharSequence s);\r
 216 \r
 217     /**\r
 218      * Tests if the string is normalized.\r
 219      * For the two COMPOSE modes, the result could be "maybe" in cases that\r
 220      * would take a little more work to resolve definitively.\r
 221      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster\r
 222      * combination of quick check + normalization, to avoid\r
 223      * re-checking the "yes" prefix.\r
 224      * @param s input string\r
 225      * @return the quick check result\r
 226      * @draft ICU 4.4\r
 227      * @provisional This API might change or be removed in a future release.\r
 228      */\r
 229     public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s);\r
 230 \r
 231     /**\r
 232      * Returns the end of the normalized substring of the input string.\r
 233      * In other words, with <code>end=spanQuickCheckYes(s);</code>\r
 234      * the substring <code>s.subSequence(0, end)</code>\r
 235      * will pass the quick check with a "yes" result.\r
 236      * <p>\r
 237      * The returned end index is usually one or more characters before the\r
 238      * "no" or "maybe" character: The end index is at a normalization boundary.\r
 239      * (See the class documentation for more about normalization boundaries.)\r
 240      * <p>\r
 241      * When the goal is a normalized string and most input strings are expected\r
 242      * to be normalized already, then call this method,\r
 243      * and if it returns a prefix shorter than the input string,\r
 244      * copy that prefix and use normalizeSecondAndAppend() for the remainder.\r
 245      * @param s input string\r
 246      * @return "yes" span end index\r
 247      * @draft ICU 4.4\r
 248      * @provisional This API might change or be removed in a future release.\r
 249      */\r
 250     public abstract int spanQuickCheckYes(CharSequence s);\r
 251 \r
 252     /**\r
 253      * Tests if the character always has a normalization boundary before it,\r
 254      * regardless of context.\r
 255      * If true, then the character does not normalization-interact with\r
 256      * preceding characters.\r
 257      * In other words, a string containing this character can be normalized\r
 258      * by processing portions before this character and starting from this\r
 259      * character independently.\r
 260      * This is used for iterative normalization. See the class documentation for details.\r
 261      * @param c character to test\r
 262      * @return true if c has a normalization boundary before it\r
 263      * @draft ICU 4.4\r
 264      * @provisional This API might change or be removed in a future release.\r
 265      */\r
 266     public abstract boolean hasBoundaryBefore(int c);\r
 267 \r
 268     /**\r
 269      * Tests if the character always has a normalization boundary after it,\r
 270      * regardless of context.\r
 271      * If true, then the character does not normalization-interact with\r
 272      * following characters.\r
 273      * In other words, a string containing this character can be normalized\r
 274      * by processing portions up to this character and after this\r
 275      * character independently.\r
 276      * This is used for iterative normalization. See the class documentation for details.\r
 277      * <p>\r
 278      * Note that this operation may be significantly slower than hasBoundaryBefore().\r
 279      * @param c character to test\r
 280      * @return true if c has a normalization boundary after it\r
 281      * @draft ICU 4.4\r
 282      * @provisional This API might change or be removed in a future release.\r
 283      */\r
 284     public abstract boolean hasBoundaryAfter(int c);\r
 285 \r
 286     /**\r
 287      * Tests if the character is normalization-inert.\r
 288      * If true, then the character does not change, nor normalization-interact with\r
 289      * preceding or following characters.\r
 290      * In other words, a string containing this character can be normalized\r
 291      * by processing portions before this character and after this\r
 292      * character independently.\r
 293      * This is used for iterative normalization. See the class documentation for details.\r
 294      * <p>\r
 295      * Note that this operation may be significantly slower than hasBoundaryBefore().\r
 296      * @param c character to test\r
 297      * @return true if c is normalization-inert\r
 298      * @draft ICU 4.4\r
 299      * @provisional This API might change or be removed in a future release.\r
 300      */\r
 301     public abstract boolean isInert(int c);\r
 302 \r
 303     /**\r
 304      * Sole constructor.  (For invocation by subclass constructors,\r
 305      * typically implicit.)\r
 306      * @internal\r
 307      * @deprecated This API is ICU internal only.\r
 308      */\r
 309     protected Normalizer2() {\r
 310     }\r
 311 }\r