jars/icu4j-4_4_2-src/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UnicodeNormalizer.java

   1 /*\r
   2  * Copyright (C) 1998-2007 International Business Machines Corporation and\r
   3  * Unicode, Inc. All Rights Reserved.<br>\r
   4  * The Unicode Consortium makes no expressed or implied warranty of any\r
   5  * kind, and assumes no liability for errors or omissions.\r
   6  * No liability is assumed for incidental and consequential damages\r
   7  * in connection with or arising out of the use of the information here.\r
   8  */\r
   9 \r
  10 package com.ibm.icu.dev.test.normalizer;\r
  11 \r
  12 import com.ibm.icu.dev.test.UTF16Util;\r
  13 \r
  14 /**\r
  15  * Implements Unicode Normalization Forms C, D, KC, KD.<br>\r
  16  * See UTR#15 for details.<br>\r
  17  * @author Mark Davis\r
  18  * Updates for supplementary code points:\r
  19  * Vladimir Weinstein & Markus Scherer\r
  20  */\r
  21 public class UnicodeNormalizer {\r
  22 //    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";\r
  23 \r
  24     /**\r
  25      * Create a normalizer for a given form.\r
  26      */\r
  27     public UnicodeNormalizer(byte form, boolean fullData) {\r
  28         this.form = form;\r
  29         if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time\r
  30     }\r
  31 \r
  32     /**\r
  33     * Masks for the form selector\r
  34     */\r
  35     static final byte\r
  36         COMPATIBILITY_MASK = 1,\r
  37         COMPOSITION_MASK = 2;\r
  38 \r
  39     /**\r
  40     * Normalization Form Selector\r
  41     */\r
  42     public static final byte\r
  43         D = 0 ,\r
  44         C = COMPOSITION_MASK,\r
  45         KD = COMPATIBILITY_MASK,\r
  46         KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);\r
  47 \r
  48     /**\r
  49     * Normalizes text according to the chosen form,\r
  50     * replacing contents of the target buffer.\r
  51     * @param   source      the original text, unnormalized\r
  52     * @param   target      the resulting normalized text\r
  53     */\r
  54     public StringBuffer normalize(String source, StringBuffer target) {\r
  55 \r
  56         // First decompose the source into target,\r
  57         // then compose if the form requires.\r
  58 \r
  59         if (source.length() != 0) {\r
  60             internalDecompose(source, target);\r
  61             if ((form & COMPOSITION_MASK) != 0) {\r
  62                 internalCompose(target);\r
  63             }\r
  64         }\r
  65         return target;\r
  66     }\r
  67 \r
  68     /**\r
  69     * Normalizes text according to the chosen form\r
  70     * @param   source      the original text, unnormalized\r
  71     * @return  target      the resulting normalized text\r
  72     */\r
  73     public String normalize(String source) {\r
  74         return normalize(source, new StringBuffer()).toString();\r
  75     }\r
  76 \r
  77     // ======================================\r
  78     //                  PRIVATES\r
  79     // ======================================\r
  80 \r
  81     /**\r
  82      * The current form.\r
  83      */\r
  84     private byte form;\r
  85 \r
  86     /**\r
  87     * Decomposes text, either canonical or compatibility,\r
  88     * replacing contents of the target buffer.\r
  89     * @param   form        the normalization form. If COMPATIBILITY_MASK\r
  90     *                      bit is on in this byte, then selects the recursive\r
  91     *                      compatibility decomposition, otherwise selects\r
  92     *                      the recursive canonical decomposition.\r
  93     * @param   source      the original text, unnormalized\r
  94     * @param   target      the resulting normalized text\r
  95     */\r
  96     private void internalDecompose(String source, StringBuffer target) {\r
  97         StringBuffer buffer = new StringBuffer();\r
  98         boolean canonical = (form & COMPATIBILITY_MASK) == 0;\r
  99         int ch;\r
 100         for (int i = 0; i < source.length();) {\r
 101             buffer.setLength(0);\r
 102             ch = UTF16Util.nextCodePoint(source, i);\r
 103             i+=UTF16Util.codePointLength(ch);\r
 104             data.getRecursiveDecomposition(canonical, ch, buffer);\r
 105 \r
 106             // add all of the characters in the decomposition.\r
 107             // (may be just the original character, if there was\r
 108             // no decomposition mapping)\r
 109 \r
 110             for (int j = 0; j < buffer.length();) {\r
 111                 ch = UTF16Util.nextCodePoint(buffer, j);\r
 112                 j+=UTF16Util.codePointLength(ch);\r
 113                 int chClass = data.getCanonicalClass(ch);\r
 114                 int k = target.length(); // insertion point\r
 115                 if (chClass != 0) {\r
 116 \r
 117                     // bubble-sort combining marks as necessary\r
 118 \r
 119                     int ch2;\r
 120                     for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {\r
 121                         ch2 = UTF16Util.prevCodePoint(target, k);\r
 122                         if (data.getCanonicalClass(ch2) <= chClass) break;\r
 123                     }\r
 124                 }\r
 125                 UTF16Util.insertCodePoint(target, k, ch);\r
 126             }\r
 127         }\r
 128     }\r
 129 \r
 130     /**\r
 131     * Composes text in place. Target must already\r
 132     * have been decomposed.\r
 133     * @param   target      input: decomposed text.\r
 134     *                      output: the resulting normalized text.\r
 135     */\r
 136     private void internalCompose(StringBuffer target) {\r
 137 \r
 138         int starterPos = 0;\r
 139         int starterCh = UTF16Util.nextCodePoint(target,0);\r
 140         int compPos = UTF16Util.codePointLength(starterCh);\r
 141         int lastClass = data.getCanonicalClass(starterCh);\r
 142         if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence\r
 143 \r
 144         // Loop on the decomposed characters, combining where possible\r
 145 \r
 146         for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {\r
 147             int ch = UTF16Util.nextCodePoint(target, decompPos);\r
 148             decompPos += UTF16Util.codePointLength(ch);\r
 149             int chClass = data.getCanonicalClass(ch);\r
 150             int composite = data.getPairwiseComposition(starterCh, ch);\r
 151             if (composite != NormalizerData.NOT_COMPOSITE\r
 152             && (lastClass < chClass || lastClass == 0)) {\r
 153                 UTF16Util.setCodePointAt(target, starterPos, composite);\r
 154                 starterCh = composite;\r
 155             } else {\r
 156                 if (chClass == 0) {\r
 157                     starterPos = compPos;\r
 158                     starterCh  = ch;\r
 159                 }\r
 160                 lastClass = chClass;\r
 161                 decompPos += UTF16Util.setCodePointAt(target, compPos, ch);\r
 162                 compPos += UTF16Util.codePointLength(ch);\r
 163             }\r
 164         }\r
 165         target.setLength(compPos);\r
 166     }\r
 167 \r
 168     /**\r
 169     * Contains normalization data from the Unicode Character Database.\r
 170     * use false for the minimal set, true for the real set.\r
 171     */\r
 172     private static NormalizerData data = null;\r
 173 \r
 174     /**\r
 175     * Just accessible for testing.\r
 176     */\r
 177     boolean getExcluded (char ch) {\r
 178         return data.getExcluded(ch);\r
 179     }\r
 180 \r
 181     /**\r
 182     * Just accessible for testing.\r
 183     */\r
 184     String getRawDecompositionMapping (char ch) {\r
 185         return data.getRawDecompositionMapping(ch);\r
 186     }\r
 187 }