jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UnicodeNormalizer.java

   1 /*
   2  * Copyright (C) 1998-2007 International Business Machines Corporation and
   3  * Unicode, Inc. All Rights Reserved.<br>
   4  * The Unicode Consortium makes no expressed or implied warranty of any
   5  * kind, and assumes no liability for errors or omissions.
   6  * No liability is assumed for incidental and consequential damages
   7  * in connection with or arising out of the use of the information here.
   8  */
   9
  10 package com.ibm.icu.dev.test.normalizer;
  11
  12 import com.ibm.icu.dev.test.UTF16Util;
  13
  14 /**
  15  * Implements Unicode Normalization Forms C, D, KC, KD.<br>
  16  * See UTR#15 for details.<br>
  17  * @author Mark Davis
  18  * Updates for supplementary code points:
  19  * Vladimir Weinstein & Markus Scherer
  20  */
  21 public class UnicodeNormalizer {
  22 //    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
  23
  24     /**
  25      * Create a normalizer for a given form.
  26      */
  27     public UnicodeNormalizer(byte form, boolean fullData) {
  28         this.form = form;
  29         if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
  30     }
  31
  32     /**
  33     * Masks for the form selector
  34     */
  35     static final byte
  36         COMPATIBILITY_MASK = 1,
  37         COMPOSITION_MASK = 2;
  38
  39     /**
  40     * Normalization Form Selector
  41     */
  42     public static final byte
  43         D = 0 ,
  44         C = COMPOSITION_MASK,
  45         KD = COMPATIBILITY_MASK,
  46         KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
  47
  48     /**
  49     * Normalizes text according to the chosen form,
  50     * replacing contents of the target buffer.
  51     * @param   source      the original text, unnormalized
  52     * @param   target      the resulting normalized text
  53     */
  54     public StringBuffer normalize(String source, StringBuffer target) {
  55
  56         // First decompose the source into target,
  57         // then compose if the form requires.
  58
  59         if (source.length() != 0) {
  60             internalDecompose(source, target);
  61             if ((form & COMPOSITION_MASK) != 0) {
  62                 internalCompose(target);
  63             }
  64         }
  65         return target;
  66     }
  67
  68     /**
  69     * Normalizes text according to the chosen form
  70     * @param   source      the original text, unnormalized
  71     * @return  target      the resulting normalized text
  72     */
  73     public String normalize(String source) {
  74         return normalize(source, new StringBuffer()).toString();
  75     }
  76
  77     // ======================================
  78     //                  PRIVATES
  79     // ======================================
  80
  81     /**
  82      * The current form.
  83      */
  84     private byte form;
  85
  86     /**
  87     * Decomposes text, either canonical or compatibility,
  88     * replacing contents of the target buffer.
  89     * @param   form        the normalization form. If COMPATIBILITY_MASK
  90     *                      bit is on in this byte, then selects the recursive
  91     *                      compatibility decomposition, otherwise selects
  92     *                      the recursive canonical decomposition.
  93     * @param   source      the original text, unnormalized
  94     * @param   target      the resulting normalized text
  95     */
  96     private void internalDecompose(String source, StringBuffer target) {
  97         StringBuffer buffer = new StringBuffer();
  98         boolean canonical = (form & COMPATIBILITY_MASK) == 0;
  99         int ch;
 100         for (int i = 0; i < source.length();) {
 101             buffer.setLength(0);
 102             ch = UTF16Util.nextCodePoint(source, i);
 103             i+=UTF16Util.codePointLength(ch);
 104             data.getRecursiveDecomposition(canonical, ch, buffer);
 105
 106             // add all of the characters in the decomposition.
 107             // (may be just the original character, if there was
 108             // no decomposition mapping)
 109
 110             for (int j = 0; j < buffer.length();) {
 111                 ch = UTF16Util.nextCodePoint(buffer, j);
 112                 j+=UTF16Util.codePointLength(ch);
 113                 int chClass = data.getCanonicalClass(ch);
 114                 int k = target.length(); // insertion point
 115                 if (chClass != 0) {
 116
 117                     // bubble-sort combining marks as necessary
 118
 119                     int ch2;
 120                     for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
 121                         ch2 = UTF16Util.prevCodePoint(target, k);
 122                         if (data.getCanonicalClass(ch2) <= chClass) break;
 123                     }
 124                 }
 125                 UTF16Util.insertCodePoint(target, k, ch);
 126             }
 127         }
 128     }
 129
 130     /**
 131     * Composes text in place. Target must already
 132     * have been decomposed.
 133     * @param   target      input: decomposed text.
 134     *                      output: the resulting normalized text.
 135     */
 136     private void internalCompose(StringBuffer target) {
 137
 138         int starterPos = 0;
 139         int starterCh = UTF16Util.nextCodePoint(target,0);
 140         int compPos = UTF16Util.codePointLength(starterCh);
 141         int lastClass = data.getCanonicalClass(starterCh);
 142         if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
 143
 144         // Loop on the decomposed characters, combining where possible
 145
 146         for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
 147             int ch = UTF16Util.nextCodePoint(target, decompPos);
 148             decompPos += UTF16Util.codePointLength(ch);
 149             int chClass = data.getCanonicalClass(ch);
 150             int composite = data.getPairwiseComposition(starterCh, ch);
 151             if (composite != NormalizerData.NOT_COMPOSITE
 152             && (lastClass < chClass || lastClass == 0)) {
 153                 UTF16Util.setCodePointAt(target, starterPos, composite);
 154                 starterCh = composite;
 155             } else {
 156                 if (chClass == 0) {
 157                     starterPos = compPos;
 158                     starterCh  = ch;
 159                 }
 160                 lastClass = chClass;
 161                 decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
 162                 compPos += UTF16Util.codePointLength(ch);
 163             }
 164         }
 165         target.setLength(compPos);
 166     }
 167
 168     /**
 169     * Contains normalization data from the Unicode Character Database.
 170     * use false for the minimal set, true for the real set.
 171     */
 172     private static NormalizerData data = null;
 173
 174     /**
 175     * Just accessible for testing.
 176     */
 177     boolean getExcluded (char ch) {
 178         return data.getExcluded(ch);
 179     }
 180
 181     /**
 182     * Just accessible for testing.
 183     */
 184     String getRawDecompositionMapping (char ch) {
 185         return data.getRawDecompositionMapping(ch);
 186     }
 187 }