2 * Copyright (C) 1998-2007 International Business Machines Corporation and
3 * Unicode, Inc. All Rights Reserved.<br>
4 * The Unicode Consortium makes no expressed or implied warranty of any
5 * kind, and assumes no liability for errors or omissions.
6 * No liability is assumed for incidental and consequential damages
7 * in connection with or arising out of the use of the information here.
10 package com.ibm.icu.dev.test.normalizer;
12 import com.ibm.icu.dev.test.UTF16Util;
15 * Implements Unicode Normalization Forms C, D, KC, KD.<br>
16 * See UTR#15 for details.<br>
18 * Updates for supplementary code points:
19 * Vladimir Weinstein & Markus Scherer
21 public class UnicodeNormalizer {
22 // static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
25 * Create a normalizer for a given form.
27 public UnicodeNormalizer(byte form, boolean fullData) {
// NOTE(review): original line 28 is not shown in this listing; presumably it stores
// `form` in the instance field that normalize() and internalDecompose() read - confirm.
//
// `data` is a static field, and it is built only while it is still null. `fullData`
// therefore takes effect on the first construction only; every later instance reuses
// the table that was built first, whatever value of `fullData` it passes.
29 if (data == null) data = NormalizerBuilder.build(fullData); // load 1st time
33 * Masks for the form selector
36 COMPATIBILITY_MASK = 1, // bit tested by internalDecompose(): set = compatibility decomposition, clear = canonical
40 * Normalization Form Selector
42 public static final byte
// NOTE(review): original lines 43-44 are not shown in this listing; presumably they
// define the D and C selectors - confirm.
// KD: only the compatibility bit is set.
45 KD = COMPATIBILITY_MASK,
// KC: sum of both masks, i.e. both bits set assuming the two masks are distinct bits
// (the value of COMPOSITION_MASK is not visible here - TODO confirm).
46 KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
49 * Normalizes text according to the chosen form,
50 * replacing contents of the target buffer.
51 * @param source the original text, unnormalized
52 * @param target the resulting normalized text
54 public StringBuffer normalize(String source, StringBuffer target) {
// Two phases: decomposition always runs for non-empty input; composition runs only
// when the COMPOSITION_MASK bit of the instance's form is set.
56 // First decompose the source into target,
57 // then compose if the form requires.
// Empty input skips both phases. internalCompose() reads the code point at index 0
// of target, so it is only reached after decomposition has been given some input.
59 if (source.length() != 0) {
60 internalDecompose(source, target);
61 if ((form & COMPOSITION_MASK) != 0) {
62 internalCompose(target);
// NOTE(review): the remaining lines of this method (original 63-66) are not shown in
// this listing. No line visible here clears `target` before decomposing, although the
// Javadoc says its contents are replaced - confirm how a non-empty target is handled.
69 * Normalizes text according to the chosen form
70 * @param source the original text, unnormalized
71 * @return the resulting normalized text
73 public String normalize(String source) {
// Convenience overload: runs the two-argument normalize() on a fresh StringBuffer
// and returns that buffer's contents as a String.
74 return normalize(source, new StringBuffer()).toString();
77 // ======================================
79 // ======================================
87 * Decomposes text, either canonical or compatibility,
88 * replacing contents of the target buffer.
89 * The instance's normalization form selects the mapping: if the
90 * COMPATIBILITY_MASK bit is on in that byte, the recursive
91 * compatibility decomposition is used, otherwise
92 * the recursive canonical decomposition.
93 * @param source the original text, unnormalized
94 * @param target the resulting normalized text
96 private void internalDecompose(String source, StringBuffer target) {
// Walks `source` one code point at a time, asks `data` for each code point's recursive
// decomposition, and inserts the resulting code points into `target`, moving each one
// back past trailing code points of higher canonical class.
// NOTE(review): several original lines (99, 101, 115-116, 118-119, 123-124, 126+) are
// not shown in this listing, including the declarations of `ch` and `ch2`.
97 StringBuffer buffer = new StringBuffer();
// canonical is true when the COMPATIBILITY_MASK bit of the instance's form is clear.
98 boolean canonical = (form & COMPATIBILITY_MASK) == 0;
100 for (int i = 0; i < source.length();) {
// Read the code point at index i, then advance i by that code point's length.
102 ch = UTF16Util.nextCodePoint(source, i);
103 i+=UTF16Util.codePointLength(ch);
// NOTE(review): `buffer` is reused across iterations and no visible line empties it;
// presumably the omitted line 101 or getRecursiveDecomposition() resets it - confirm.
104 data.getRecursiveDecomposition(canonical, ch, buffer);
106 // add all of the characters in the decomposition.
107 // (may be just the original character, if there was
108 // no decomposition mapping)
110 for (int j = 0; j < buffer.length();) {
// `ch` is reused here for each code point of the decomposition.
111 ch = UTF16Util.nextCodePoint(buffer, j);
112 j+=UTF16Util.codePointLength(ch);
113 int chClass = data.getCanonicalClass(ch);
// Start at the end of target; the loop below may move k toward the front.
114 int k = target.length(); // insertion point
117 // bubble-sort combining marks as necessary
// Step k backwards one code point at a time while the code point just before k has a
// canonical class greater than chClass; stop at the first one whose class is <= chClass,
// so code points of equal class keep their relative order.
// NOTE(review): presumably the omitted lines 115-116 guard this loop so that it only
// runs when chClass != 0 - confirm.
120 for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
121 ch2 = UTF16Util.prevCodePoint(target, k);
122 if (data.getCanonicalClass(ch2) <= chClass) break;
// Insert ch at the position found above (the end of target if nothing was skipped).
125 UTF16Util.insertCodePoint(target, k, ch);
131 * Composes text in place. Target must already
132 * have been decomposed.
133 * @param target input: decomposed text.
134 * output: the resulting normalized text.
136 private void internalCompose(StringBuffer target) {
// Rewrites `target` in place: each code point after the first is either merged into the
// current starter (when a pairwise composite exists and the combination is not blocked)
// or copied down to the write position `compPos`. The buffer is truncated to `compPos`
// at the end.
// NOTE(review): several original lines (137-138, 155-156, 158-160, 163-164) are not
// shown in this listing, including the declaration of `starterPos` and what appears to
// be an `else` branch around lines 157-162 - confirm against the full file.
// Reads the code point at index 0; normalize() only calls this for non-empty source.
139 int starterCh = UTF16Util.nextCodePoint(target,0);
// compPos is the write index for the composed output; it starts just past the first code point.
140 int compPos = UTF16Util.codePointLength(starterCh);
141 int lastClass = data.getCanonicalClass(starterCh);
// A non-zero class on the first code point is replaced by 256. Presumably 256 is above
// every real canonical class, so `lastClass < chClass` below cannot hold - TODO confirm.
142 if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
144 // Loop on the decomposed characters, combining where possible
// decompPos is the read index; it also starts just past the first code point.
146 for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
147 int ch = UTF16Util.nextCodePoint(target, decompPos);
148 decompPos += UTF16Util.codePointLength(ch);
149 int chClass = data.getCanonicalClass(ch);
150 int composite = data.getPairwiseComposition(starterCh, ch);
// Combine only if the pair has a composite AND either the previous class is lower than
// this one or the previous class is 0.
151 if (composite != NormalizerData.NOT_COMPOSITE
152 && (lastClass < chClass || lastClass == 0)) {
// Overwrite the starter in place with the composite and continue with it as the starter.
153 UTF16Util.setCodePointAt(target, starterPos, composite);
154 starterCh = composite;
// The starter position becomes the current write position (enclosing condition not shown).
157 starterPos = compPos;
// Copy ch down to the write position. The value returned by setCodePointAt() is added
// to the read index - presumably the change in buffer length caused by the write; confirm.
161 decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
162 compPos += UTF16Util.codePointLength(ch);
// Drop everything past the last code point written.
165 target.setLength(compPos);
169 * Contains normalization data from the Unicode Character Database.
170 * Built by the constructor with its fullData flag: false for the minimal set, true for the real set.
172 private static NormalizerData data = null; // static: one table shared by all instances; assigned in the constructor while still null
175 * Just accessible for testing.
177 boolean getExcluded (char ch) {
// Package-private pass-through to the shared `data` table. The parameter is a char,
// so only a single UTF-16 code unit can be queried through this method.
178 return data.getExcluded(ch);
182 * Just accessible for testing.
184 String getRawDecompositionMapping (char ch) {
// Package-private pass-through to the shared `data` table. The parameter is a char,
// so only a single UTF-16 code unit can be queried through this method.
185 return data.getRawDecompositionMapping(ch);