2 * Copyright (C) 1998-2007 International Business Machines Corporation and
\r
3 * Unicode, Inc. All Rights Reserved.<br>
\r
4 * The Unicode Consortium makes no expressed or implied warranty of any
\r
5 * kind, and assumes no liability for errors or omissions.
\r
6 * No liability is assumed for incidental and consequential damages
\r
7 * in connection with or arising out of the use of the information here.
\r
10 package com.ibm.icu.dev.test.normalizer;
\r
12 import com.ibm.icu.dev.test.UTF16Util;
\r
15 * Implements Unicode Normalization Forms C, D, KC, KD.<br>
\r
16 * See UTR#15 for details.<br>
\r
17 * @author Mark Davis
\r
18 * Updates for supplementary code points:
\r
19 * Vladimir Weinstein & Markus Scherer
\r
21 public class UnicodeNormalizer {
\r
22 // static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
\r
/**
 * Creates a normalizer for the given normalization form.
 *
 * <p>The normalization data is held in a static field shared by every
 * instance and is built only when that field is still {@code null}, so
 * {@code fullData} only has an effect on the first construction.
 *
 * @param form     the normalization form selector, e.g. {@link #C},
 *                 {@link #KD} or {@link #KC}
 * @param fullData handed to {@code NormalizerBuilder.build} when the shared
 *                 data is loaded; false for the minimal set, true for the
 *                 real set
 */
public UnicodeNormalizer(byte form, boolean fullData) {
    this.form = form;
    if (data == null) {
        // load 1st time
        data = NormalizerBuilder.build(fullData);
    }
}
\r
33 * Masks for the form selector
\r
36 COMPATIBILITY_MASK = 1,
\r
37 COMPOSITION_MASK = 2;
\r
40 * Normalization Form Selector
\r
42 public static final byte
\r
44 C = COMPOSITION_MASK,
\r
45 KD = COMPATIBILITY_MASK,
\r
46 KC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
\r
49 * Normalizes text according to the chosen form,
\r
50 * replacing contents of the target buffer.
\r
51 * @param source the original text, unnormalized
\r
52 * @param target the resulting normalized text
\r
54 public StringBuffer normalize(String source, StringBuffer target) {
\r
56 // First decompose the source into target,
\r
57 // then compose if the form requires.
\r
59 if (source.length() != 0) {
\r
60 internalDecompose(source, target);
\r
61 if ((form & COMPOSITION_MASK) != 0) {
\r
62 internalCompose(target);
\r
69 * Normalizes text according to the chosen form
\r
70 * @param source the original text, unnormalized
\r
71 * @return target the resulting normalized text
\r
73 public String normalize(String source) {
\r
74 return normalize(source, new StringBuffer()).toString();
\r
77 // ======================================
\r
79 // ======================================
\r
87 * Decomposes text, either canonical or compatibility,
\r
88 * replacing contents of the target buffer.
\r
89 * @param form the normalization form. If COMPATIBILITY_MASK
\r
90 * bit is on in this byte, then selects the recursive
\r
91 * compatibility decomposition, otherwise selects
\r
92 * the recursive canonical decomposition.
\r
93 * @param source the original text, unnormalized
\r
94 * @param target the resulting normalized text
\r
96 private void internalDecompose(String source, StringBuffer target) {
\r
97 StringBuffer buffer = new StringBuffer();
\r
98 boolean canonical = (form & COMPATIBILITY_MASK) == 0;
\r
100 for (int i = 0; i < source.length();) {
\r
101 buffer.setLength(0);
\r
102 ch = UTF16Util.nextCodePoint(source, i);
\r
103 i+=UTF16Util.codePointLength(ch);
\r
104 data.getRecursiveDecomposition(canonical, ch, buffer);
\r
106 // add all of the characters in the decomposition.
\r
107 // (may be just the original character, if there was
\r
108 // no decomposition mapping)
\r
110 for (int j = 0; j < buffer.length();) {
\r
111 ch = UTF16Util.nextCodePoint(buffer, j);
\r
112 j+=UTF16Util.codePointLength(ch);
\r
113 int chClass = data.getCanonicalClass(ch);
\r
114 int k = target.length(); // insertion point
\r
115 if (chClass != 0) {
\r
117 // bubble-sort combining marks as necessary
\r
120 for (; k > 0; k -= UTF16Util.codePointLength(ch2)) {
\r
121 ch2 = UTF16Util.prevCodePoint(target, k);
\r
122 if (data.getCanonicalClass(ch2) <= chClass) break;
\r
125 UTF16Util.insertCodePoint(target, k, ch);
\r
131 * Composes text in place. Target must already
\r
132 * have been decomposed.
\r
133 * @param target input: decomposed text.
\r
134 * output: the resulting normalized text.
\r
136 private void internalCompose(StringBuffer target) {
\r
138 int starterPos = 0;
\r
139 int starterCh = UTF16Util.nextCodePoint(target,0);
\r
140 int compPos = UTF16Util.codePointLength(starterCh);
\r
141 int lastClass = data.getCanonicalClass(starterCh);
\r
142 if (lastClass != 0) lastClass = 256; // fix for irregular combining sequence
\r
144 // Loop on the decomposed characters, combining where possible
\r
146 for (int decompPos = UTF16Util.codePointLength(starterCh); decompPos < target.length(); ) {
\r
147 int ch = UTF16Util.nextCodePoint(target, decompPos);
\r
148 decompPos += UTF16Util.codePointLength(ch);
\r
149 int chClass = data.getCanonicalClass(ch);
\r
150 int composite = data.getPairwiseComposition(starterCh, ch);
\r
151 if (composite != NormalizerData.NOT_COMPOSITE
\r
152 && (lastClass < chClass || lastClass == 0)) {
\r
153 UTF16Util.setCodePointAt(target, starterPos, composite);
\r
154 starterCh = composite;
\r
156 if (chClass == 0) {
\r
157 starterPos = compPos;
\r
160 lastClass = chClass;
\r
161 decompPos += UTF16Util.setCodePointAt(target, compPos, ch);
\r
162 compPos += UTF16Util.codePointLength(ch);
\r
165 target.setLength(compPos);
\r
169 * Contains normalization data from the Unicode Character Database.
\r
170 * use false for the minimal set, true for the real set.
\r
172 private static NormalizerData data = null;
\r
175 * Just accessible for testing.
\r
177 boolean getExcluded (char ch) {
\r
178 return data.getExcluded(ch);
\r
182 * Just accessible for testing.
\r
184 String getRawDecompositionMapping (char ch) {
\r
185 return data.getRawDecompositionMapping(ch);
\r