2 * Copyright (C) 1998-2007 International Business Machines Corporation and
3 * Unicode, Inc. All Rights Reserved.<br>
4 * The Unicode Consortium makes no expressed or implied warranty of any
5 * kind, and assumes no liability for errors or omissions.
6 * No liability is assumed for incidental and consequential damages
7 * in connection with or arising out of the use of the information here.
9 package com.ibm.icu.dev.test.normalizer;
11 import java.util.BitSet;
13 import com.ibm.icu.dev.test.UTF16Util;
16 * Accesses the Normalization Data used for Forms C and D.<br>
18 * Updates for supplementary code points:
19 * Vladimir Weinstein & Markus Scherer
21 public class NormalizerData {
22 // static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
25 * Constant for use in getPairwiseComposition
27 public static final int NOT_COMPOSITE = '\uFFFF';
30 * Gets the combining class of a character from the
31 * Unicode Character Database.
32 * @param ch the source character
33 * @return value from 0 to 255
35 public int getCanonicalClass(int ch) {
36 return canonicalClass.get(ch);
40 * Returns the composite of the two characters. If the two
41 * characters don't combine, returns NOT_COMPOSITE.
42 * @param first first character (e.g. 'c')
43 * @param second second character (e.g. \u0327 cedilla)
44 * @return composite (e.g. \u00C7 c cedilla)
46 public int getPairwiseComposition(int first, int second) {
47 return compose.get(((long)first << 32) | second);
52 * Gets recursive decomposition of a character from the
53 * Unicode Character Database.
54 * @param canonical If true
55 * bit is on in this byte, then selects the recursive
56 * canonical decomposition, otherwise selects
57 * the recursive compatibility and canonical decomposition.
58 * @param ch the source character
59 * @param buffer buffer to be filled with the decomposition
61 public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) {
62 String decomp = decompose.get(ch);
63 if (decomp != null && !(canonical && isCompatibility.get(ch))) {
64 for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) {
65 ch = UTF16Util.nextCodePoint(decomp, i);
66 getRecursiveDecomposition(canonical, ch, buffer);
68 } else { // if no decomp, append
69 UTF16Util.appendCodePoint(buffer, ch);
73 // =================================================
75 // =================================================
78 * Only accessed by NormalizerBuilder.
80 NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose,
81 LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
82 this.canonicalClass = canonicalClass;
83 this.decompose = decompose;
84 this.compose = compose;
85 this.isCompatibility = isCompatibility;
86 this.isExcluded = isExcluded;
90 * Just accessible for testing.
92 boolean getExcluded (char ch) {
93 return isExcluded.get(ch);
97 * Just accessible for testing.
99 String getRawDecompositionMapping (char ch) {
100 return decompose.get(ch);
104 * For now, just use IntHashtable
105 * Two-stage tables would be used in an optimized implementation.
107 private IntHashtable canonicalClass;
110 * The main data table maps chars to a 32-bit int.
111 * It holds either a pair: top = first, bottom = second
112 * or singleton: top = 0, bottom = single.
113 * If there is no decomposition, the value is 0.
114 * Two-stage tables would be used in an optimized implementation.
115 * An optimization could also map chars to a small index, then use that
116 * index in a small array of ints.
118 private IntStringHashtable decompose;
121 * Maps from pairs of characters to single.
122 * If there is no decomposition, the value is NOT_COMPOSITE.
124 private LongHashtable compose;
127 * Tells whether decomposition is canonical or not.
129 private BitSet isCompatibility = new BitSet();
132 * Tells whether character is script-excluded or not.
133 * Used only while building, and for testing.
136 private BitSet isExcluded = new BitSet();