/*
 * Copyright (C) 1998-2007 International Business Machines Corporation and
 * Unicode, Inc. All Rights Reserved.<br>
 * The Unicode Consortium makes no expressed or implied warranty of any
 * kind, and assumes no liability for errors or omissions.
 * No liability is assumed for incidental and consequential damages
 * in connection with or arising out of the use of the information here.
 */
\r
9 package com.ibm.icu.dev.test.normalizer;
\r
11 import java.util.BitSet;
\r
13 import com.ibm.icu.dev.test.UTF16Util;
\r
/**
 * Accesses the Normalization Data used for Forms C and D.<br>
 * @author Mark Davis
 * Updates for supplementary code points:
 * Vladimir Weinstein & Markus Scherer
 */
\r
21 public class NormalizerData {
\r
22 // static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
\r
25 * Constant for use in getPairwiseComposition
\r
27 public static final int NOT_COMPOSITE = '\uFFFF';
\r
30 * Gets the combining class of a character from the
\r
31 * Unicode Character Database.
\r
32 * @param ch the source character
\r
33 * @return value from 0 to 255
\r
35 public int getCanonicalClass(int ch) {
\r
36 return canonicalClass.get(ch);
\r
40 * Returns the composite of the two characters. If the two
\r
41 * characters don't combine, returns NOT_COMPOSITE.
\r
42 * @param first first character (e.g. 'c')
\r
43 * @param second second character (e.g. \u0327 cedilla)
\r
44 * @return composite (e.g. \u00C7 c cedilla)
\r
46 public int getPairwiseComposition(int first, int second) {
\r
47 return compose.get(((long)first << 32) | second);
\r
52 * Gets recursive decomposition of a character from the
\r
53 * Unicode Character Database.
\r
54 * @param canonical If true
\r
55 * bit is on in this byte, then selects the recursive
\r
56 * canonical decomposition, otherwise selects
\r
57 * the recursive compatibility and canonical decomposition.
\r
58 * @param ch the source character
\r
59 * @param buffer buffer to be filled with the decomposition
\r
61 public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) {
\r
62 String decomp = decompose.get(ch);
\r
63 if (decomp != null && !(canonical && isCompatibility.get(ch))) {
\r
64 for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) {
\r
65 ch = UTF16Util.nextCodePoint(decomp, i);
\r
66 getRecursiveDecomposition(canonical, ch, buffer);
\r
68 } else { // if no decomp, append
\r
69 UTF16Util.appendCodePoint(buffer, ch);
\r
73 // =================================================
\r
75 // =================================================
\r
78 * Only accessed by NormalizerBuilder.
\r
80 NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose,
\r
81 LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
\r
82 this.canonicalClass = canonicalClass;
\r
83 this.decompose = decompose;
\r
84 this.compose = compose;
\r
85 this.isCompatibility = isCompatibility;
\r
86 this.isExcluded = isExcluded;
\r
90 * Just accessible for testing.
\r
92 boolean getExcluded (char ch) {
\r
93 return isExcluded.get(ch);
\r
97 * Just accessible for testing.
\r
99 String getRawDecompositionMapping (char ch) {
\r
100 return decompose.get(ch);
\r
104 * For now, just use IntHashtable
\r
105 * Two-stage tables would be used in an optimized implementation.
\r
107 private IntHashtable canonicalClass;
\r
110 * The main data table maps chars to a 32-bit int.
\r
111 * It holds either a pair: top = first, bottom = second
\r
112 * or singleton: top = 0, bottom = single.
\r
113 * If there is no decomposition, the value is 0.
\r
114 * Two-stage tables would be used in an optimized implementation.
\r
115 * An optimization could also map chars to a small index, then use that
\r
116 * index in a small array of ints.
\r
118 private IntStringHashtable decompose;
\r
121 * Maps from pairs of characters to single.
\r
122 * If there is no decomposition, the value is NOT_COMPOSITE.
\r
124 private LongHashtable compose;
\r
127 * Tells whether decomposition is canonical or not.
\r
129 private BitSet isCompatibility = new BitSet();
\r
132 * Tells whether character is script-excluded or not.
\r
133 * Used only while building, and for testing.
\r
136 private BitSet isExcluded = new BitSet();
\r