jars/icu4j-4_4_2-src/main/tests/core/src/com/ibm/icu/dev/test/normalizer/NormalizerData.java

   1 /*\r
   2  * Copyright (C) 1998-2007 International Business Machines Corporation and\r
   3  * Unicode, Inc. All Rights Reserved.<br>\r
   4  * The Unicode Consortium makes no expressed or implied warranty of any\r
   5  * kind, and assumes no liability for errors or omissions.\r
   6  * No liability is assumed for incidental and consequential damages\r
   7  * in connection with or arising out of the use of the information here.\r
   8  */\r
   9 package com.ibm.icu.dev.test.normalizer;\r
  10 \r
  11 import java.util.BitSet;\r
  12 \r
  13 import com.ibm.icu.dev.test.UTF16Util;\r
  14 \r
  15 /**\r
  16  * Accesses the Normalization Data used for Forms C and D.<br>\r
  17  * @author Mark Davis\r
  18  * Updates for supplementary code points:\r
  19  * Vladimir Weinstein & Markus Scherer\r
  20  */\r
  21 public class NormalizerData {\r
  22 //    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";\r
  23 \r
  24     /**\r
  25     * Constant for use in getPairwiseComposition\r
  26     */\r
  27     public static final int NOT_COMPOSITE = '\uFFFF';\r
  28 \r
  29     /**\r
  30     * Gets the combining class of a character from the\r
  31     * Unicode Character Database.\r
  32     * @param   ch      the source character\r
  33     * @return          value from 0 to 255\r
  34     */\r
  35     public int getCanonicalClass(int ch) {\r
  36         return canonicalClass.get(ch);\r
  37     }\r
  38 \r
  39     /**\r
  40     * Returns the composite of the two characters. If the two\r
  41     * characters don't combine, returns NOT_COMPOSITE.\r
  42     * @param   first   first character (e.g. 'c')\r
  43     * @param   second  second character (e.g. \u0327 cedilla)\r
  44     * @return          composite (e.g. \u00C7 c cedilla)\r
  45     */\r
  46     public int getPairwiseComposition(int first, int second) {\r
  47         return compose.get(((long)first << 32) | second);\r
  48     }\r
  49 \r
  50 \r
  51     /**\r
  52     * Gets recursive decomposition of a character from the\r
  53     * Unicode Character Database.\r
  54     * @param   canonical    If true\r
  55     *                  bit is on in this byte, then selects the recursive\r
  56     *                  canonical decomposition, otherwise selects\r
  57     *                  the recursive compatibility and canonical decomposition.\r
  58     * @param   ch      the source character\r
  59     * @param   buffer  buffer to be filled with the decomposition\r
  60     */\r
  61     public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) {\r
  62         String decomp = decompose.get(ch);\r
  63         if (decomp != null && !(canonical && isCompatibility.get(ch))) {\r
  64             for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) {\r
  65                 ch = UTF16Util.nextCodePoint(decomp, i);\r
  66                 getRecursiveDecomposition(canonical, ch, buffer);\r
  67             }\r
  68         } else {                    // if no decomp, append\r
  69             UTF16Util.appendCodePoint(buffer, ch);\r
  70         }\r
  71     }\r
  72 \r
  73     // =================================================\r
  74     //                   PRIVATES\r
  75     // =================================================\r
  76 \r
  77     /**\r
  78      * Only accessed by NormalizerBuilder.\r
  79      */\r
  80     NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose,\r
  81       LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {\r
  82         this.canonicalClass = canonicalClass;\r
  83         this.decompose = decompose;\r
  84         this.compose = compose;\r
  85         this.isCompatibility = isCompatibility;\r
  86         this.isExcluded = isExcluded;\r
  87     }\r
  88 \r
  89     /**\r
  90     * Just accessible for testing.\r
  91     */\r
  92     boolean getExcluded (char ch) {\r
  93         return isExcluded.get(ch);\r
  94     }\r
  95 \r
  96     /**\r
  97     * Just accessible for testing.\r
  98     */\r
  99     String getRawDecompositionMapping (char ch) {\r
 100         return decompose.get(ch);\r
 101     }\r
 102 \r
 103     /**\r
 104     * For now, just use IntHashtable\r
 105     * Two-stage tables would be used in an optimized implementation.\r
 106     */\r
 107     private IntHashtable canonicalClass;\r
 108 \r
 109     /**\r
 110     * The main data table maps chars to a 32-bit int.\r
 111     * It holds either a pair: top = first, bottom = second\r
 112     * or singleton: top = 0, bottom = single.\r
 113     * If there is no decomposition, the value is 0.\r
 114     * Two-stage tables would be used in an optimized implementation.\r
 115     * An optimization could also map chars to a small index, then use that\r
 116     * index in a small array of ints.\r
 117     */\r
 118     private IntStringHashtable decompose;\r
 119 \r
 120     /**\r
 121     * Maps from pairs of characters to single.\r
 122     * If there is no decomposition, the value is NOT_COMPOSITE.\r
 123     */\r
 124     private LongHashtable compose;\r
 125 \r
 126     /**\r
 127     * Tells whether decomposition is canonical or not.\r
 128     */\r
 129     private BitSet isCompatibility = new BitSet();\r
 130 \r
 131     /**\r
 132     * Tells whether character is script-excluded or not.\r
 133     * Used only while building, and for testing.\r
 134     */\r
 135 \r
 136     private BitSet isExcluded = new BitSet();\r
 137 }\r