]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/tool/layout/CanonicalCharacterData.java
go
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / dev / tool / layout / CanonicalCharacterData.java
1 /**\r
2  *******************************************************************************\r
3  * Copyright (C) 2002-2005, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 \r
8 package com.ibm.icu.dev.tool.layout;\r
9 \r
10 import com.ibm.icu.impl.Utility;\r
11 import com.ibm.icu.lang.UCharacter;\r
12 import com.ibm.icu.lang.UScript;\r
13 import com.ibm.icu.text.UnicodeSet;\r
14 import com.ibm.icu.text.CanonicalIterator;\r
15 import com.ibm.icu.text.UTF16;\r
16 import java.util.Vector;\r
17 \r
18 public class CanonicalCharacterData\r
19 {\r
20     private static int THRESHOLD = 4;\r
21     \r
22     public class Record\r
23     {\r
24         // TODO: might want to save arrays of Char32's rather than UTF16 strings...\r
25         Record(int character, int script)\r
26         {\r
27             String char32 = UCharacter.toString(character);\r
28             CanonicalIterator iterator = new CanonicalIterator(char32);\r
29             Vector equivs = new Vector();\r
30             \r
31             composed = character;\r
32             \r
33             for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) {\r
34                 // Skip all equivalents of length 1; it's either the original\r
35                 // characeter or something like Angstrom for A-Ring, which we don't care about\r
36                 if (UTF16.countCodePoint(equiv) > 1) {\r
37                     equivs.add(equiv);\r
38                 }\r
39             }\r
40             \r
41             int nEquivalents = equivs.size();\r
42             \r
43             if (nEquivalents > maxEquivalents[script]) {\r
44                 maxEquivalents[script] = nEquivalents;\r
45             }\r
46             \r
47             if (nEquivalents > 0) {\r
48                 equivalents = new String[nEquivalents];\r
49                 \r
50                 if (nEquivalents > THRESHOLD) {\r
51                     dumpEquivalents(character, equivs);\r
52                 }\r
53                 \r
54                 sortEquivalents(equivalents, equivs);\r
55             }\r
56         }\r
57         \r
58         public int getComposedCharacter()\r
59         {\r
60             return composed;\r
61         }\r
62         \r
63         public int countEquivalents()\r
64         {\r
65             if (equivalents == null) {\r
66                 return 0;\r
67             }\r
68             \r
69             return equivalents.length;\r
70         }\r
71         \r
72         public String[] getEquivalents()\r
73         {\r
74             return equivalents;\r
75         }\r
76         \r
77         public String getEquivalent(int index)\r
78         {\r
79             if (equivalents == null || index < 0 || index >= equivalents.length) {\r
80                 return null;\r
81             }\r
82             \r
83             return equivalents[index];\r
84         }\r
85         \r
86         private void dumpEquivalents(int character, Vector equivs)\r
87         {\r
88             int count = equivs.size();\r
89             \r
90             System.out.println(Utility.hex(character, 6) + " - " + count + ":");\r
91             \r
92             for (int i = 0; i < count; i += 1) {\r
93                 String equiv = (String) equivs.elementAt(i);\r
94                 int codePoints = UTF16.countCodePoint(equiv);\r
95                 \r
96                 for (int c = 0; c < codePoints; c += 1) {\r
97                     if (c > 0) {\r
98                         System.out.print(" ");\r
99                     }\r
100                     \r
101                     System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6));\r
102                 }\r
103                 \r
104                 System.out.println();\r
105             }\r
106             \r
107             System.out.println();\r
108         }\r
109         \r
110         private int composed;\r
111         private String[] equivalents = null;\r
112     }\r
113     \r
114     public CanonicalCharacterData()\r
115     {\r
116         // nothing to do...\r
117     }\r
118     \r
119     public void add(int character)\r
120     {\r
121         int script = UScript.getScript(character);\r
122         Vector recordVector = recordVectors[script];\r
123         \r
124         if (recordVector == null) {\r
125             recordVector = recordVectors[script] = new Vector();\r
126         }\r
127         \r
128         recordVector.add(new Record(character, script));\r
129     }\r
130     \r
131     public int getMaxEquivalents(int script)\r
132     {\r
133         if (script < 0 || script >= UScript.CODE_LIMIT) {\r
134             return 0;\r
135         }\r
136         \r
137         return maxEquivalents[script];\r
138     }\r
139     \r
140     public Record getRecord(int script, int index)\r
141     {\r
142         if (script < 0 || script >= UScript.CODE_LIMIT) {\r
143             return null;\r
144         }\r
145         \r
146         Vector recordVector = recordVectors[script];\r
147         \r
148         if (recordVector == null || index < 0 || index >= recordVector.size()) {\r
149             return null;\r
150         }\r
151         \r
152         return (Record) recordVector.elementAt(index);\r
153     }\r
154     \r
155     public int countRecords(int script)\r
156     {\r
157         if (script < 0 || script >= UScript.CODE_LIMIT ||\r
158             recordVectors[script] == null) {\r
159             return 0;\r
160         }\r
161         \r
162         return recordVectors[script].size();\r
163     }\r
164  \r
165     public static CanonicalCharacterData factory(UnicodeSet characterSet)\r
166     {\r
167         int charCount = characterSet.size();\r
168         CanonicalCharacterData data = new CanonicalCharacterData();\r
169         \r
170         System.out.println("There are " + charCount + " characters with a canonical decomposition.");\r
171         \r
172         for (int i = 0; i < charCount; i += 1) {\r
173             data.add(characterSet.charAt(i));\r
174         }\r
175         \r
176         return data;\r
177     }\r
178 \r
179     private static int compareEquivalents(String a, String b)\r
180     {\r
181         int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);\r
182             \r
183         if (result == 0) {\r
184             return a.compareTo(b);\r
185         }\r
186             \r
187         return result;\r
188     }\r
189         \r
190     //\r
191     // Straight insertion sort from Knuth vol. III, pg. 81\r
192     //\r
193     private static void sortEquivalents(String[] equivalents, Vector unsorted)\r
194     {\r
195         int nEquivalents = equivalents.length;\r
196         \r
197         for (int e = 0; e < nEquivalents; e += 1) {\r
198             String v = (String) unsorted.elementAt(e);\r
199             int i;\r
200             \r
201             for (i = e - 1; i >= 0; i -= 1) {\r
202                 if (compareEquivalents(v, equivalents[i]) >= 0) {\r
203                   break;\r
204                 }\r
205 \r
206                 equivalents[i + 1] = equivalents[i];\r
207             }\r
208 \r
209             equivalents[i + 1] = v;\r
210        }\r
211     }\r
212             \r
213     private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT];\r
214     private int maxEquivalents[] = new int[UScript.CODE_LIMIT];\r
215 \r
216 }\r