]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/tools/misc/src/com/ibm/icu/dev/tool/layout/CanonicalCharacterData.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / tools / misc / src / com / ibm / icu / dev / tool / layout / CanonicalCharacterData.java
1 /**\r
2  *******************************************************************************\r
3  * Copyright (C) 2002-2010, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 \r
8 package com.ibm.icu.dev.tool.layout;\r
9 \r
10 import java.util.Vector;\r
11 \r
12 import com.ibm.icu.impl.Utility;\r
13 import com.ibm.icu.lang.UCharacter;\r
14 import com.ibm.icu.lang.UScript;\r
15 import com.ibm.icu.text.CanonicalIterator;\r
16 import com.ibm.icu.text.UTF16;\r
17 import com.ibm.icu.text.UnicodeSet;\r
18 \r
19 public class CanonicalCharacterData\r
20 {\r
21     private static int THRESHOLD = 4;\r
22     \r
23     public class Record\r
24     {\r
25         // TODO: might want to save arrays of Char32's rather than UTF16 strings...\r
26         Record(int character, int script)\r
27         {\r
28             String char32 = UCharacter.toString(character);\r
29             CanonicalIterator iterator = new CanonicalIterator(char32);\r
30             Vector equivs = new Vector();\r
31             \r
32             composed = character;\r
33             \r
34             for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) {\r
35                 // Skip all equivalents of length 1; it's either the original\r
36                 // characeter or something like Angstrom for A-Ring, which we don't care about\r
37                 if (UTF16.countCodePoint(equiv) > 1) {\r
38                     equivs.add(equiv);\r
39                 }\r
40             }\r
41             \r
42             int nEquivalents = equivs.size();\r
43             \r
44             if (nEquivalents > maxEquivalents[script]) {\r
45                 maxEquivalents[script] = nEquivalents;\r
46             }\r
47             \r
48             if (nEquivalents > 0) {\r
49                 equivalents = new String[nEquivalents];\r
50                 \r
51                 if (nEquivalents > THRESHOLD) {\r
52                     dumpEquivalents(character, equivs);\r
53                 }\r
54                 \r
55                 sortEquivalents(equivalents, equivs);\r
56             }\r
57         }\r
58         \r
59         public int getComposedCharacter()\r
60         {\r
61             return composed;\r
62         }\r
63         \r
64         public int countEquivalents()\r
65         {\r
66             if (equivalents == null) {\r
67                 return 0;\r
68             }\r
69             \r
70             return equivalents.length;\r
71         }\r
72         \r
73         public String[] getEquivalents()\r
74         {\r
75             return equivalents;\r
76         }\r
77         \r
78         public String getEquivalent(int index)\r
79         {\r
80             if (equivalents == null || index < 0 || index >= equivalents.length) {\r
81                 return null;\r
82             }\r
83             \r
84             return equivalents[index];\r
85         }\r
86         \r
87         private void dumpEquivalents(int character, Vector equivs)\r
88         {\r
89             int count = equivs.size();\r
90             \r
91             System.out.println(Utility.hex(character, 6) + " - " + count + ":");\r
92             \r
93             for (int i = 0; i < count; i += 1) {\r
94                 String equiv = (String) equivs.elementAt(i);\r
95                 int codePoints = UTF16.countCodePoint(equiv);\r
96                 \r
97                 for (int c = 0; c < codePoints; c += 1) {\r
98                     if (c > 0) {\r
99                         System.out.print(" ");\r
100                     }\r
101                     \r
102                     System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6));\r
103                 }\r
104                 \r
105                 System.out.println();\r
106             }\r
107             \r
108             System.out.println();\r
109         }\r
110         \r
111         private int composed;\r
112         private String[] equivalents = null;\r
113     }\r
114     \r
115     public CanonicalCharacterData()\r
116     {\r
117         // nothing to do...\r
118     }\r
119     \r
120     public void add(int character)\r
121     {\r
122         int script = UScript.getScript(character);\r
123         Vector recordVector = recordVectors[script];\r
124         \r
125         if (recordVector == null) {\r
126             recordVector = recordVectors[script] = new Vector();\r
127         }\r
128         \r
129         recordVector.add(new Record(character, script));\r
130     }\r
131     \r
132     public int getMaxEquivalents(int script)\r
133     {\r
134         if (script < 0 || script >= UScript.CODE_LIMIT) {\r
135             return 0;\r
136         }\r
137         \r
138         return maxEquivalents[script];\r
139     }\r
140     \r
141     public Record getRecord(int script, int index)\r
142     {\r
143         if (script < 0 || script >= UScript.CODE_LIMIT) {\r
144             return null;\r
145         }\r
146         \r
147         Vector recordVector = recordVectors[script];\r
148         \r
149         if (recordVector == null || index < 0 || index >= recordVector.size()) {\r
150             return null;\r
151         }\r
152         \r
153         return (Record) recordVector.elementAt(index);\r
154     }\r
155     \r
156     public int countRecords(int script)\r
157     {\r
158         if (script < 0 || script >= UScript.CODE_LIMIT ||\r
159             recordVectors[script] == null) {\r
160             return 0;\r
161         }\r
162         \r
163         return recordVectors[script].size();\r
164     }\r
165  \r
166     public static CanonicalCharacterData factory(UnicodeSet characterSet)\r
167     {\r
168         int charCount = characterSet.size();\r
169         CanonicalCharacterData data = new CanonicalCharacterData();\r
170         \r
171         System.out.println("There are " + charCount + " characters with a canonical decomposition.");\r
172         \r
173         for (int i = 0; i < charCount; i += 1) {\r
174             data.add(characterSet.charAt(i));\r
175         }\r
176         \r
177         return data;\r
178     }\r
179 \r
180     private static int compareEquivalents(String a, String b)\r
181     {\r
182         int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);\r
183             \r
184         if (result == 0) {\r
185             return a.compareTo(b);\r
186         }\r
187             \r
188         return result;\r
189     }\r
190         \r
191     //\r
192     // Straight insertion sort from Knuth vol. III, pg. 81\r
193     //\r
194     private static void sortEquivalents(String[] equivalents, Vector unsorted)\r
195     {\r
196         int nEquivalents = equivalents.length;\r
197         \r
198         for (int e = 0; e < nEquivalents; e += 1) {\r
199             String v = (String) unsorted.elementAt(e);\r
200             int i;\r
201             \r
202             for (i = e - 1; i >= 0; i -= 1) {\r
203                 if (compareEquivalents(v, equivalents[i]) >= 0) {\r
204                   break;\r
205                 }\r
206 \r
207                 equivalents[i + 1] = equivalents[i];\r
208             }\r
209 \r
210             equivalents[i + 1] = v;\r
211        }\r
212     }\r
213             \r
214     private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT];\r
215     private int maxEquivalents[] = new int[UScript.CODE_LIMIT];\r
216 \r
217 }\r