2 *******************************************************************************
\r
3 * Copyright (C) 2006-2008, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.charset;
\r
11 import com.ibm.icu.impl.ICUBinary;
\r
14 /* Format of cnvalias.icu -----------------------------------------------------
\r
16 * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
\r
17 * This binary form contains several tables. All indexes are to uint16_t
\r
18 * units, and not to the bytes (uint8_t units). Addressing everything on
\r
19 * 16-bit boundaries allows us to store more information with small index
\r
20 * numbers, which are also 16-bit in size. The majority of the table (except
\r
21 * the string table) are 16-bit numbers.
\r
23 * First there is the size of the Table of Contents (TOC). The TOC
\r
24 * entries contain the size of each section. In order to find the offset
\r
25 * of a section you just need to sum up the sizes of the sections before it.
\r
26 * The TOC length and entries are an array of uint32_t values.
\r
27 * The first section after the TOC starts immediately after the TOC.
\r
29 * 1) This section contains a list of converters. This list contains indexes
\r
30 * into the string table for the converter name. The index of this list is
\r
31 * also used by other sections, which are mentioned later on.
\r
32 * This list is not sorted.
\r
34 * 2) This section contains a list of tags. This list contains indexes
\r
35 * into the string table for the tag name. The index of this list is
\r
36 * also used by other sections, which are mentioned later on.
\r
37 * This list is in priority order of standards.
\r
39 * 3) This section contains a list of sorted unique aliases. This
\r
40 * list contains indexes into the string table for the alias name. The
\r
41 * index of this list is also used by other sections, like the 4th section.
\r
42 * The index for the 3rd and 4th section is used to get the
\r
43 * alias -> converter name mapping. Section 3 and 4 form a two column table.
\r
45 * 4) This section contains a list of mapped converter names. Consider this
\r
46 * as a table that maps the 3rd section to the 1st section. This list contains
\r
47 * indexes into the 1st section. The index of this list is the same index in
\r
48 * the 3rd section. There is also some extra information in the high bits of
\r
49 * each converter index in this table. Currently it's only used to say that
\r
50 * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
\r
51 * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
\r
52 * the predigested form of the 5th section so that an alias lookup can be fast.
\r
54 * 5) This section contains a 2D array with indexes to the 6th section. This
\r
55 * section is the full form of all alias mappings. The column index is the
\r
56 * index into the converter list (column header). The row index is the index
\r
57 * to tag list (row header). This 2D array is the top part of a 3D array. The
\r
58 * third dimension is in the 6th section.
\r
60 * 6) This is a blob of variable-length arrays. Each array starts with a size,
\r
61 * and is followed by indexes to alias names in the string table. This is
\r
62 * the third dimension of section 5. No other section should be referencing this section.
\r
65 * 7) Reserved at this time (There is no information). This _usually_ has a
\r
66 * size of 0. Future versions may add more information here.
\r
68 * 8) This is the string table. All strings are indexed on an even address.
\r
69 * There are two reasons for this. First many chip architectures locate strings
\r
70 * faster on even address boundaries. Second, since all indexes are 16-bit
\r
71 * numbers, this string table can be 128KB in size instead of 64KB when we
\r
72 * only have strings starting on an even address.
\r
75 * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
\r
76 * has a unique alias among all converters. That same alias can
\r
77 * be mentioned in other standards on different converters,
\r
78 * but only one alias per tag can be unique.
\r
81 * Converter Names (Usually in TR22 form)
\r
82 * -------------------------------------------.
\r
88 * ------------------------------------------/ |
\r
96 * -------------------------------------------
\r
100 * Here is what it really looks like. It's like swiss cheese.
\r
101 * There are holes. Some converters aren't recognized by
\r
102 * a standard, or they are really old converters that the
\r
103 * standard doesn't recognize anymore.
\r
105 * Converter Names (Usually in TR22 form)
\r
106 * -------------------------------------------.
\r
107 * T /##########################################/|
\r
109 * g / # ## ## ### # ### ### ### #/
\r
110 * s / # ##### #### ## ## #/#
\r
111 * / ### # # ## # # # ### # # #/##
\r
112 * ------------------------------------------/# #
\r
113 * A |### # # ## # # # ### # # #|# #
\r
114 * l |# # # # # ## # #|# #
\r
115 * i |# # # # # # #|#
\r
123 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
\r
124 // private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
\r
127 * <p>Protected constructor.</p>
\r
128 * @param inputStream ICU converter alias data (cnvalias.icu) input stream
\r
129 * @exception IOException thrown if the data file fails authentication
\r
protected UConverterAliasDataReader(InputStream inputStream) throws IOException {
    // Check the ICU data header before anything else is read. This object is
    // passed as the ICUBinary.Authenticate callback, so isDataVersionAcceptable()
    // decides whether the format version is usable. The value returned by
    // readHeader is intentionally not kept.
    ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);

    // Every later table read goes through this wrapper around the same stream.
    this.dataInputStream = new DataInputStream(inputStream);
}
\r
144 // protected methods -------------------------------------------------
\r
146 protected int[] readToc(int n)throws IOException
\r
148 int[] toc = new int[n];
\r
150 for (int i = 0; i < n ; ++i) {
\r
151 toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
\r
156 protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, int[] optionTable, byte[] stringTable, byte[] normalizedStringTable) throws IOException{
\r
161 for(i = 0; i < convList.length; ++i)
\r
162 convList[i] = dataInputStream.readUnsignedShort();
\r
164 for(i = 0; i < tagList.length; ++i)
\r
165 tagList[i] = dataInputStream.readUnsignedShort();
\r
167 for(i = 0; i < aliasList.length; ++i)
\r
168 aliasList[i] = dataInputStream.readUnsignedShort();
\r
170 for(i = 0; i < untaggedConvArray.length; ++i)
\r
171 untaggedConvArray[i] = dataInputStream.readUnsignedShort();
\r
173 for(i = 0; i < taggedAliasArray.length; ++i)
\r
174 taggedAliasArray[i] = dataInputStream.readUnsignedShort();
\r
176 for(i = 0; i < taggedAliasLists.length; ++i)
\r
177 taggedAliasLists[i] = dataInputStream.readUnsignedShort();
\r
179 for(i = 0; i < optionTable.length; ++i)
\r
180 optionTable[i] = dataInputStream.readUnsignedShort();
\r
182 dataInputStream.readFully(stringTable);
\r
183 dataInputStream.readFully(normalizedStringTable);
\r
186 public boolean isDataVersionAcceptable(byte version[])
\r
188 return version.length >= DATA_FORMAT_VERSION.length
\r
189 && version[0] == DATA_FORMAT_VERSION[0]
\r
190 && version[1] == DATA_FORMAT_VERSION[1]
\r
191 && version[2] == DATA_FORMAT_VERSION[2];
\r
194 /*byte[] getUnicodeVersion(){
\r
195 return unicodeVersion;
\r
197 // private data members -------------------------------------------------
\r
201 * ICU data file input stream
\r
203 private DataInputStream dataInputStream;
\r
205 // private byte[] unicodeVersion;
\r
208 * File format version that this class understands.
\r
209 * No guarantees are made if a older version is used
\r
210 * see store.c of gennorm for more information and values
\r
212 // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
\r
213 private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"
\r
214 private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
\r
216 //private static final int UNSIGNED_SHORT_MASK = 0xffff;
\r
217 private static final int UNSIGNED_INT_MASK = 0xffffffff;
\r