2 *******************************************************************************
\r
3 * Copyright (C) 2006-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.charset;
\r
9 import java.io.DataInputStream;
\r
10 import java.io.IOException;
\r
11 import java.io.InputStream;
\r
13 import com.ibm.icu.impl.ICUBinary;
\r
16 /* Format of cnvalias.icu -----------------------------------------------------
\r
18 * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
\r
19 * This binary form contains several tables. All indexes are to uint16_t
\r
20 * units, and not to the bytes (uint8_t units). Addressing everything on
\r
21 * 16-bit boundaries allows us to store more information with small index
\r
22 * numbers, which are also 16-bit in size. The majority of the table (except
\r
23 * the string table) are 16-bit numbers.
\r
25 * First there is the size of the Table of Contents (TOC). The TOC
\r
26 * entries contain the size of each section. In order to find the offset
\r
27 * you just need to sum up the sizes of the previous sections.
\r
28 * The TOC length and entries are an array of uint32_t values.
\r
29 * The first section after the TOC starts immediately after the TOC.
\r
31 * 1) This section contains a list of converters. This list contains indexes
\r
32 * into the string table for the converter name. The index of this list is
\r
33 * also used by other sections, which are mentioned later on.
\r
34 * This list is not sorted.
\r
36 * 2) This section contains a list of tags. This list contains indexes
\r
37 * into the string table for the tag name. The index of this list is
\r
38 * also used by other sections, which are mentioned later on.
\r
39 * This list is in priority order of standards.
\r
41 * 3) This section contains a list of sorted unique aliases. This
\r
42 * list contains indexes into the string table for the alias name. The
\r
43 * index of this list is also used by other sections, like the 4th section.
\r
44 * The index for the 3rd and 4th section is used to get the
\r
45 * alias -> converter name mapping. Section 3 and 4 form a two column table.
\r
47 * 4) This section contains a list of mapped converter names. Consider this
\r
48 * as a table that maps the 3rd section to the 1st section. This list contains
\r
49 * indexes into the 1st section. The index of this list is the same index in
\r
50 * the 3rd section. There is also some extra information in the high bits of
\r
51 * each converter index in this table. Currently it's only used to say that
\r
52 * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
\r
53 * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
\r
54 * the predigested form of the 5th section so that an alias lookup can be fast.
\r
56 * 5) This section contains a 2D array with indexes to the 6th section. This
\r
57 * section is the full form of all alias mappings. The column index is the
\r
58 * index into the converter list (column header). The row index is the index
\r
59 * into the tag list (row header). This 2D array is the top part of a 3D array. The
\r
60 * third dimension is in the 6th section.
\r
62 * 6) This is a blob of variable-length arrays. Each array starts with a size,
\r
63 * and is followed by indexes to alias names in the string table. This is
\r
64 * the third dimension of section 5. No other section should be referencing this section.
\r
67 * 7) Reserved at this time (There is no information). This _usually_ has a
\r
68 * size of 0. Future versions may add more information here.
\r
70 * 8) This is the string table. All strings are indexed on an even address.
\r
71 * There are two reasons for this. First many chip architectures locate strings
\r
72 * faster on even address boundaries. Second, since all indexes are 16-bit
\r
73 * numbers, this string table can be 128KB in size instead of 64KB when we
\r
74 * only have strings starting on an even address.
\r
77 * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
\r
78 * has a unique alias among all converters. That same alias can
\r
79 * be mentioned in other standards on different converters,
\r
80 * but only one alias per tag can be unique.
\r
83 * Converter Names (Usually in TR22 form)
\r
84 * -------------------------------------------.
\r
90 * ------------------------------------------/ |
\r
98 * -------------------------------------------
\r
102 * Here is what it really looks like. It's like swiss cheese.
\r
103 * There are holes. Some converters aren't recognized by
\r
104 * a standard, or they are really old converters that the
\r
105 * standard doesn't recognize anymore.
\r
107 * Converter Names (Usually in TR22 form)
\r
108 * -------------------------------------------.
\r
109 * T /##########################################/|
\r
111 * g / # ## ## ### # ### ### ### #/
\r
112 * s / # ##### #### ## ## #/#
\r
113 * / ### # # ## # # # ### # # #/##
\r
114 * ------------------------------------------/# #
\r
115 * A |### # # ## # # # ### # # #|# #
\r
116 * l |# # # # # ## # #|# #
\r
117 * i |# # # # # # #|#
\r
125 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
\r
126 // private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
\r
129 * <p>Protected constructor.</p>
\r
130 * @param inputStream ICU uprop.dat file input stream
\r
131 * @exception IOException throw if data file fails authentication
\r
133 protected UConverterAliasDataReader(InputStream inputStream)
\r
134 throws IOException{
\r
135 //if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
\r
137 /*unicodeVersion = */ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
\r
139 //if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
\r
141 dataInputStream = new DataInputStream(inputStream);
\r
143 //if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
\r
146 // protected methods -------------------------------------------------
\r
148 protected int[] readToc(int n)throws IOException
\r
150 int[] toc = new int[n];
\r
152 for (int i = 0; i < n ; ++i) {
\r
153 toc[i] = dataInputStream.readInt() & UNSIGNED_INT_MASK;
\r
158 protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray, int[] taggedAliasArray, int[] taggedAliasLists, int[] optionTable, byte[] stringTable, byte[] normalizedStringTable) throws IOException{
\r
163 for(i = 0; i < convList.length; ++i)
\r
164 convList[i] = dataInputStream.readUnsignedShort();
\r
166 for(i = 0; i < tagList.length; ++i)
\r
167 tagList[i] = dataInputStream.readUnsignedShort();
\r
169 for(i = 0; i < aliasList.length; ++i)
\r
170 aliasList[i] = dataInputStream.readUnsignedShort();
\r
172 for(i = 0; i < untaggedConvArray.length; ++i)
\r
173 untaggedConvArray[i] = dataInputStream.readUnsignedShort();
\r
175 for(i = 0; i < taggedAliasArray.length; ++i)
\r
176 taggedAliasArray[i] = dataInputStream.readUnsignedShort();
\r
178 for(i = 0; i < taggedAliasLists.length; ++i)
\r
179 taggedAliasLists[i] = dataInputStream.readUnsignedShort();
\r
181 for(i = 0; i < optionTable.length; ++i)
\r
182 optionTable[i] = dataInputStream.readUnsignedShort();
\r
184 dataInputStream.readFully(stringTable);
\r
185 dataInputStream.readFully(normalizedStringTable);
\r
188 public boolean isDataVersionAcceptable(byte version[])
\r
190 return version.length >= DATA_FORMAT_VERSION.length
\r
191 && version[0] == DATA_FORMAT_VERSION[0]
\r
192 && version[1] == DATA_FORMAT_VERSION[1]
\r
193 && version[2] == DATA_FORMAT_VERSION[2];
\r
196 /*byte[] getUnicodeVersion(){
\r
197 return unicodeVersion;
\r
199 // private data members -------------------------------------------------
\r
203 * ICU data file input stream
\r
205 private DataInputStream dataInputStream;
\r
207 // private byte[] unicodeVersion;
\r
210 * File format version that this class understands.
\r
211 * No guarantees are made if a older version is used
\r
212 * see store.c of gennorm for more information and values
\r
214 // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
\r
215 private static final byte DATA_FORMAT_ID[] = {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c}; // dataFormat="CvAl"
\r
216 private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
\r
218 //private static final int UNSIGNED_SHORT_MASK = 0xffff;
\r
219 private static final int UNSIGNED_INT_MASK = 0xffffffff;
\r