/* ******************************************************************************* * Copyright (C) 2006-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.charset; import java.io.DataInputStream; import java.io.IOException; import java.io.InputStream; import com.ibm.icu.impl.ICUBinary; /* Format of cnvalias.icu ----------------------------------------------------- * * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. * This binary form contains several tables. All indexes are to uint16_t * units, and not to the bytes (uint8_t units). Addressing everything on * 16-bit boundaries allows us to store more information with small index * numbers, which are also 16-bit in size. The majority of the table (except * the string table) are 16-bit numbers. * * First there is the size of the Table of Contents (TOC). The TOC * entries contain the size of each section. In order to find the offset * you just need to sum up the previous offsets. * The TOC length and entries are an array of uint32_t values. * The first section after the TOC starts immediately after the TOC. * * 1) This section contains a list of converters. This list contains indexes * into the string table for the converter name. The index of this list is * also used by other sections, which are mentioned later on. * This list is not sorted. * * 2) This section contains a list of tags. This list contains indexes * into the string table for the tag name. The index of this list is * also used by other sections, which are mentioned later on. * This list is in priority order of standards. * * 3) This section contains a list of sorted unique aliases. This * list contains indexes into the string table for the alias name. The * index of this list is also used by other sections, like the 4th section. 
* The index for the 3rd and 4th section is used to get the
 * alias -> converter name mapping. Section 3 and 4 form a two column table.
 *
 * 4) This section contains a list of mapped converter names. Consider this
 * as a table that maps the 3rd section to the 1st section. This list contains
 * indexes into the 1st section. The index of this list is the same index in
 * the 3rd section. There is also some extra information in the high bits of
 * each converter index in this table. Currently it's only used to say that
 * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
 * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
 * the predigested form of the 5th section so that an alias lookup can be fast.
 *
 * 5) This section contains a 2D array with indexes to the 6th section. This
 * section is the full form of all alias mappings. The column index is the
 * index into the converter list (column header). The row index is the index
 * to tag list (row header). This 2D array is the top part of a 3D array. The
 * third dimension is in the 6th section.
 *
 * 6) This is a blob of variable length arrays. Each array starts with a size,
 * and is followed by indexes to alias names in the string table. This is
 * the third dimension to the section 5. No other section should be referencing
 * this section.
 *
 * 7) Reserved at this time (There is no information). This _usually_ has a
 * size of 0. Future versions may add more information here.
 *
 * 8) This is the string table. All strings are indexed on an even address.
 * There are two reasons for this. First many chip architectures locate strings
 * faster on even address boundaries. Second, since all indexes are 16-bit
 * numbers, this string table can be 128KB in size instead of 64KB when we
 * only have strings starting on an even address.
 *
 *
 * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
 * has a unique alias among all converters.
That same alias can
 * be mentioned in other standards on different converters,
 * but only one alias per tag can be unique.
 *
 *
 *              Converter Names (Usually in TR22 form)
 *           -------------------------------------------.
 *     T    /                                          /|
 *     a   /                                          / |
 *     g  /                                          /  |
 *     s /                                          /   |
 *      /                                          /    |
 *      ------------------------------------------/     |
 *    A |                                         |     |
 *    l |                                         |     |
 *    i |                                         |    /
 *    a |                                         |   /
 *    s |                                         |  /
 *    e |                                         | /
 *    s |                                         |/
 *      -------------------------------------------
 *
 *
 *
 * Here is what it really looks like. It's like swiss cheese.
 * There are holes. Some converters aren't recognized by
 * a standard, or they are really old converters that the
 * standard doesn't recognize anymore.
 *
 *              Converter Names (Usually in TR22 form)
 *           -------------------------------------------.
 *     T    /##########################################/|
 *     a   /     #            #                       /#
 *     g  /  #      ##     ##     ### # ### ### ### #/
 *     s / #             #####  ####        ##  ## #/#
 *      / ### # # ##  #  #   #          ### # #   #/##
 *      ------------------------------------------/# #
 *    A |### # # ##  #  #   #          ### # #   #|# #
 *    l |# # #    #     #               ## #     #|# #
 *    i |# # #    #     #                #       #|#
 *    a |#                                       #|#
 *    s |                                        #|#
 *    e
 *    s
 *
 */

/**
 * Reads the binary converter-alias data (data format "CvAl") from an input
 * stream: first the ICU data header, then the table of contents, then the
 * individual 16-bit tables and the two string tables.
 */
final class UConverterAliasDataReader implements ICUBinary.Authenticate {

    /**
     * Protected constructor. Hands the stream to
     * {@code ICUBinary.readHeader} together with the expected data format id
     * and this object as the version authenticator, then wraps the stream so
     * that the remaining content can be read as big-endian primitives.
     *
     * @param inputStream ICU cnvalias.icu file input stream
     * @exception IOException if {@code ICUBinary.readHeader} fails to read
     *            or rejects the header
     */
    protected UConverterAliasDataReader(InputStream inputStream) throws IOException {
        ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
        dataInputStream = new DataInputStream(inputStream);
    }

    // protected methods -------------------------------------------------

    /**
     * Reads {@code n} consecutive 32-bit table-of-contents entries.
     *
     * <p>The entries are unsigned 32-bit values in the file. A Java
     * {@code int} cannot represent values of 2^31 or more, and masking an
     * {@code int} with {@code 0xffffffff} does not change it, so such an
     * entry would otherwise surface as a negative number. Those entries are
     * rejected here instead of being returned as negative values.
     *
     * @param n number of entries to read
     * @return the entries, in file order; every element is non-negative
     * @exception IOException if the stream cannot supply {@code n} entries
     *            or an entry does not fit in a non-negative {@code int}
     */
    protected int[] readToc(int n) throws IOException {
        int[] toc = new int[n];
        for (int i = 0; i < n; ++i) {
            int entry = dataInputStream.readInt();
            if (entry < 0) {
                // Widen with a long mask so the message shows the unsigned value.
                throw new IOException("Invalid data format: TOC entry " + i
                        + " is out of range (" + (entry & 0xffffffffL) + ")");
            }
            toc[i] = entry;
        }
        return toc;
    }

    /**
     * Fills the supplied, pre-sized arrays from the stream. The arrays are
     * filled strictly in parameter order, because that is the order in which
     * the sections are consumed from the stream; each {@code int[]} receives
     * one unsigned 16-bit value per element and each {@code byte[]} is filled
     * completely.
     *
     * @param convList              destination for the converter list
     * @param tagList               destination for the tag list
     * @param aliasList             destination for the alias list
     * @param untaggedConvArray     destination for the untagged converter array
     * @param taggedAliasArray      destination for the tagged alias array
     * @param taggedAliasLists      destination for the tagged alias lists
     * @param optionTable           destination for the option table
     * @param stringTable           destination for the string table bytes
     * @param normalizedStringTable destination for the normalized string table bytes
     * @exception IOException if the stream ends early or cannot be read
     */
    protected void read(int[] convList, int[] tagList, int[] aliasList, int[]untaggedConvArray,
                        int[] taggedAliasArray, int[] taggedAliasLists, int[] optionTable,
                        byte[] stringTable, byte[] normalizedStringTable) throws IOException {
        readUnsignedShorts(convList);
        readUnsignedShorts(tagList);
        readUnsignedShorts(aliasList);
        readUnsignedShorts(untaggedConvArray);
        readUnsignedShorts(taggedAliasArray);
        readUnsignedShorts(taggedAliasLists);
        readUnsignedShorts(optionTable);

        dataInputStream.readFully(stringTable);
        dataInputStream.readFully(normalizedStringTable);
    }

    /**
     * Fills every element of {@code dest} with the next unsigned 16-bit value
     * from the stream.
     */
    private void readUnsignedShorts(int[] dest) throws IOException {
        for (int i = 0; i < dest.length; ++i) {
            dest[i] = dataInputStream.readUnsignedShort();
        }
    }

    /**
     * Accepts a data version only when it has at least as many components as
     * {@link #DATA_FORMAT_VERSION} and its first three components are equal
     * to it.
     *
     * @param version data version read from the file header
     * @return true if the version matches the format this class understands
     */
    public boolean isDataVersionAcceptable(byte[] version) {
        return version.length >= DATA_FORMAT_VERSION.length
                && version[0] == DATA_FORMAT_VERSION[0]
                && version[1] == DATA_FORMAT_VERSION[1]
                && version[2] == DATA_FORMAT_VERSION[2];
    }

    // private data members -------------------------------------------------

    /**
     * ICU data file input stream, positioned after the data header.
     */
    private final DataInputStream dataInputStream;

    /**
     * Data format id expected in the header: the bytes of "CvAl".
     * DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
     */
    private static final byte[] DATA_FORMAT_ID =
            {(byte)0x43, (byte)0x76, (byte)0x41, (byte)0x6c};

    /**
     * File format version that this class understands.
     * No guarantees are made if an older version is used.
     * NOTE(review): the previous comment pointed at "store.c of gennorm",
     * which appears to describe a different data file - confirm the right
     * reference for the alias table format.
     */
    private static final byte[] DATA_FORMAT_VERSION = {3, 0, 1};
}