2 *******************************************************************************
\r
3 * Copyright (C) 2003-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.BufferedInputStream;
\r
10 import java.io.ByteArrayInputStream;
\r
11 import java.io.IOException;
\r
12 import java.io.InputStream;
\r
13 import java.lang.ref.WeakReference;
\r
15 import com.ibm.icu.impl.CharTrie;
\r
16 import com.ibm.icu.impl.ICUData;
\r
17 import com.ibm.icu.impl.ICUResourceBundle;
\r
18 import com.ibm.icu.impl.StringPrepDataReader;
\r
19 import com.ibm.icu.impl.UBiDiProps;
\r
20 import com.ibm.icu.lang.UCharacter;
\r
21 import com.ibm.icu.lang.UCharacterDirection;
\r
22 import com.ibm.icu.util.VersionInfo;
\r
25 * StringPrep API implements the StringPrep framework as described by
\r
26 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
\r
27 * StringPrep prepares Unicode strings for use in network protocols.
\r
28 * Profiles of StringPrep are sets of rules and data according to which the
\r
29 * Unicode Strings are prepared. Each profile contains tables which describe
\r
30 * how a code point should be treated. The tables are broadly classified into
\r
32 * <li> Unassigned Table: Contains code points that are unassigned
\r
33 * in the Unicode Version supported by StringPrep. Currently
\r
34 * RFC 3454 supports Unicode 3.2. </li>
\r
35 * <li> Prohibited Table: Contains code points that are prohibited from
\r
36 * the output of the StringPrep processing function. </li>
\r
37 * <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
\r
40 * The procedure for preparing Unicode strings:
\r
42 * <li> Map: For each character in the input, check if it has a mapping
\r
43 * and, if so, replace it with its mapping. </li>
\r
44 * <li> Normalize: Possibly normalize the result of step 1 using Unicode
\r
45 * normalization. </li>
\r
46 * <li> Prohibit: Check for any characters that are not allowed in the
\r
47 * output. If any are found, return an error.</li>
\r
48 * <li> Check bidi: Possibly check for right-to-left characters, and if
\r
49 * any are found, make sure that the whole string satisfies the
\r
50 * requirements for bidirectional strings. If the string does not
\r
51 * satisfy the requirements for bidirectional strings, return an
\r
54 * @author Ram Viswanadha
\r
57 public final class StringPrep {
\r
59 * Option to prohibit processing of unassigned code points in the input
\r
64 public static final int DEFAULT = 0x0000;
\r
67 * Option to allow processing of unassigned code points in the input
\r
72 public static final int ALLOW_UNASSIGNED = 0x0001;
\r
75 * Profile type: RFC3491 Nameprep
\r
76 * @see #getInstance(int)
\r
79 public static final int RFC3491_NAMEPREP = 0;
\r
82 * Profile type: RFC3530 nfs4_cs_prep
\r
83 * @see #getInstance(int)
\r
86 public static final int RFC3530_NFS4_CS_PREP = 1;
\r
89 * Profile type: RFC3530 nfs4_cs_prep with case insensitive option
\r
90 * @see #getInstance(int)
\r
93 public static final int RFC3530_NFS4_CS_PREP_CI = 2;
\r
96 * Profile type: RFC3530 nfs4_cis_prep
\r
97 * @see #getInstance(int)
\r
100 public static final int RFC3530_NFS4_CIS_PREP = 3;
\r
103 * Profile type: RFC3530 nfs4_mixed_prep for prefix
\r
104 * @see #getInstance(int)
\r
107 public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4;
\r
110 * Profile type: RFC3530 nfs4_mixed_prep for suffix
\r
111 * @see #getInstance(int)
\r
114 public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5;
\r
117 * Profile type: RFC3722 iSCSI
\r
118 * @see #getInstance(int)
\r
121 public static final int RFC3722_ISCSI = 6;
\r
124 * Profile type: RFC3920 XMPP Nodeprep
\r
125 * @see #getInstance(int)
\r
128 public static final int RFC3920_NODEPREP = 7;
\r
131 * Profile type: RFC3920 XMPP Resourceprep
\r
132 * @see #getInstance(int)
\r
135 public static final int RFC3920_RESOURCEPREP = 8;
\r
138 * Profile type: RFC4011 Policy MIB Stringprep
\r
139 * @see #getInstance(int)
\r
142 public static final int RFC4011_MIB = 9;
\r
145 * Profile type: RFC4013 SASLprep
\r
146 * @see #getInstance(int)
\r
149 public static final int RFC4013_SASLPREP = 10;
\r
152 * Profile type: RFC4505 trace
\r
153 * @see #getInstance(int)
\r
156 public static final int RFC4505_TRACE = 11;
\r
159 * Profile type: RFC4518 LDAP
\r
160 * @see #getInstance(int)
\r
163 public static final int RFC4518_LDAP = 12;
\r
166 * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix
\r
168 * @see #getInstance(int)
\r
171 public static final int RFC4518_LDAP_CI = 13;
\r
173 // Last available profile
\r
174 private static final int MAX_PROFILE = RFC4518_LDAP_CI;
\r
176 // Profile names must be aligned to profile type definitions
\r
177 private static final String[] PROFILE_NAMES = {
\r
178 "rfc3491", /* RFC3491_NAMEPREP */
\r
179 "rfc3530cs", /* RFC3530_NFS4_CS_PREP */
\r
180 "rfc3530csci", /* RFC3530_NFS4_CS_PREP_CI */
\r
181 "rfc3491", /* RFC3530_NFS4_CIS_PREP */
\r
182 "rfc3530mixp", /* RFC3530_NFS4_MIXED_PREP_PREFIX */
\r
183 "rfc3491", /* RFC3530_NFS4_MIXED_PREP_SUFFIX */
\r
184 "rfc3722", /* RFC3722_ISCSI */
\r
185 "rfc3920node", /* RFC3920_NODEPREP */
\r
186 "rfc3920res", /* RFC3920_RESOURCEPREP */
\r
187 "rfc4011", /* RFC4011_MIB */
\r
188 "rfc4013", /* RFC4013_SASLPREP */
\r
189 "rfc4505", /* RFC4505_TRACE */
\r
190 "rfc4518", /* RFC4518_LDAP */
\r
191 "rfc4518ci", /* RFC4518_LDAP_CI */
\r
194 @SuppressWarnings("unchecked")
\r
195 private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1];
\r
197 private static final int UNASSIGNED = 0x0000;
\r
198 private static final int MAP = 0x0001;
\r
199 private static final int PROHIBITED = 0x0002;
\r
200 private static final int DELETE = 0x0003;
\r
201 private static final int TYPE_LIMIT = 0x0004;
\r
203 private static final int NORMALIZATION_ON = 0x0001;
\r
204 private static final int CHECK_BIDI_ON = 0x0002;
\r
206 private static final int TYPE_THRESHOLD = 0xFFF0;
\r
207 private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
\r
208 //private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
\r
210 /* indexes[] value names */
\r
211 private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
\r
212 private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
\r
213 private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
\r
214 private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
\r
215 private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
\r
216 private static final int THREE_UCHARS_MAPPING_INDEX_START = 5;
\r
217 private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6;
\r
218 private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
\r
219 private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
\r
223 * Default buffer size of datafile
\r
225 private static final int DATA_BUFFER_SIZE = 25000;
\r
227 // CharTrie implementation for reading the trie data
\r
228 private CharTrie sprepTrie;
\r
229 // Indexes read from the data file
\r
230 private int[] indexes;
\r
231 // mapping data read from the data file
\r
232 private char[] mappingData;
\r
233 // format version of the data file
\r
234 //private byte[] formatVersion;
\r
235 // the version of Unicode supported by the data file
\r
236 private VersionInfo sprepUniVer;
\r
237 // the Unicode version of last entry in the
\r
238 // NormalizationCorrections.txt file if normalization
\r
240 private VersionInfo normCorrVer;
\r
241 // Option to turn on Normalization
\r
242 private boolean doNFKC;
\r
243 // Option to turn on checking for BiDi rules
\r
244 private boolean checkBiDi;
\r
246 private UBiDiProps bdp;
\r
248 private char getCodePointValue(int ch){
\r
249 return sprepTrie.getCodePointValue(ch);
\r
252 private static VersionInfo getVersionInfo(int comp){
\r
253 int micro = comp & 0xFF;
\r
254 int milli =(comp >> 8) & 0xFF;
\r
255 int minor =(comp >> 16) & 0xFF;
\r
256 int major =(comp >> 24) & 0xFF;
\r
257 return VersionInfo.getInstance(major,minor,milli,micro);
\r
259 private static VersionInfo getVersionInfo(byte[] version){
\r
260 if(version.length != 4){
\r
263 return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
\r
266 * Creates a StringPrep object after reading the input stream.
\r
267 * The object does not hold a reference to the input stream, so the stream can be
\r
268 * closed after the method returns.
\r
270 * @param inputStream The stream for reading the StringPrep profile binary data
\r
271 * @throws IOException If reading the input stream fails or the profile's normalization correction version is not supported
\r
274 public StringPrep(InputStream inputStream) throws IOException{
\r
276 BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
\r
278 StringPrepDataReader reader = new StringPrepDataReader(b);
\r
280 // read the indexes
\r
281 indexes = reader.readIndexes(INDEX_TOP);
\r
283 byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
\r
286 //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
\r
287 mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
\r
288 // load the rest of the data and initialize the data members
\r
289 reader.read(sprepBytes,mappingData);
\r
291 sprepTrie = new CharTrie(new ByteArrayInputStream(sprepBytes), null);
\r
293 // get the data format version
\r
294 /*formatVersion = */reader.getDataFormatVersion();
\r
297 doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
\r
298 checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
\r
299 sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
\r
300 normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
\r
301 VersionInfo normUniVer = UCharacter.getUnicodeVersion();
\r
302 if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Version of the normalization data */
\r
303 normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Version of the normalization data */
\r
304 ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
\r
306 throw new IOException("Normalization Correction version not supported");
\r
311 bdp=UBiDiProps.getSingleton();
\r
316 * Gets a StringPrep instance for the specified profile
\r
318 * @param profile The profile passed to find the StringPrep instance.
\r
321 public static StringPrep getInstance(int profile) {
\r
322 if (profile < 0 || profile > MAX_PROFILE) {
\r
323 throw new IllegalArgumentException("Bad profile type");
\r
326 StringPrep instance = null;
\r
328 // A StringPrep instance is immutable. We use a single instance
\r
329 // per type and store it in the internal cache.
\r
330 synchronized (CACHE) {
\r
331 WeakReference<StringPrep> ref = CACHE[profile];
\r
333 instance = ref.get();
\r
336 if (instance == null) {
\r
337 InputStream stream = ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE + "/"
\r
338 + PROFILE_NAMES[profile] + ".spp");
\r
339 if (stream != null) {
\r
342 instance = new StringPrep(stream);
\r
346 } catch (IOException e) {
\r
347 throw new RuntimeException(e.toString());
\r
350 if (instance != null) {
\r
351 CACHE[profile] = new WeakReference<StringPrep>(instance);
\r
358 private static final class Values{
\r
362 public void reset(){
\r
369 private static final void getValues(char trieWord,Values values){
\r
373 * Initial value stored in the mapping table
\r
374 * just return TYPE_LIMIT .. so that
\r
375 * the source codepoint is copied to the destination
\r
377 values.type = TYPE_LIMIT;
\r
378 }else if(trieWord >= TYPE_THRESHOLD){
\r
379 values.type = (trieWord - TYPE_THRESHOLD);
\r
383 /* ascertain if the value is index or delta */
\r
384 if((trieWord & 0x02)>0){
\r
385 values.isIndex = true;
\r
386 values.value = trieWord >> 2; //mask off the lower 2 bits and shift
\r
389 values.isIndex = false;
\r
390 values.value = (trieWord<<16)>>16;
\r
391 values.value = (values.value >> 2);
\r
395 if((trieWord>>2) == MAX_INDEX_VALUE){
\r
396 values.type = DELETE;
\r
397 values.isIndex = false;
\r
405 private StringBuffer map( UCharacterIterator iter, int options)
\r
406 throws StringPrepParseException{
\r
408 Values val = new Values();
\r
410 int ch = UCharacterIterator.DONE;
\r
411 StringBuffer dest = new StringBuffer();
\r
412 boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
\r
414 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
\r
416 result = getCodePointValue(ch);
\r
417 getValues(result,val);
\r
419 // check if the source codepoint is unassigned
\r
420 if(val.type == UNASSIGNED && allowUnassigned == false){
\r
421 throw new StringPrepParseException("An unassigned code point was found in the input",
\r
422 StringPrepParseException.UNASSIGNED_ERROR,
\r
423 iter.getText(),iter.getIndex());
\r
424 }else if((val.type == MAP)){
\r
429 if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
\r
430 index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
\r
432 }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
\r
433 index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
\r
435 }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
\r
436 index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
\r
439 length = mappingData[index++];
\r
441 /* copy mapping to destination */
\r
442 dest.append(mappingData,index,length);
\r
448 }else if(val.type == DELETE){
\r
449 // just consume the codepoint and continue
\r
452 //copy the source into destination
\r
453 UTF16.append(dest,ch);
\r
460 private StringBuffer normalize(StringBuffer src){
\r
461 return new StringBuffer(
\r
462 Normalizer.normalize(
\r
465 Normalizer.UNICODE_3_2));
\r
468 boolean isLabelSeparator(int ch){
\r
469 int result = getCodePointValue(ch);
\r
470 if( (result & 0x07) == LABEL_SEPARATOR){
\r
477 1) Map -- For each character in the input, check if it has a mapping
\r
478 and, if so, replace it with its mapping.
\r
480 2) Normalize -- Possibly normalize the result of step 1 using Unicode
\r
483 3) Prohibit -- Check for any characters that are not allowed in the
\r
484 output. If any are found, return an error.
\r
486 4) Check bidi -- Possibly check for right-to-left characters, and if
\r
487 any are found, make sure that the whole string satisfies the
\r
488 requirements for bidirectional strings. If the string does not
\r
489 satisfy the requirements for bidirectional strings, return an
\r
491 [Unicode3.2] defines several bidirectional categories; each character
\r
492 has one bidirectional category assigned to it. For the purposes of
\r
493 the requirements below, an "RandALCat character" is a character that
\r
494 has Unicode bidirectional categories "R" or "AL"; an "LCat character"
\r
495 is a character that has Unicode bidirectional category "L". Note
\r
498 that there are many characters which fall in neither of the above
\r
499 definitions; Latin digits (<U+0030> through <U+0039>) are examples of
\r
500 this because they have bidirectional category "EN".
\r
502 In any profile that specifies bidirectional character handling, all
\r
503 three of the following requirements MUST be met:
\r
505 1) The characters in section 5.8 MUST be prohibited.
\r
507 2) If a string contains any RandALCat character, the string MUST NOT
\r
508 contain any LCat character.
\r
510 3) If a string contains any RandALCat character, a RandALCat
\r
511 character MUST be the first character of the string, and a
\r
512 RandALCat character MUST be the last character of the string.
\r
515 * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
\r
516 * checks for prohibited and BiDi characters in the order defined by RFC 3454
\r
517 * depending on the options specified in the profile.
\r
519 * @param src A UCharacterIterator object containing the source string
\r
520 * @param options A bit set of options:
\r
522 * - StringPrep.DEFAULT Prohibit processing of unassigned code points in the input
\r
524 * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points in the input
\r
525 * as normal Unicode code points.
\r
527 * @return StringBuffer A StringBuffer containing the output
\r
528 * @throws StringPrepParseException If the input contains an unassigned code point (when not allowed), a prohibited code point, or fails the profile's BiDi check.
\r
531 public StringBuffer prepare(UCharacterIterator src, int options)
\r
532 throws StringPrepParseException{
\r
535 StringBuffer mapOut = map(src,options);
\r
536 StringBuffer normOut = mapOut;// initialize
\r
540 normOut = normalize(mapOut);
\r
545 UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
\r
546 Values val = new Values();
\r
547 int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
\r
548 firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
\r
549 int rtlPos=-1, ltrPos=-1;
\r
550 boolean rightToLeft=false, leftToRight=false;
\r
552 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
\r
553 result = getCodePointValue(ch);
\r
554 getValues(result,val);
\r
556 if(val.type == PROHIBITED ){
\r
557 throw new StringPrepParseException("A prohibited code point was found in the input",
\r
558 StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value);
\r
562 direction = bdp.getClass(ch);
\r
563 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
\r
564 firstCharDir = direction;
\r
566 if(direction == UCharacterDirection.LEFT_TO_RIGHT){
\r
567 leftToRight = true;
\r
568 ltrPos = iter.getIndex()-1;
\r
570 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
\r
571 rightToLeft = true;
\r
572 rtlPos = iter.getIndex()-1;
\r
576 if(checkBiDi == true){
\r
578 if( leftToRight == true && rightToLeft == true){
\r
579 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
\r
580 StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
\r
581 (rtlPos>ltrPos) ? rtlPos : ltrPos);
\r
585 if( rightToLeft == true &&
\r
586 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
\r
587 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
\r
589 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
\r
590 StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
\r
591 (rtlPos>ltrPos) ? rtlPos : ltrPos);
\r
599 * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC),
\r
600 * checks for prohibited and BiDi characters in the order defined by RFC 3454
\r
601 * depending on the options specified in the profile.
\r
603 * @param src A string
\r
604 * @param options A bit set of options:
\r
606 * - StringPrep.DEFAULT Prohibit processing of unassigned code points in the input
\r
608 * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points in the input
\r
609 * as normal Unicode code points.
\r
611 * @return String A String containing the output
\r
612 * @throws StringPrepParseException If the input contains an unassigned code point (when not allowed), a prohibited code point, or fails the profile's BiDi check.
\r
615 public String prepare(String src, int options)
\r
616 throws StringPrepParseException{
\r
617 StringBuffer result = prepare(UCharacterIterator.getInstance(src), options);
\r
618 return result.toString();
\r