2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import java.io.IOException;
\r
10 import java.util.ArrayList;
\r
11 import java.util.regex.Pattern;
\r
13 import com.ibm.icu.lang.UCharacter;
\r
14 import com.ibm.icu.text.Replaceable;
\r
15 import com.ibm.icu.text.UTF16;
\r
16 import com.ibm.icu.text.UnicodeMatcher;
\r
18 public final class Utility {
\r
20 private static final char APOSTROPHE = '\'';
\r
21 private static final char BACKSLASH = '\\';
\r
22 private static final int MAGIC_UNSIGNED = 0x80000000;
\r
25 * Convenience utility to compare two Object[]s.
\r
26 * Ought to be in System
\r
28 public final static boolean arrayEquals(Object[] source, Object target) {
\r
29 if (source == null) return (target == null);
\r
30 if (!(target instanceof Object[])) return false;
\r
31 Object[] targ = (Object[]) target;
\r
32 return (source.length == targ.length
\r
33 && arrayRegionMatches(source, 0, targ, 0, source.length));
\r
37 * Convenience utility to compare two int[]s
\r
38 * Ought to be in System
\r
40 public final static boolean arrayEquals(int[] source, Object target) {
\r
41 if (source == null) return (target == null);
\r
42 if (!(target instanceof int[])) return false;
\r
43 int[] targ = (int[]) target;
\r
44 return (source.length == targ.length
\r
45 && arrayRegionMatches(source, 0, targ, 0, source.length));
\r
49 * Convenience utility to compare two double[]s
\r
50 * Ought to be in System
\r
52 public final static boolean arrayEquals(double[] source, Object target) {
\r
53 if (source == null) return (target == null);
\r
54 if (!(target instanceof double[])) return false;
\r
55 double[] targ = (double[]) target;
\r
56 return (source.length == targ.length
\r
57 && arrayRegionMatches(source, 0, targ, 0, source.length));
\r
59 public final static boolean arrayEquals(byte[] source, Object target) {
\r
60 if (source == null) return (target == null);
\r
61 if (!(target instanceof byte[])) return false;
\r
62 byte[] targ = (byte[]) target;
\r
63 return (source.length == targ.length
\r
64 && arrayRegionMatches(source, 0, targ, 0, source.length));
\r
68 * Convenience utility to compare two Object[]s
\r
69 * Ought to be in System
\r
71 public final static boolean arrayEquals(Object source, Object target) {
\r
72 if (source == null) return (target == null);
\r
73 // for some reason, the correct arrayEquals is not being called
\r
74 // so do it by hand for now.
\r
75 if (source instanceof Object[])
\r
76 return(arrayEquals((Object[]) source,target));
\r
77 if (source instanceof int[])
\r
78 return(arrayEquals((int[]) source,target));
\r
79 if (source instanceof double[])
\r
80 return(arrayEquals((int[]) source,target));
\r
81 if (source instanceof byte[])
\r
82 return(arrayEquals((byte[]) source,target));
\r
83 return source.equals(target);
\r
87 * Convenience utility to compare two Object[]s
\r
88 * Ought to be in System.
\r
89 * @param len the length to compare.
\r
90 * The start indices and start+len must be valid.
\r
92 public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
\r
93 Object[] target, int targetStart,
\r
96 int sourceEnd = sourceStart + len;
\r
97 int delta = targetStart - sourceStart;
\r
98 for (int i = sourceStart; i < sourceEnd; i++) {
\r
99 if (!arrayEquals(source[i],target[i + delta]))
\r
106 * Convenience utility to compare two Object[]s
\r
107 * Ought to be in System.
\r
108 * @param len the length to compare.
\r
109 * The start indices and start+len must be valid.
\r
111 public final static boolean arrayRegionMatches(char[] source, int sourceStart,
\r
112 char[] target, int targetStart,
\r
115 int sourceEnd = sourceStart + len;
\r
116 int delta = targetStart - sourceStart;
\r
117 for (int i = sourceStart; i < sourceEnd; i++) {
\r
118 if (source[i]!=target[i + delta])
\r
125 * Convenience utility to compare two int[]s.
\r
126 * @param len the length to compare.
\r
127 * The start indices and start+len must be valid.
\r
128 * Ought to be in System
\r
130 public final static boolean arrayRegionMatches(int[] source, int sourceStart,
\r
131 int[] target, int targetStart,
\r
134 int sourceEnd = sourceStart + len;
\r
135 int delta = targetStart - sourceStart;
\r
136 for (int i = sourceStart; i < sourceEnd; i++) {
\r
137 if (source[i] != target[i + delta])
\r
144 * Convenience utility to compare two arrays of doubles.
\r
145 * @param len the length to compare.
\r
146 * The start indices and start+len must be valid.
\r
147 * Ought to be in System
\r
149 public final static boolean arrayRegionMatches(double[] source, int sourceStart,
\r
150 double[] target, int targetStart,
\r
153 int sourceEnd = sourceStart + len;
\r
154 int delta = targetStart - sourceStart;
\r
155 for (int i = sourceStart; i < sourceEnd; i++) {
\r
156 if (source[i] != target[i + delta])
\r
161 public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
\r
162 byte[] target, int targetStart, int len){
\r
163 int sourceEnd = sourceStart + len;
\r
164 int delta = targetStart - sourceStart;
\r
165 for (int i = sourceStart; i < sourceEnd; i++) {
\r
166 if (source[i] != target[i + delta])
\r
173 * Convenience utility. Does null checks on objects, then calls equals.
\r
175 public final static boolean objectEquals(Object a, Object b) {
\r
176 return a == null ?
\r
177 b == null ? true : false :
\r
178 b == null ? false : a.equals(b);
\r
182 * Convenience utility. Does null checks on objects, then calls compare.
\r
184 public static <T extends Comparable<T>> int checkCompare(T a, T b) {
\r
185 return a == null ?
\r
186 b == null ? 0 : -1 :
\r
187 b == null ? 1 : a.compareTo(b);
\r
191 * Convenience utility. Does null checks on object, then calls hashCode.
\r
193 public static int checkHash(Object a) {
\r
194 return a == null ? 0 : a.hashCode();
\r
198 * The ESCAPE character is used during run-length encoding. It signals
\r
199 * a run of identical chars.
\r
201 private static final char ESCAPE = '\uA5A5';
\r
204 * The ESCAPE_BYTE character is used during run-length encoding. It signals
\r
205 * a run of identical bytes.
\r
207 static final byte ESCAPE_BYTE = (byte)0xA5;
\r
210 * Construct a string representing an int array. Use run-length encoding.
\r
211 * A character represents itself, unless it is the ESCAPE character. Then
\r
212 * the following notations are possible:
\r
213 * ESCAPE ESCAPE ESCAPE literal
\r
214 * ESCAPE n c n instances of character c
\r
215 * Since an encoded run occupies 3 characters, we only encode runs of 4 or
\r
216 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
\r
217 * If we encounter a run where n == ESCAPE, we represent this as:
\r
219 * The ESCAPE value is chosen so as not to collide with commonly
\r
222 static public final String arrayToRLEString(int[] a) {
\r
223 StringBuilder buffer = new StringBuilder();
\r
225 appendInt(buffer, a.length);
\r
226 int runValue = a[0];
\r
228 for (int i=1; i<a.length; ++i) {
\r
230 if (s == runValue && runLength < 0xFFFF) {
\r
233 encodeRun(buffer, runValue, runLength);
\r
238 encodeRun(buffer, runValue, runLength);
\r
239 return buffer.toString();
\r
243 * Construct a string representing a short array. Use run-length encoding.
\r
244 * A character represents itself, unless it is the ESCAPE character. Then
\r
245 * the following notations are possible:
\r
246 * ESCAPE ESCAPE ESCAPE literal
\r
247 * ESCAPE n c n instances of character c
\r
248 * Since an encoded run occupies 3 characters, we only encode runs of 4 or
\r
249 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
\r
250 * If we encounter a run where n == ESCAPE, we represent this as:
\r
252 * The ESCAPE value is chosen so as not to collide with commonly
\r
255 static public final String arrayToRLEString(short[] a) {
\r
256 StringBuilder buffer = new StringBuilder();
\r
257 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
\r
258 buffer.append((char) (a.length >> 16));
\r
259 buffer.append((char) a.length);
\r
260 short runValue = a[0];
\r
262 for (int i=1; i<a.length; ++i) {
\r
264 if (s == runValue && runLength < 0xFFFF) ++runLength;
\r
266 encodeRun(buffer, runValue, runLength);
\r
271 encodeRun(buffer, runValue, runLength);
\r
272 return buffer.toString();
\r
276 * Construct a string representing a char array. Use run-length encoding.
\r
277 * A character represents itself, unless it is the ESCAPE character. Then
\r
278 * the following notations are possible:
\r
279 * ESCAPE ESCAPE ESCAPE literal
\r
280 * ESCAPE n c n instances of character c
\r
281 * Since an encoded run occupies 3 characters, we only encode runs of 4 or
\r
282 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
\r
283 * If we encounter a run where n == ESCAPE, we represent this as:
\r
285 * The ESCAPE value is chosen so as not to collide with commonly
\r
288 static public final String arrayToRLEString(char[] a) {
\r
289 StringBuilder buffer = new StringBuilder();
\r
290 buffer.append((char) (a.length >> 16));
\r
291 buffer.append((char) a.length);
\r
292 char runValue = a[0];
\r
294 for (int i=1; i<a.length; ++i) {
\r
296 if (s == runValue && runLength < 0xFFFF) ++runLength;
\r
298 encodeRun(buffer, (short)runValue, runLength);
\r
303 encodeRun(buffer, (short)runValue, runLength);
\r
304 return buffer.toString();
\r
308 * Construct a string representing a byte array. Use run-length encoding.
\r
309 * Two bytes are packed into a single char, with a single extra zero byte at
\r
310 * the end if needed. A byte represents itself, unless it is the
\r
311 * ESCAPE_BYTE. Then the following notations are possible:
\r
312 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal
\r
313 * ESCAPE_BYTE n b n instances of byte b
\r
314 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
\r
315 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
\r
316 * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
\r
317 * b ESCAPE_BYTE n-1 b
\r
318 * The ESCAPE_BYTE value is chosen so as not to collide with commonly
\r
321 static public final String arrayToRLEString(byte[] a) {
\r
322 StringBuilder buffer = new StringBuilder();
\r
323 buffer.append((char) (a.length >> 16));
\r
324 buffer.append((char) a.length);
\r
325 byte runValue = a[0];
\r
327 byte[] state = new byte[2];
\r
328 for (int i=1; i<a.length; ++i) {
\r
330 if (b == runValue && runLength < 0xFF) ++runLength;
\r
332 encodeRun(buffer, runValue, runLength, state);
\r
337 encodeRun(buffer, runValue, runLength, state);
\r
339 // We must save the final byte, if there is one, by padding
\r
341 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
\r
343 return buffer.toString();
\r
347 * Encode a run, possibly a degenerate run (of < 4 values).
\r
348 * @param length The length of the run; must be > 0 && <= 0xFFFF.
\r
350 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
\r
352 for (int j=0; j<length; ++j) {
\r
353 if (value == ESCAPE) {
\r
354 appendInt(buffer, value);
\r
356 appendInt(buffer, value);
\r
360 if (length == (int) ESCAPE) {
\r
361 if (value == (int) ESCAPE) {
\r
362 appendInt(buffer, ESCAPE);
\r
364 appendInt(buffer, value);
\r
367 appendInt(buffer, ESCAPE);
\r
368 appendInt(buffer, length);
\r
369 appendInt(buffer, value); // Don't need to escape this value
\r
373 private static final <T extends Appendable> void appendInt(T buffer, int value) {
\r
375 buffer.append((char)(value >>> 16));
\r
376 buffer.append((char)(value & 0xFFFF));
\r
377 } catch (IOException e) {
\r
378 throw new IllegalIcuArgumentException(e);
\r
383 * Encode a run, possibly a degenerate run (of < 4 values).
\r
384 * @param length The length of the run; must be > 0 && <= 0xFFFF.
\r
386 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
\r
389 for (int j=0; j<length; ++j) {
\r
390 if (value == (int) ESCAPE)
\r
391 buffer.append(ESCAPE);
\r
392 buffer.append((char) value);
\r
396 if (length == (int) ESCAPE) {
\r
397 if (value == (int) ESCAPE) buffer.append(ESCAPE);
\r
398 buffer.append((char) value);
\r
401 buffer.append(ESCAPE);
\r
402 buffer.append((char) length);
\r
403 buffer.append((char) value); // Don't need to escape this value
\r
405 } catch (IOException e) {
\r
406 throw new IllegalIcuArgumentException(e);
\r
411 * Encode a run, possibly a degenerate run (of < 4 values).
\r
412 * @param length The length of the run; must be > 0 && <= 0xFF.
\r
414 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
\r
417 for (int j=0; j<length; ++j) {
\r
418 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
\r
419 appendEncodedByte(buffer, value, state);
\r
423 if (length == ESCAPE_BYTE) {
\r
424 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
\r
425 appendEncodedByte(buffer, value, state);
\r
428 appendEncodedByte(buffer, ESCAPE_BYTE, state);
\r
429 appendEncodedByte(buffer, (byte)length, state);
\r
430 appendEncodedByte(buffer, value, state); // Don't need to escape this value
\r
435 * Append a byte to the given Appendable, packing two bytes into each
\r
436 * character. The state parameter maintains intermediary data between
\r
438 * @param state A two-element array, with state[0] == 0 if this is the
\r
439 * first byte of a pair, or state[0] != 0 if this is the second byte
\r
440 * of a pair, in which case state[1] is the first byte.
\r
442 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
\r
445 if (state[0] != 0) {
\r
446 char c = (char) ((state[1] << 8) | (((int) value) & 0xFF));
\r
454 } catch (IOException e) {
\r
455 throw new IllegalIcuArgumentException(e);
\r
460 * Construct an array of ints from a run-length encoded string.
\r
462 static public final int[] RLEStringToIntArray(String s) {
\r
463 int length = getInt(s, 0);
\r
464 int[] array = new int[length];
\r
467 int maxI = s.length() / 2;
\r
468 while (ai < length && i < maxI) {
\r
469 int c = getInt(s, i++);
\r
472 c = getInt(s, i++);
\r
477 int runValue = getInt(s, i++);
\r
478 for (int j=0; j<runLength; ++j) {
\r
479 array[ai++] = runValue;
\r
488 if (ai != length || i != maxI) {
\r
489 throw new IllegalStateException("Bad run-length encoded int array");
\r
494 static final int getInt(String s, int i) {
\r
495 return (((int) s.charAt(2*i)) << 16) | (int) s.charAt(2*i+1);
\r
499 * Construct an array of shorts from a run-length encoded string.
\r
501 static public final short[] RLEStringToShortArray(String s) {
\r
502 int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
\r
503 short[] array = new short[length];
\r
505 for (int i=2; i<s.length(); ++i) {
\r
506 char c = s.charAt(i);
\r
510 array[ai++] = (short) c;
\r
512 int runLength = (int) c;
\r
513 short runValue = (short) s.charAt(++i);
\r
514 for (int j=0; j<runLength; ++j) array[ai++] = runValue;
\r
518 array[ai++] = (short) c;
\r
523 throw new IllegalStateException("Bad run-length encoded short array");
\r
529 * Construct an array of chars from a run-length encoded string.
\r
531 static public final char[] RLEStringToCharArray(String s) {
\r
532 int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
\r
533 char[] array = new char[length];
\r
535 for (int i=2; i<s.length(); ++i) {
\r
536 char c = s.charAt(i);
\r
542 int runLength = (int) c;
\r
543 char runValue = s.charAt(++i);
\r
544 for (int j=0; j<runLength; ++j) array[ai++] = runValue;
\r
553 throw new IllegalStateException("Bad run-length encoded short array");
\r
559 * Construct an array of bytes from a run-length encoded string.
\r
561 static public final byte[] RLEStringToByteArray(String s) {
\r
562 int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
\r
563 byte[] array = new byte[length];
\r
564 boolean nextChar = true;
\r
569 for (int ai=0; ai<length; ) {
\r
570 // This part of the loop places the next byte into the local
\r
571 // variable 'b' each time through the loop. It keeps the
\r
572 // current character in 'c' and uses the boolean 'nextChar'
\r
573 // to see if we've taken both bytes out of 'c' yet.
\r
577 b = (byte) (c >> 8);
\r
581 b = (byte) (c & 0xFF);
\r
585 // This part of the loop is a tiny state machine which handles
\r
586 // the parsing of the run-length encoding. This would be simpler
\r
587 // if we could look ahead, but we can't, so we use 'node' to
\r
588 // move between three nodes in the state machine.
\r
591 // Normal idle node
\r
592 if (b == ESCAPE_BYTE) {
\r
600 // We have seen one ESCAPE_BYTE; we expect either a second
\r
601 // one, or a run length and value.
\r
602 if (b == ESCAPE_BYTE) {
\r
603 array[ai++] = ESCAPE_BYTE;
\r
608 // Interpret signed byte as unsigned
\r
609 if (runLength < 0) runLength += 0x100;
\r
614 // We have seen an ESCAPE_BYTE and length byte. We interpret
\r
615 // the next byte as the value to be repeated.
\r
616 for (int j=0; j<runLength; ++j) array[ai++] = b;
\r
623 throw new IllegalStateException("Bad run-length encoded byte array");
\r
625 if (i != s.length())
\r
626 throw new IllegalStateException("Excess data in RLE byte array string");
\r
631 static public String LINE_SEPARATOR = System.getProperty("line.separator");
\r
634 * Format a String for representation in a source file. This includes
\r
635 * breaking it into lines and escaping characters using octal notation
\r
636 * when necessary (control characters and double quotes).
\r
638 static public final String formatForSource(String s) {
\r
639 StringBuilder buffer = new StringBuilder();
\r
640 for (int i=0; i<s.length();) {
\r
641 if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
\r
642 buffer.append(" \"");
\r
644 while (i<s.length() && count<80) {
\r
645 char c = s.charAt(i++);
\r
646 if (c < '\u0020' || c == '"' || c == '\\') {
\r
648 buffer.append("\\n");
\r
650 } else if (c == '\t') {
\r
651 buffer.append("\\t");
\r
653 } else if (c == '\r') {
\r
654 buffer.append("\\r");
\r
657 // Represent control characters, backslash and double quote
\r
658 // using octal notation; otherwise the string we form
\r
659 // won't compile, since Unicode escape sequences are
\r
660 // processed before tokenization.
\r
661 buffer.append('\\');
\r
662 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
\r
663 buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
\r
664 buffer.append(HEX_DIGIT[(c & 0007)]);
\r
668 else if (c <= '\u007E') {
\r
673 buffer.append("\\u");
\r
674 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
\r
675 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
\r
676 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
\r
677 buffer.append(HEX_DIGIT[(c & 0x000F)]);
\r
681 buffer.append('"');
\r
683 return buffer.toString();
\r
686 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7',
\r
687 '8','9','A','B','C','D','E','F'};
\r
690 * Format a String for representation in a source file. Like
\r
691 * formatForSource but does not do line breaking.
\r
693 static public final String format1ForSource(String s) {
\r
694 StringBuilder buffer = new StringBuilder();
\r
695 buffer.append("\"");
\r
696 for (int i=0; i<s.length();) {
\r
697 char c = s.charAt(i++);
\r
698 if (c < '\u0020' || c == '"' || c == '\\') {
\r
700 buffer.append("\\n");
\r
701 } else if (c == '\t') {
\r
702 buffer.append("\\t");
\r
703 } else if (c == '\r') {
\r
704 buffer.append("\\r");
\r
706 // Represent control characters, backslash and double quote
\r
707 // using octal notation; otherwise the string we form
\r
708 // won't compile, since Unicode escape sequences are
\r
709 // processed before tokenization.
\r
710 buffer.append('\\');
\r
711 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
\r
712 buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
\r
713 buffer.append(HEX_DIGIT[(c & 0007)]);
\r
716 else if (c <= '\u007E') {
\r
720 buffer.append("\\u");
\r
721 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
\r
722 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
\r
723 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
\r
724 buffer.append(HEX_DIGIT[(c & 0x000F)]);
\r
727 buffer.append('"');
\r
728 return buffer.toString();
\r
732 * Convert characters outside the range U+0020 to U+007F to
\r
733 * Unicode escapes, and convert backslash to a double backslash.
\r
735 public static final String escape(String s) {
\r
736 StringBuilder buf = new StringBuilder();
\r
737 for (int i=0; i<s.length(); ) {
\r
738 int c = Character.codePointAt(s, i);
\r
739 i += UTF16.getCharCount(c);
\r
740 if (c >= ' ' && c <= 0x007F) {
\r
742 buf.append("\\\\"); // That is, "\\"
\r
744 buf.append((char)c);
\r
747 boolean four = c <= 0xFFFF;
\r
748 buf.append(four ? "\\u" : "\\U");
\r
749 buf.append(hex(c, four ? 4 : 8));
\r
752 return buf.toString();
\r
755 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
\r
756 static private final char[] UNESCAPE_MAP = {
\r
772 * Convert an escape to a 32-bit code point value. We attempt
\r
773 * to parallel the icu4c unescapeAt() function.
\r
774 * @param offset16 an array containing offset to the character
\r
775 * <em>after</em> the backslash. Upon return offset16[0] will
\r
776 * be updated to point after the escape sequence.
\r
777 * @return character value from 0 to 10FFFF, or -1 on error.
\r
779 public static int unescapeAt(String s, int[] offset16) {
\r
785 int bitsPerDigit = 4;
\r
788 boolean braces = false;
\r
790 /* Check that offset is in range */
\r
791 int offset = offset16[0];
\r
792 int length = s.length();
\r
793 if (offset < 0 || offset >= length) {
\r
797 /* Fetch first UChar after '\\' */
\r
798 c = Character.codePointAt(s, offset);
\r
799 offset += UTF16.getCharCount(c);
\r
801 /* Convert hexadecimal and octal escapes */
\r
804 minDig = maxDig = 4;
\r
807 minDig = maxDig = 8;
\r
811 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
\r
820 dig = UCharacter.digit(c, 8);
\r
824 n = 1; /* Already have first octal digit */
\r
831 while (offset < length && n < maxDig) {
\r
832 c = UTF16.charAt(s, offset);
\r
833 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
\r
837 result = (result << bitsPerDigit) | dig;
\r
838 offset += UTF16.getCharCount(c);
\r
845 if (c != 0x7D /*}*/) {
\r
850 if (result < 0 || result >= 0x110000) {
\r
853 // If an escape sequence specifies a lead surrogate, see
\r
854 // if there is a trail surrogate after it, either as an
\r
855 // escape or as a literal. If so, join them up into a
\r
857 if (offset < length &&
\r
858 UTF16.isLeadSurrogate((char) result)) {
\r
859 int ahead = offset+1;
\r
860 c = s.charAt(offset); // [sic] get 16-bit code unit
\r
861 if (c == '\\' && ahead < length) {
\r
862 int o[] = new int[] { ahead };
\r
863 c = unescapeAt(s, o);
\r
866 if (UTF16.isTrailSurrogate((char) c)) {
\r
868 result = UCharacterProperty.getRawSupplementary(
\r
869 (char) result, (char) c);
\r
872 offset16[0] = offset;
\r
876 /* Convert C-style escapes in table */
\r
877 for (i=0; i<UNESCAPE_MAP.length; i+=2) {
\r
878 if (c == UNESCAPE_MAP[i]) {
\r
879 offset16[0] = offset;
\r
880 return UNESCAPE_MAP[i+1];
\r
881 } else if (c < UNESCAPE_MAP[i]) {
\r
886 /* Map \cX to control-X: X & 0x1F */
\r
887 if (c == 'c' && offset < length) {
\r
888 c = UTF16.charAt(s, offset);
\r
889 offset16[0] = offset + UTF16.getCharCount(c);
\r
893 /* If no special forms are recognized, then consider
\r
894 * the backslash to generically escape the next character. */
\r
895 offset16[0] = offset;
\r
900 * Convert all escapes in a given string using unescapeAt().
\r
901 * @exception IllegalArgumentException if an invalid escape is
\r
904 public static String unescape(String s) {
\r
905 StringBuilder buf = new StringBuilder();
\r
906 int[] pos = new int[1];
\r
907 for (int i=0; i<s.length(); ) {
\r
908 char c = s.charAt(i++);
\r
911 int e = unescapeAt(s, pos);
\r
913 throw new IllegalArgumentException("Invalid escape sequence " +
\r
914 s.substring(i-1, Math.min(i+8, s.length())));
\r
916 buf.appendCodePoint(e);
\r
922 return buf.toString();
\r
926 * Convert all escapes in a given string using unescapeAt().
\r
927 * Leave invalid escape sequences unchanged.
\r
929 public static String unescapeLeniently(String s) {
\r
930 StringBuilder buf = new StringBuilder();
\r
931 int[] pos = new int[1];
\r
932 for (int i=0; i<s.length(); ) {
\r
933 char c = s.charAt(i++);
\r
936 int e = unescapeAt(s, pos);
\r
940 buf.appendCodePoint(e);
\r
947 return buf.toString();
\r
951 * Convert a char to 4 hex uppercase digits. E.g., hex('a') =>
\r
954 public static String hex(long ch) {
\r
959 * Supplies a zero-padded hex representation of an integer (without 0x)
\r
961 static public String hex(long i, int places) {
\r
962 if (i == Long.MIN_VALUE) return "-8000000000000000";
\r
963 boolean negative = i < 0;
\r
967 String result = Long.toString(i, 16).toUpperCase();
\r
968 if (result.length() < places) {
\r
969 result = "0000000000000000".substring(result.length(),places) + result;
\r
972 return '-' + result;
\r
978 * Convert a string to comma-separated groups of 4 hex uppercase
\r
979 * digits. E.g., hex("ab") => "0061,0062".
\r
981 public static String hex(CharSequence s) {
\r
982 return hex(s, 4, ",", true, new StringBuilder()).toString();
\r
986 * Convert a string to separated groups of hex uppercase
\r
987 * digits. E.g., hex("ab"...) => "0061,0062". Append the output
\r
988 * to the given Appendable.
\r
990 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
\r
992 if (useCodePoints) {
\r
994 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
\r
995 cp = Character.codePointAt(s, i);
\r
997 result.append(separator);
\r
999 result.append(hex(cp,width));
\r
1002 for (int i = 0; i < s.length(); ++i) {
\r
1004 result.append(separator);
\r
1006 result.append(hex(s.charAt(i),width));
\r
1010 } catch (IOException e) {
\r
1011 throw new IllegalIcuArgumentException(e);
\r
1017 * Convert a string to comma-separated groups of 4 hex uppercase
\r
1018 * digits. E.g., hex("ab", 4, ",") => "0061,0062".
\r
1020 public static <S extends CharSequence> String hex(S s, int width, S separator) {
\r
1021 return hex(s, width, separator, true, new StringBuilder()).toString();
\r
1025 * Split a string into pieces based on the given divider character
\r
1026 * @param s the string to split
\r
1027 * @param divider the character on which to split. Occurrences of
\r
1028 * this character are not included in the output
\r
1029 * @param output an array to receive the substrings between
\r
1030 * instances of divider. It must be large enough on entry to
\r
1031 * accomodate all output. Adjacent instances of the divider
\r
1032 * character will place empty strings into output. Before
\r
1033 * returning, output is padded out with empty strings.
\r
1035 public static void split(String s, char divider, String[] output) {
\r
1039 for (i = 0; i < s.length(); ++i) {
\r
1040 if (s.charAt(i) == divider) {
\r
1041 output[current++] = s.substring(last,i);
\r
1045 output[current++] = s.substring(last,i);
\r
1046 while (current < output.length) {
\r
1047 output[current++] = "";
\r
1052 * Split a string into pieces based on the given divider character
\r
1053 * @param s the string to split
\r
1054 * @param divider the character on which to split. Occurrences of
\r
1055 * this character are not included in the output
\r
1056 * @return output an array to receive the substrings between
\r
1057 * instances of divider. Adjacent instances of the divider
\r
1058 * character will place empty strings into output.
\r
1060 public static String[] split(String s, char divider) {
\r
1063 ArrayList<String> output = new ArrayList<String>();
\r
1064 for (i = 0; i < s.length(); ++i) {
\r
1065 if (s.charAt(i) == divider) {
\r
1066 output.add(s.substring(last,i));
\r
1070 output.add( s.substring(last,i));
\r
1071 return output.toArray(new String[output.size()]);
\r
1075 * Look up a given string in a string array. Returns the index at
\r
1076 * which the first occurrence of the string was found in the
\r
1077 * array, or -1 if it was not found.
\r
1078 * @param source the string to search for
\r
1079 * @param target the array of zero or more strings in which to
\r
1081 * @return the index of target at which source first occurs, or -1
\r
1084 public static int lookup(String source, String[] target) {
\r
1085 for (int i = 0; i < target.length; ++i) {
\r
1086 if (source.equals(target[i])) return i;
\r
1092 * Skip over a sequence of zero or more white space characters
\r
1093 * at pos. Return the index of the first non-white-space character
\r
1094 * at or after pos, or str.length(), if there is none.
\r
1096 public static int skipWhitespace(String str, int pos) {
\r
1097 while (pos < str.length()) {
\r
1098 int c = Character.codePointAt(str, pos);
\r
1099 if (!UCharacterProperty.isRuleWhiteSpace(c)) {
\r
1102 pos += UTF16.getCharCount(c);
\r
1108 * Skip over a sequence of zero or more white space characters
\r
1109 * at pos[0], advancing it.
\r
1111 public static void skipWhitespace(String str, int[] pos) {
\r
1112 pos[0] = skipWhitespace(str, pos[0]);
\r
1116 * Remove all rule white space from a string.
\r
1118 public static String deleteRuleWhiteSpace(String str) {
\r
1119 StringBuilder buf = new StringBuilder();
\r
1120 for (int i=0; i<str.length(); ) {
\r
1121 int ch = Character.codePointAt(str, i);
\r
1122 i += UTF16.getCharCount(ch);
\r
1123 if (UCharacterProperty.isRuleWhiteSpace(ch)) {
\r
1126 buf.appendCodePoint(ch);
\r
1128 return buf.toString();
\r
1132 * Parse a single non-whitespace character 'ch', optionally
\r
1133 * preceded by whitespace.
\r
1134 * @param id the string to be parsed
\r
1135 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
\r
1136 * offset of the first character to be parsed. On output, pos[0]
\r
1137 * is the index after the last parsed character. If the parse
\r
1138 * fails, pos[0] will be unchanged.
\r
1139 * @param ch the non-whitespace character to be parsed.
\r
1140 * @return true if 'ch' is seen preceded by zero or more
\r
1141 * whitespace characters.
\r
1143 public static boolean parseChar(String id, int[] pos, char ch) {
\r
1144 int start = pos[0];
\r
1145 skipWhitespace(id, pos);
\r
1146 if (pos[0] == id.length() ||
\r
1147 id.charAt(pos[0]) != ch) {
\r
1156 * Parse a pattern string starting at offset pos. Keywords are
\r
1157 * matched case-insensitively. Spaces may be skipped and may be
\r
1158 * optional or required. Integer values may be parsed, and if
\r
1159 * they are, they will be returned in the given array. If
\r
1160 * successful, the offset of the next non-space character is
\r
1161 * returned. On failure, -1 is returned.
\r
1162 * @param pattern must only contain lowercase characters, which
\r
1163 * will match their uppercase equivalents as well. A space
\r
1164 * character matches one or more required spaces. A '~' character
\r
1165 * matches zero or more optional spaces. A '#' character matches
\r
1166 * an integer and stores it in parsedInts, which the caller must
\r
1167 * ensure has enough capacity.
\r
1168 * @param parsedInts array to receive parsed integers. Caller
\r
1169 * must ensure that parsedInts.length is >= the number of '#'
\r
1170 * signs in 'pattern'.
\r
1171 * @return the position after the last character parsed, or -1 if
\r
1172 * the parse failed
\r
1174 @SuppressWarnings("fallthrough")
\r
1175 public static int parsePattern(String rule, int pos, int limit,
\r
1176 String pattern, int[] parsedInts) {
\r
1177 // TODO Update this to handle surrogates
\r
1178 int[] p = new int[1];
\r
1179 int intCount = 0; // number of integers parsed
\r
1180 for (int i=0; i<pattern.length(); ++i) {
\r
1181 char cpat = pattern.charAt(i);
\r
1185 if (pos >= limit) {
\r
1188 c = rule.charAt(pos++);
\r
1189 if (!UCharacterProperty.isRuleWhiteSpace(c)) {
\r
1192 // FALL THROUGH to skipWhitespace
\r
1194 pos = skipWhitespace(rule, pos);
\r
1198 parsedInts[intCount++] = parseInteger(rule, p, limit);
\r
1199 if (p[0] == pos) {
\r
1200 // Syntax error; failed to parse integer
\r
1206 if (pos >= limit) {
\r
1209 c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
\r
1220 * Parse a pattern string within the given Replaceable and a parsing
\r
1221 * pattern. Characters are matched literally and case-sensitively
\r
1222 * except for the following special characters:
\r
1224 * ~ zero or more UCharacterProperty.isRuleWhiteSpace chars
\r
1226 * If end of pattern is reached with all matches along the way,
\r
1227 * pos is advanced to the first unparsed index and returned.
\r
1228 * Otherwise -1 is returned.
\r
1229 * @param pat pattern that controls parsing
\r
1230 * @param text text to be parsed, starting at index
\r
1231 * @param index offset to first character to parse
\r
1232 * @param limit offset after last character to parse
\r
1233 * @return index after last parsed character, or -1 on parse failure.
\r
1235 public static int parsePattern(String pat,
\r
1241 // empty pattern matches immediately
\r
1242 if (ipat == pat.length()) {
\r
1246 int cpat = Character.codePointAt(pat, ipat);
\r
1248 while (index < limit) {
\r
1249 int c = text.char32At(index);
\r
1252 if (cpat == '~') {
\r
1253 if (UCharacterProperty.isRuleWhiteSpace(c)) {
\r
1254 index += UTF16.getCharCount(c);
\r
1257 if (++ipat == pat.length()) {
\r
1258 return index; // success; c unparsed
\r
1260 // fall thru; process c again with next cpat
\r
1265 else if (c == cpat) {
\r
1266 int n = UTF16.getCharCount(c);
\r
1269 if (ipat == pat.length()) {
\r
1270 return index; // success; c parsed
\r
1272 // fall thru; get next cpat
\r
1275 // match failure of literal
\r
1280 cpat = UTF16.charAt(pat, ipat);
\r
1283 return -1; // text ended before end of pat
\r
1287 * Parse an integer at pos, either of the form \d+ or of the form
\r
1288 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
\r
1289 * or octal format.
\r
1290 * @param pos INPUT-OUTPUT parameter. On input, the first
\r
1291 * character to parse. On output, the character after the last
\r
1292 * parsed character.
\r
1294 public static int parseInteger(String rule, int[] pos, int limit) {
\r
1300 if (rule.regionMatches(true, p, "0x", 0, 2)) {
\r
1303 } else if (p < limit && rule.charAt(p) == '0') {
\r
1309 while (p < limit) {
\r
1310 int d = UCharacter.digit(rule.charAt(p++), radix);
\r
1316 int v = (value * radix) + d;
\r
1318 // If there are too many input digits, at some point
\r
1319 // the value will go negative, e.g., if we have seen
\r
1320 // "0x8000000" already and there is another '0', when
\r
1321 // we parse the next 0 the value will go negative.
\r
1333 * Parse a Unicode identifier from the given string at the given
\r
1334 * position. Return the identifier, or null if there is no
\r
1336 * @param str the string to parse
\r
1337 * @param pos INPUT-OUTPUT parameter. On INPUT, pos[0] is the
\r
1338 * first character to examine. It must be less than str.length(),
\r
1339 * and it must not point to a whitespace character. That is, must
\r
1340 * have pos[0] < str.length() and
\r
1341 * !UCharacterProperty.isRuleWhiteSpace(UTF16.charAt(str, pos[0])). On
\r
1342 * OUTPUT, the position after the last parsed character.
\r
1343 * @return the Unicode identifier, or null if there is no valid
\r
1344 * identifier at pos[0].
\r
1346 public static String parseUnicodeIdentifier(String str, int[] pos) {
\r
1347 // assert(pos[0] < str.length());
\r
1348 // assert(!UCharacterProperty.isRuleWhiteSpace(UTF16.charAt(str, pos[0])));
\r
1349 StringBuilder buf = new StringBuilder();
\r
1351 while (p < str.length()) {
\r
1352 int ch = Character.codePointAt(str, p);
\r
1353 if (buf.length() == 0) {
\r
1354 if (UCharacter.isUnicodeIdentifierStart(ch)) {
\r
1355 buf.appendCodePoint(ch);
\r
1360 if (UCharacter.isUnicodeIdentifierPart(ch)) {
\r
1361 buf.appendCodePoint(ch);
\r
1366 p += UTF16.getCharCount(ch);
\r
1369 return buf.toString();
\r
1372 static final char DIGITS[] = {
\r
1373 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
\r
1374 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
\r
1375 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
\r
1376 'U', 'V', 'W', 'X', 'Y', 'Z'
\r
1380 * Append the digits of a positive integer to the given
\r
1381 * <code>Appendable</code> in the given radix. This is
\r
1382 * done recursively since it is easiest to generate the low-
\r
1383 * order digit first, but it must be appended last.
\r
1385 * @param result is the <code>Appendable</code> to append to
\r
1386 * @param n is the positive integer
\r
1387 * @param radix is the radix, from 2 to 36 inclusive
\r
1388 * @param minDigits is the minimum number of digits to append.
\r
1390 private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
\r
1391 int radix, int minDigits)
\r
1394 int digit = n % radix;
\r
1396 if (n >= radix || minDigits > 1) {
\r
1397 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
\r
1399 result.append(DIGITS[digit]);
\r
1400 } catch (IOException e) {
\r
1401 throw new IllegalIcuArgumentException(e);
\r
1406 * Append a number to the given Appendable in the given radix.
\r
1407 * Standard digits '0'-'9' are used and letters 'A'-'Z' for
\r
1408 * radices 11 through 36.
\r
1409 * @param result the digits of the number are appended here
\r
1410 * @param n the number to be converted to digits; may be negative.
\r
1411 * If negative, a '-' is prepended to the digits.
\r
1412 * @param radix a radix from 2 to 36 inclusive.
\r
1413 * @param minDigits the minimum number of digits, not including
\r
1414 * any '-', to produce. Values less than 2 have no effect. One
\r
1415 * digit is always emitted regardless of this parameter.
\r
1416 * @return a reference to result
\r
1418 public static <T extends Appendable> T appendNumber(T result, int n,
\r
1419 int radix, int minDigits)
\r
1422 if (radix < 2 || radix > 36) {
\r
1423 throw new IllegalArgumentException("Illegal radix " + radix);
\r
1431 result.append("-");
\r
1434 recursiveAppendNumber(result, abs, radix, minDigits);
\r
1437 } catch (IOException e) {
\r
1438 throw new IllegalIcuArgumentException(e);
\r
1444 * Parse an unsigned 31-bit integer at the given offset. Use
\r
1445 * UCharacter.digit() to parse individual characters into digits.
\r
1446 * @param text the text to be parsed
\r
1447 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
\r
1448 * offset within text at which to start parsing; it should point
\r
1449 * to a valid digit. On exit, pos[0] is the offset after the last
\r
1450 * parsed character. If the parse failed, it will be unchanged on
\r
1451 * exit. Must be >= 0 on entry.
\r
1452 * @param radix the radix in which to parse; must be >= 2 and <=
\r
1454 * @return a non-negative parsed number, or -1 upon parse failure.
\r
1455 * Parse fails if there are no digits, that is, if pos[0] does not
\r
1456 * point to a valid digit on entry, or if the number to be parsed
\r
1457 * does not fit into a 31-bit unsigned integer.
\r
1459 public static int parseNumber(String text, int[] pos, int radix) {
\r
1460 // assert(pos[0] >= 0);
\r
1461 // assert(radix >= 2);
\r
1462 // assert(radix <= 36);
\r
1465 while (p < text.length()) {
\r
1466 int ch = Character.codePointAt(text, p);
\r
1467 int d = UCharacter.digit(ch, radix);
\r
1472 // ASSUME that when a 32-bit integer overflows it becomes
\r
1473 // negative. E.g., 214748364 * 10 + 8 => negative value.
\r
1479 if (p == pos[0]) {
\r
1487 * Return true if the character is NOT printable ASCII. The tab,
\r
1488 * newline and linefeed characters are considered unprintable.
\r
1490 public static boolean isUnprintable(int c) {
\r
1491 //0x20 = 32 and 0x7E = 126
\r
1492 return !(c >= 0x20 && c <= 0x7E);
\r
1496 * Escape unprintable characters using <backslash>uxxxx notation
\r
1497 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
\r
1498 * above. If the character is printable ASCII, then do nothing
\r
1499 * and return FALSE. Otherwise, append the escaped notation and
\r
1502 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
\r
1504 if (isUnprintable(c)) {
\r
1505 result.append('\\');
\r
1506 if ((c & ~0xFFFF) != 0) {
\r
1507 result.append('U');
\r
1508 result.append(DIGITS[0xF&(c>>28)]);
\r
1509 result.append(DIGITS[0xF&(c>>24)]);
\r
1510 result.append(DIGITS[0xF&(c>>20)]);
\r
1511 result.append(DIGITS[0xF&(c>>16)]);
\r
1513 result.append('u');
\r
1515 result.append(DIGITS[0xF&(c>>12)]);
\r
1516 result.append(DIGITS[0xF&(c>>8)]);
\r
1517 result.append(DIGITS[0xF&(c>>4)]);
\r
1518 result.append(DIGITS[0xF&c]);
\r
1522 } catch (IOException e) {
\r
1523 throw new IllegalIcuArgumentException(e);
\r
1528 * Returns the index of the first character in a set, ignoring quoted text.
\r
1529 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
\r
1530 * found by a search for "h". Unlike String.indexOf(), this method searches
\r
1531 * not for a single character, but for any character of the string
\r
1532 * <code>setOfChars</code>.
\r
1533 * @param text text to be searched
\r
1534 * @param start the beginning index, inclusive; <code>0 <= start
\r
1535 * <= limit</code>.
\r
1536 * @param limit the ending index, exclusive; <code>start <= limit
\r
1537 * <= text.length()</code>.
\r
1538 * @param setOfChars string with one or more distinct characters
\r
1539 * @return Offset of the first character in <code>setOfChars</code>
\r
1540 * found, or -1 if not found.
\r
1541 * @see String#indexOf
\r
1543 public static int quotedIndexOf(String text, int start, int limit,
\r
1544 String setOfChars) {
\r
1545 for (int i=start; i<limit; ++i) {
\r
1546 char c = text.charAt(i);
\r
1547 if (c == BACKSLASH) {
\r
1549 } else if (c == APOSTROPHE) {
\r
1550 while (++i < limit
\r
1551 && text.charAt(i) != APOSTROPHE) {}
\r
1552 } else if (setOfChars.indexOf(c) >= 0) {
\r
1560 * Append a character to a rule that is being built up. To flush
\r
1561 * the quoteBuf to rule, make one final call with isLiteral == true.
\r
1562 * If there is no final character, pass in (int)-1 as c.
\r
1563 * @param rule the string to append the character to
\r
1564 * @param c the character to append, or (int)-1 if none.
\r
1565 * @param isLiteral if true, then the given character should not be
\r
1566 * quoted or escaped. Usually this means it is a syntactic element
\r
1568 * @param escapeUnprintable if true, then unprintable characters
\r
1569 * should be escaped using escapeUnprintable(). These escapes will
\r
1570 * appear outside of quotes.
\r
1571 * @param quoteBuf a buffer which is used to build up quoted
\r
1572 * substrings. The caller should initially supply an empty buffer,
\r
1573 * and thereafter should not modify the buffer. The buffer should be
\r
1574 * cleared out by, at the end, calling this method with a literal
\r
1575 * character (which may be -1).
\r
1577 public static void appendToRule(StringBuffer rule,
\r
1579 boolean isLiteral,
\r
1580 boolean escapeUnprintable,
\r
1581 StringBuffer quoteBuf) {
\r
1582 // If we are escaping unprintables, then escape them outside
\r
1583 // quotes. \\u and \\U are not recognized within quotes. The same
\r
1584 // logic applies to literals, but literals are never escaped.
\r
1586 (escapeUnprintable && Utility.isUnprintable(c))) {
\r
1587 if (quoteBuf.length() > 0) {
\r
1588 // We prefer backslash APOSTROPHE to double APOSTROPHE
\r
1589 // (more readable, less similar to ") so if there are
\r
1590 // double APOSTROPHEs at the ends, we pull them outside
\r
1593 // If the first thing in the quoteBuf is APOSTROPHE
\r
1594 // (doubled) then pull it out.
\r
1595 while (quoteBuf.length() >= 2 &&
\r
1596 quoteBuf.charAt(0) == APOSTROPHE &&
\r
1597 quoteBuf.charAt(1) == APOSTROPHE) {
\r
1598 rule.append(BACKSLASH).append(APOSTROPHE);
\r
1599 quoteBuf.delete(0, 2);
\r
1601 // If the last thing in the quoteBuf is APOSTROPHE
\r
1602 // (doubled) then remove and count it and add it after.
\r
1603 int trailingCount = 0;
\r
1604 while (quoteBuf.length() >= 2 &&
\r
1605 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
\r
1606 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
\r
1607 quoteBuf.setLength(quoteBuf.length()-2);
\r
1610 if (quoteBuf.length() > 0) {
\r
1611 rule.append(APOSTROPHE);
\r
1612 rule.append(quoteBuf);
\r
1613 rule.append(APOSTROPHE);
\r
1614 quoteBuf.setLength(0);
\r
1616 while (trailingCount-- > 0) {
\r
1617 rule.append(BACKSLASH).append(APOSTROPHE);
\r
1621 /* Since spaces are ignored during parsing, they are
\r
1622 * emitted only for readability. We emit one here
\r
1623 * only if there isn't already one at the end of the
\r
1627 int len = rule.length();
\r
1628 if (len > 0 && rule.charAt(len-1) != ' ') {
\r
1631 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
\r
1632 rule.appendCodePoint(c);
\r
1637 // Escape ' and '\' and don't begin a quote just for them
\r
1638 else if (quoteBuf.length() == 0 &&
\r
1639 (c == APOSTROPHE || c == BACKSLASH)) {
\r
1640 rule.append(BACKSLASH).append((char)c);
\r
1643 // Specials (printable ascii that isn't [0-9a-zA-Z]) and
\r
1644 // whitespace need quoting. Also append stuff to quotes if we are
\r
1645 // building up a quoted substring already.
\r
1646 else if (quoteBuf.length() > 0 ||
\r
1647 (c >= 0x0021 && c <= 0x007E &&
\r
1648 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
\r
1649 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
\r
1650 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
\r
1651 UCharacterProperty.isRuleWhiteSpace(c)) {
\r
1652 quoteBuf.appendCodePoint(c);
\r
1653 // Double ' within a quote
\r
1654 if (c == APOSTROPHE) {
\r
1655 quoteBuf.append((char)c);
\r
1659 // Otherwise just append
\r
1661 rule.appendCodePoint(c);
\r
1666 * Append the given string to the rule. Calls the single-character
\r
1667 * version of appendToRule for each character.
\r
1669 public static void appendToRule(StringBuffer rule,
\r
1671 boolean isLiteral,
\r
1672 boolean escapeUnprintable,
\r
1673 StringBuffer quoteBuf) {
\r
1674 for (int i=0; i<text.length(); ++i) {
\r
1675 // Okay to process in 16-bit code units here
\r
1676 appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
\r
1681 * Given a matcher reference, which may be null, append its
\r
1682 * pattern as a literal to the given rule.
\r
1684 public static void appendToRule(StringBuffer rule,
\r
1685 UnicodeMatcher matcher,
\r
1686 boolean escapeUnprintable,
\r
1687 StringBuffer quoteBuf) {
\r
1688 if (matcher != null) {
\r
1689 appendToRule(rule, matcher.toPattern(escapeUnprintable),
\r
1690 true, escapeUnprintable, quoteBuf);
\r
1695 * Compares 2 unsigned integers
\r
1696 * @param source 32 bit unsigned integer
\r
1697 * @param target 32 bit unsigned integer
\r
1698 * @return 0 if equals, 1 if source is greater than target and -1
\r
1701 public static final int compareUnsigned(int source, int target)
\r
1703 source += MAGIC_UNSIGNED;
\r
1704 target += MAGIC_UNSIGNED;
\r
1705 if (source < target) {
\r
1708 else if (source > target) {
\r
1715 * Find the highest bit in a positive integer. This is done
\r
1716 * by doing a binary search through the bits.
\r
1718 * @param n is the integer
\r
1720 * @return the bit number of the highest bit, with 0 being
\r
1721 * the low order bit, or -1 if <code>n</code> is not positive
\r
1723 public static final byte highBit(int n)
\r
1731 if (n >= 1 << 16) {
\r
1736 if (n >= 1 << 8) {
\r
1741 if (n >= 1 << 4) {
\r
1746 if (n >= 1 << 2) {
\r
1751 if (n >= 1 << 1) {
\r
1759 * Utility method to take a int[] containing codepoints and return
\r
1760 * a string representation with code units.
\r
1762 public static String valueOf(int[]source){
\r
1763 // TODO: Investigate why this method is not on UTF16 class
\r
1764 StringBuilder result = new StringBuilder(source.length);
\r
1765 for(int i=0; i<source.length; i++){
\r
1766 result.appendCodePoint(source[i]);
\r
1768 return result.toString();
\r
1773 * Utility to duplicate a string count times
\r
1774 * @param s String to be duplicated.
\r
1775 * @param count Number of times to duplicate a string.
\r
1777 public static String repeat(String s, int count) {
\r
1778 if (count <= 0) return "";
\r
1779 if (count == 1) return s;
\r
1780 StringBuilder result = new StringBuilder();
\r
1781 for (int i = 0; i < count; ++i) {
\r
1784 return result.toString();
\r
1787 public static String[] splitString(String src, String target) {
\r
1788 return src.split("\\Q" + target + "\\E");
\r
1792 * Split the string at runs of ascii whitespace characters.
\r
1794 public static String[] splitWhitespace(String src) {
\r
1795 return src.split("\\s+");
\r
1799 * Parse a list of hex numbers and return a string
\r
1800 * @param string String of hex numbers.
\r
1801 * @param minLength Minimal length.
\r
1802 * @param separator Seperator.
\r
1803 * @return A string from hex numbers.
\r
1805 public static String fromHex(String string, int minLength, String separator) {
\r
1806 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
\r
1810 * Parse a list of hex numbers and return a string
\r
1811 * @param string String of hex numbers.
\r
1812 * @param minLength Minimal length.
\r
1813 * @param separator Seperator.
\r
1814 * @return A string from hex numbers.
\r
1816 public static String fromHex(String string, int minLength, Pattern separator) {
\r
1817 StringBuilder buffer = new StringBuilder();
\r
1818 String[] parts = separator.split(string);
\r
1819 for (String part : parts) {
\r
1820 if (part.length() < minLength) {
\r
1821 throw new IllegalArgumentException("code point too short: " + part);
\r
1823 int cp = Integer.parseInt(part, 16);
\r
1824 buffer.appendCodePoint(cp);
\r
1826 return buffer.toString();
\r
1830 * Return a fallback class loader for loading ICU resource
\r
1831 * @return A class loader
\r
1833 public static ClassLoader getFallbackClassLoader() {
\r
1834 ClassLoader cl = Thread.currentThread().getContextClassLoader();
\r
1836 cl = ClassLoader.getSystemClassLoader();
\r
1838 //TODO It is not guaranteed that we can get non-null class loader
\r
1839 // by the Java specification.
\r
1840 throw new RuntimeException("No accessible class loader is available.");
\r