/*
 *******************************************************************************
 * Copyright (C) 1996-2011, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
7 package com.ibm.icu.impl;
9 import java.io.IOException;
10 import java.util.ArrayList;
11 import java.util.Locale;
12 import java.util.regex.Pattern;
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.text.Replaceable;
16 import com.ibm.icu.text.UTF16;
17 import com.ibm.icu.text.UnicodeMatcher;
19 public final class Utility {
21 private static final char APOSTROPHE = '\'';
22 private static final char BACKSLASH = '\\';
23 private static final int MAGIC_UNSIGNED = 0x80000000;
26 * Convenience utility to compare two Object[]s.
27 * Ought to be in System
29 public final static boolean arrayEquals(Object[] source, Object target) {
30 if (source == null) return (target == null);
31 if (!(target instanceof Object[])) return false;
32 Object[] targ = (Object[]) target;
33 return (source.length == targ.length
34 && arrayRegionMatches(source, 0, targ, 0, source.length));
38 * Convenience utility to compare two int[]s
39 * Ought to be in System
41 public final static boolean arrayEquals(int[] source, Object target) {
42 if (source == null) return (target == null);
43 if (!(target instanceof int[])) return false;
44 int[] targ = (int[]) target;
45 return (source.length == targ.length
46 && arrayRegionMatches(source, 0, targ, 0, source.length));
50 * Convenience utility to compare two double[]s
51 * Ought to be in System
53 public final static boolean arrayEquals(double[] source, Object target) {
54 if (source == null) return (target == null);
55 if (!(target instanceof double[])) return false;
56 double[] targ = (double[]) target;
57 return (source.length == targ.length
58 && arrayRegionMatches(source, 0, targ, 0, source.length));
60 public final static boolean arrayEquals(byte[] source, Object target) {
61 if (source == null) return (target == null);
62 if (!(target instanceof byte[])) return false;
63 byte[] targ = (byte[]) target;
64 return (source.length == targ.length
65 && arrayRegionMatches(source, 0, targ, 0, source.length));
69 * Convenience utility to compare two Object[]s
70 * Ought to be in System
72 public final static boolean arrayEquals(Object source, Object target) {
73 if (source == null) return (target == null);
74 // for some reason, the correct arrayEquals is not being called
75 // so do it by hand for now.
76 if (source instanceof Object[])
77 return(arrayEquals((Object[]) source,target));
78 if (source instanceof int[])
79 return(arrayEquals((int[]) source,target));
80 if (source instanceof double[])
81 return(arrayEquals((double[]) source, target));
82 if (source instanceof byte[])
83 return(arrayEquals((byte[]) source,target));
84 return source.equals(target);
88 * Convenience utility to compare two Object[]s
89 * Ought to be in System.
90 * @param len the length to compare.
91 * The start indices and start+len must be valid.
93 public final static boolean arrayRegionMatches(Object[] source, int sourceStart,
94 Object[] target, int targetStart,
97 int sourceEnd = sourceStart + len;
98 int delta = targetStart - sourceStart;
99 for (int i = sourceStart; i < sourceEnd; i++) {
100 if (!arrayEquals(source[i],target[i + delta]))
107 * Convenience utility to compare two Object[]s
108 * Ought to be in System.
109 * @param len the length to compare.
110 * The start indices and start+len must be valid.
112 public final static boolean arrayRegionMatches(char[] source, int sourceStart,
113 char[] target, int targetStart,
116 int sourceEnd = sourceStart + len;
117 int delta = targetStart - sourceStart;
118 for (int i = sourceStart; i < sourceEnd; i++) {
119 if (source[i]!=target[i + delta])
126 * Convenience utility to compare two int[]s.
127 * @param len the length to compare.
128 * The start indices and start+len must be valid.
129 * Ought to be in System
131 public final static boolean arrayRegionMatches(int[] source, int sourceStart,
132 int[] target, int targetStart,
135 int sourceEnd = sourceStart + len;
136 int delta = targetStart - sourceStart;
137 for (int i = sourceStart; i < sourceEnd; i++) {
138 if (source[i] != target[i + delta])
145 * Convenience utility to compare two arrays of doubles.
146 * @param len the length to compare.
147 * The start indices and start+len must be valid.
148 * Ought to be in System
150 public final static boolean arrayRegionMatches(double[] source, int sourceStart,
151 double[] target, int targetStart,
154 int sourceEnd = sourceStart + len;
155 int delta = targetStart - sourceStart;
156 for (int i = sourceStart; i < sourceEnd; i++) {
157 if (source[i] != target[i + delta])
162 public final static boolean arrayRegionMatches(byte[] source, int sourceStart,
163 byte[] target, int targetStart, int len){
164 int sourceEnd = sourceStart + len;
165 int delta = targetStart - sourceStart;
166 for (int i = sourceStart; i < sourceEnd; i++) {
167 if (source[i] != target[i + delta])
174 * Convenience utility. Does null checks on objects, then calls equals.
176 public final static boolean objectEquals(Object a, Object b) {
178 b == null ? true : false :
179 b == null ? false : a.equals(b);
183 * Convenience utility. Does null checks on objects, then calls compare.
185 public static <T extends Comparable<T>> int checkCompare(T a, T b) {
188 b == null ? 1 : a.compareTo(b);
192 * Convenience utility. Does null checks on object, then calls hashCode.
194 public static int checkHash(Object a) {
195 return a == null ? 0 : a.hashCode();
199 * The ESCAPE character is used during run-length encoding. It signals
200 * a run of identical chars.
202 private static final char ESCAPE = '\uA5A5';
205 * The ESCAPE_BYTE character is used during run-length encoding. It signals
206 * a run of identical bytes.
208 static final byte ESCAPE_BYTE = (byte)0xA5;
211 * Construct a string representing an int array. Use run-length encoding.
212 * A character represents itself, unless it is the ESCAPE character. Then
213 * the following notations are possible:
214 * ESCAPE ESCAPE ESCAPE literal
215 * ESCAPE n c n instances of character c
216 * Since an encoded run occupies 3 characters, we only encode runs of 4 or
217 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
218 * If we encounter a run where n == ESCAPE, we represent this as:
220 * The ESCAPE value is chosen so as not to collide with commonly
223 static public final String arrayToRLEString(int[] a) {
224 StringBuilder buffer = new StringBuilder();
226 appendInt(buffer, a.length);
229 for (int i=1; i<a.length; ++i) {
231 if (s == runValue && runLength < 0xFFFF) {
234 encodeRun(buffer, runValue, runLength);
239 encodeRun(buffer, runValue, runLength);
240 return buffer.toString();
244 * Construct a string representing a short array. Use run-length encoding.
245 * A character represents itself, unless it is the ESCAPE character. Then
246 * the following notations are possible:
247 * ESCAPE ESCAPE ESCAPE literal
248 * ESCAPE n c n instances of character c
249 * Since an encoded run occupies 3 characters, we only encode runs of 4 or
250 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
251 * If we encounter a run where n == ESCAPE, we represent this as:
253 * The ESCAPE value is chosen so as not to collide with commonly
256 static public final String arrayToRLEString(short[] a) {
257 StringBuilder buffer = new StringBuilder();
258 // for (int i=0; i<a.length; ++i) buffer.append((char) a[i]);
259 buffer.append((char) (a.length >> 16));
260 buffer.append((char) a.length);
261 short runValue = a[0];
263 for (int i=1; i<a.length; ++i) {
265 if (s == runValue && runLength < 0xFFFF) ++runLength;
267 encodeRun(buffer, runValue, runLength);
272 encodeRun(buffer, runValue, runLength);
273 return buffer.toString();
277 * Construct a string representing a char array. Use run-length encoding.
278 * A character represents itself, unless it is the ESCAPE character. Then
279 * the following notations are possible:
280 * ESCAPE ESCAPE ESCAPE literal
281 * ESCAPE n c n instances of character c
282 * Since an encoded run occupies 3 characters, we only encode runs of 4 or
283 * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
284 * If we encounter a run where n == ESCAPE, we represent this as:
286 * The ESCAPE value is chosen so as not to collide with commonly
289 static public final String arrayToRLEString(char[] a) {
290 StringBuilder buffer = new StringBuilder();
291 buffer.append((char) (a.length >> 16));
292 buffer.append((char) a.length);
293 char runValue = a[0];
295 for (int i=1; i<a.length; ++i) {
297 if (s == runValue && runLength < 0xFFFF) ++runLength;
299 encodeRun(buffer, (short)runValue, runLength);
304 encodeRun(buffer, (short)runValue, runLength);
305 return buffer.toString();
309 * Construct a string representing a byte array. Use run-length encoding.
310 * Two bytes are packed into a single char, with a single extra zero byte at
311 * the end if needed. A byte represents itself, unless it is the
312 * ESCAPE_BYTE. Then the following notations are possible:
313 * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal
314 * ESCAPE_BYTE n b n instances of byte b
315 * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
316 * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
317 * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
318 * b ESCAPE_BYTE n-1 b
319 * The ESCAPE_BYTE value is chosen so as not to collide with commonly
322 static public final String arrayToRLEString(byte[] a) {
323 StringBuilder buffer = new StringBuilder();
324 buffer.append((char) (a.length >> 16));
325 buffer.append((char) a.length);
326 byte runValue = a[0];
328 byte[] state = new byte[2];
329 for (int i=1; i<a.length; ++i) {
331 if (b == runValue && runLength < 0xFF) ++runLength;
333 encodeRun(buffer, runValue, runLength, state);
338 encodeRun(buffer, runValue, runLength, state);
340 // We must save the final byte, if there is one, by padding
342 if (state[0] != 0) appendEncodedByte(buffer, (byte)0, state);
344 return buffer.toString();
348 * Encode a run, possibly a degenerate run (of < 4 values).
349 * @param length The length of the run; must be > 0 && <= 0xFFFF.
351 private static final <T extends Appendable> void encodeRun(T buffer, int value, int length) {
353 for (int j=0; j<length; ++j) {
354 if (value == ESCAPE) {
355 appendInt(buffer, value);
357 appendInt(buffer, value);
361 if (length == (int) ESCAPE) {
362 if (value == (int) ESCAPE) {
363 appendInt(buffer, ESCAPE);
365 appendInt(buffer, value);
368 appendInt(buffer, ESCAPE);
369 appendInt(buffer, length);
370 appendInt(buffer, value); // Don't need to escape this value
374 private static final <T extends Appendable> void appendInt(T buffer, int value) {
376 buffer.append((char)(value >>> 16));
377 buffer.append((char)(value & 0xFFFF));
378 } catch (IOException e) {
379 throw new IllegalIcuArgumentException(e);
384 * Encode a run, possibly a degenerate run (of < 4 values).
385 * @param length The length of the run; must be > 0 && <= 0xFFFF.
387 private static final <T extends Appendable> void encodeRun(T buffer, short value, int length) {
390 for (int j=0; j<length; ++j) {
391 if (value == (int) ESCAPE)
392 buffer.append(ESCAPE);
393 buffer.append((char) value);
397 if (length == (int) ESCAPE) {
398 if (value == (int) ESCAPE) buffer.append(ESCAPE);
399 buffer.append((char) value);
402 buffer.append(ESCAPE);
403 buffer.append((char) length);
404 buffer.append((char) value); // Don't need to escape this value
406 } catch (IOException e) {
407 throw new IllegalIcuArgumentException(e);
412 * Encode a run, possibly a degenerate run (of < 4 values).
413 * @param length The length of the run; must be > 0 && <= 0xFF.
415 private static final <T extends Appendable> void encodeRun(T buffer, byte value, int length,
418 for (int j=0; j<length; ++j) {
419 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
420 appendEncodedByte(buffer, value, state);
424 if (length == ESCAPE_BYTE) {
425 if (value == ESCAPE_BYTE) appendEncodedByte(buffer, ESCAPE_BYTE, state);
426 appendEncodedByte(buffer, value, state);
429 appendEncodedByte(buffer, ESCAPE_BYTE, state);
430 appendEncodedByte(buffer, (byte)length, state);
431 appendEncodedByte(buffer, value, state); // Don't need to escape this value
436 * Append a byte to the given Appendable, packing two bytes into each
437 * character. The state parameter maintains intermediary data between
439 * @param state A two-element array, with state[0] == 0 if this is the
440 * first byte of a pair, or state[0] != 0 if this is the second byte
441 * of a pair, in which case state[1] is the first byte.
443 private static final <T extends Appendable> void appendEncodedByte(T buffer, byte value,
447 char c = (char) ((state[1] << 8) | (((int) value) & 0xFF));
455 } catch (IOException e) {
456 throw new IllegalIcuArgumentException(e);
461 * Construct an array of ints from a run-length encoded string.
463 static public final int[] RLEStringToIntArray(String s) {
464 int length = getInt(s, 0);
465 int[] array = new int[length];
468 int maxI = s.length() / 2;
469 while (ai < length && i < maxI) {
470 int c = getInt(s, i++);
478 int runValue = getInt(s, i++);
479 for (int j=0; j<runLength; ++j) {
480 array[ai++] = runValue;
489 if (ai != length || i != maxI) {
490 throw new IllegalStateException("Bad run-length encoded int array");
495 static final int getInt(String s, int i) {
496 return (((int) s.charAt(2*i)) << 16) | (int) s.charAt(2*i+1);
500 * Construct an array of shorts from a run-length encoded string.
502 static public final short[] RLEStringToShortArray(String s) {
503 int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
504 short[] array = new short[length];
506 for (int i=2; i<s.length(); ++i) {
507 char c = s.charAt(i);
511 array[ai++] = (short) c;
513 int runLength = (int) c;
514 short runValue = (short) s.charAt(++i);
515 for (int j=0; j<runLength; ++j) array[ai++] = runValue;
519 array[ai++] = (short) c;
524 throw new IllegalStateException("Bad run-length encoded short array");
530 * Construct an array of shorts from a run-length encoded string.
532 static public final char[] RLEStringToCharArray(String s) {
533 int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
534 char[] array = new char[length];
536 for (int i=2; i<s.length(); ++i) {
537 char c = s.charAt(i);
543 int runLength = (int) c;
544 char runValue = s.charAt(++i);
545 for (int j=0; j<runLength; ++j) array[ai++] = runValue;
554 throw new IllegalStateException("Bad run-length encoded short array");
560 * Construct an array of bytes from a run-length encoded string.
562 static public final byte[] RLEStringToByteArray(String s) {
563 int length = (((int) s.charAt(0)) << 16) | ((int) s.charAt(1));
564 byte[] array = new byte[length];
565 boolean nextChar = true;
570 for (int ai=0; ai<length; ) {
571 // This part of the loop places the next byte into the local
572 // variable 'b' each time through the loop. It keeps the
573 // current character in 'c' and uses the boolean 'nextChar'
574 // to see if we've taken both bytes out of 'c' yet.
582 b = (byte) (c & 0xFF);
586 // This part of the loop is a tiny state machine which handles
587 // the parsing of the run-length encoding. This would be simpler
588 // if we could look ahead, but we can't, so we use 'node' to
589 // move between three nodes in the state machine.
593 if (b == ESCAPE_BYTE) {
601 // We have seen one ESCAPE_BYTE; we expect either a second
602 // one, or a run length and value.
603 if (b == ESCAPE_BYTE) {
604 array[ai++] = ESCAPE_BYTE;
609 // Interpret signed byte as unsigned
610 if (runLength < 0) runLength += 0x100;
615 // We have seen an ESCAPE_BYTE and length byte. We interpret
616 // the next byte as the value to be repeated.
617 for (int j=0; j<runLength; ++j) array[ai++] = b;
624 throw new IllegalStateException("Bad run-length encoded byte array");
627 throw new IllegalStateException("Excess data in RLE byte array string");
632 static public String LINE_SEPARATOR = System.getProperty("line.separator");
635 * Format a String for representation in a source file. This includes
636 * breaking it into lines and escaping characters using octal notation
637 * when necessary (control characters and double quotes).
639 static public final String formatForSource(String s) {
640 StringBuilder buffer = new StringBuilder();
641 for (int i=0; i<s.length();) {
642 if (i > 0) buffer.append('+').append(LINE_SEPARATOR);
643 buffer.append(" \"");
645 while (i<s.length() && count<80) {
646 char c = s.charAt(i++);
647 if (c < '\u0020' || c == '"' || c == '\\') {
649 buffer.append("\\n");
651 } else if (c == '\t') {
652 buffer.append("\\t");
654 } else if (c == '\r') {
655 buffer.append("\\r");
658 // Represent control characters, backslash and double quote
659 // using octal notation; otherwise the string we form
660 // won't compile, since Unicode escape sequences are
661 // processed before tokenization.
663 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
664 buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
665 buffer.append(HEX_DIGIT[(c & 0007)]);
669 else if (c <= '\u007E') {
674 buffer.append("\\u");
675 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
676 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
677 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
678 buffer.append(HEX_DIGIT[(c & 0x000F)]);
684 return buffer.toString();
687 static final char[] HEX_DIGIT = {'0','1','2','3','4','5','6','7',
688 '8','9','A','B','C','D','E','F'};
691 * Format a String for representation in a source file. Like
692 * formatForSource but does not do line breaking.
694 static public final String format1ForSource(String s) {
695 StringBuilder buffer = new StringBuilder();
697 for (int i=0; i<s.length();) {
698 char c = s.charAt(i++);
699 if (c < '\u0020' || c == '"' || c == '\\') {
701 buffer.append("\\n");
702 } else if (c == '\t') {
703 buffer.append("\\t");
704 } else if (c == '\r') {
705 buffer.append("\\r");
707 // Represent control characters, backslash and double quote
708 // using octal notation; otherwise the string we form
709 // won't compile, since Unicode escape sequences are
710 // processed before tokenization.
712 buffer.append(HEX_DIGIT[(c & 0700) >> 6]); // HEX_DIGIT works for octal
713 buffer.append(HEX_DIGIT[(c & 0070) >> 3]);
714 buffer.append(HEX_DIGIT[(c & 0007)]);
717 else if (c <= '\u007E') {
721 buffer.append("\\u");
722 buffer.append(HEX_DIGIT[(c & 0xF000) >> 12]);
723 buffer.append(HEX_DIGIT[(c & 0x0F00) >> 8]);
724 buffer.append(HEX_DIGIT[(c & 0x00F0) >> 4]);
725 buffer.append(HEX_DIGIT[(c & 0x000F)]);
729 return buffer.toString();
733 * Convert characters outside the range U+0020 to U+007F to
734 * Unicode escapes, and convert backslash to a double backslash.
736 public static final String escape(String s) {
737 StringBuilder buf = new StringBuilder();
738 for (int i=0; i<s.length(); ) {
739 int c = Character.codePointAt(s, i);
740 i += UTF16.getCharCount(c);
741 if (c >= ' ' && c <= 0x007F) {
743 buf.append("\\\\"); // That is, "\\"
748 boolean four = c <= 0xFFFF;
749 buf.append(four ? "\\u" : "\\U");
750 buf.append(hex(c, four ? 4 : 8));
753 return buf.toString();
756 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
757 static private final char[] UNESCAPE_MAP = {
773 * Convert an escape to a 32-bit code point value. We attempt
774 * to parallel the icu4c unescapeAt() function.
775 * @param offset16 an array containing offset to the character
776 * <em>after</em> the backslash. Upon return offset16[0] will
777 * be updated to point after the escape sequence.
778 * @return character value from 0 to 10FFFF, or -1 on error.
780 public static int unescapeAt(String s, int[] offset16) {
786 int bitsPerDigit = 4;
789 boolean braces = false;
791 /* Check that offset is in range */
792 int offset = offset16[0];
793 int length = s.length();
794 if (offset < 0 || offset >= length) {
798 /* Fetch first UChar after '\\' */
799 c = Character.codePointAt(s, offset);
800 offset += UTF16.getCharCount(c);
802 /* Convert hexadecimal and octal escapes */
812 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
821 dig = UCharacter.digit(c, 8);
825 n = 1; /* Already have first octal digit */
832 while (offset < length && n < maxDig) {
833 c = UTF16.charAt(s, offset);
834 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
838 result = (result << bitsPerDigit) | dig;
839 offset += UTF16.getCharCount(c);
846 if (c != 0x7D /*}*/) {
851 if (result < 0 || result >= 0x110000) {
854 // If an escape sequence specifies a lead surrogate, see
855 // if there is a trail surrogate after it, either as an
856 // escape or as a literal. If so, join them up into a
858 if (offset < length &&
859 UTF16.isLeadSurrogate((char) result)) {
860 int ahead = offset+1;
861 c = s.charAt(offset); // [sic] get 16-bit code unit
862 if (c == '\\' && ahead < length) {
863 int o[] = new int[] { ahead };
864 c = unescapeAt(s, o);
867 if (UTF16.isTrailSurrogate((char) c)) {
869 result = UCharacterProperty.getRawSupplementary(
870 (char) result, (char) c);
873 offset16[0] = offset;
877 /* Convert C-style escapes in table */
878 for (i=0; i<UNESCAPE_MAP.length; i+=2) {
879 if (c == UNESCAPE_MAP[i]) {
880 offset16[0] = offset;
881 return UNESCAPE_MAP[i+1];
882 } else if (c < UNESCAPE_MAP[i]) {
887 /* Map \cX to control-X: X & 0x1F */
888 if (c == 'c' && offset < length) {
889 c = UTF16.charAt(s, offset);
890 offset16[0] = offset + UTF16.getCharCount(c);
894 /* If no special forms are recognized, then consider
895 * the backslash to generically escape the next character. */
896 offset16[0] = offset;
901 * Convert all escapes in a given string using unescapeAt().
902 * @exception IllegalArgumentException if an invalid escape is
905 public static String unescape(String s) {
906 StringBuilder buf = new StringBuilder();
907 int[] pos = new int[1];
908 for (int i=0; i<s.length(); ) {
909 char c = s.charAt(i++);
912 int e = unescapeAt(s, pos);
914 throw new IllegalArgumentException("Invalid escape sequence " +
915 s.substring(i-1, Math.min(i+8, s.length())));
917 buf.appendCodePoint(e);
923 return buf.toString();
927 * Convert all escapes in a given string using unescapeAt().
928 * Leave invalid escape sequences unchanged.
930 public static String unescapeLeniently(String s) {
931 StringBuilder buf = new StringBuilder();
932 int[] pos = new int[1];
933 for (int i=0; i<s.length(); ) {
934 char c = s.charAt(i++);
937 int e = unescapeAt(s, pos);
941 buf.appendCodePoint(e);
948 return buf.toString();
952 * Convert a char to 4 hex uppercase digits. E.g., hex('a') =>
955 public static String hex(long ch) {
960 * Supplies a zero-padded hex representation of an integer (without 0x)
962 static public String hex(long i, int places) {
963 if (i == Long.MIN_VALUE) return "-8000000000000000";
964 boolean negative = i < 0;
968 String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
969 if (result.length() < places) {
970 result = "0000000000000000".substring(result.length(),places) + result;
979 * Convert a string to comma-separated groups of 4 hex uppercase
980 * digits. E.g., hex('ab') => "0041,0042".
982 public static String hex(CharSequence s) {
983 return hex(s, 4, ",", true, new StringBuilder()).toString();
987 * Convert a string to separated groups of hex uppercase
988 * digits. E.g., hex('ab'...) => "0041,0042". Append the output
989 * to the given Appendable.
991 public static <S extends CharSequence, U extends CharSequence, T extends Appendable> T hex(S s, int width, U separator, boolean useCodePoints, T result) {
995 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
996 cp = Character.codePointAt(s, i);
998 result.append(separator);
1000 result.append(hex(cp,width));
1003 for (int i = 0; i < s.length(); ++i) {
1005 result.append(separator);
1007 result.append(hex(s.charAt(i),width));
1011 } catch (IOException e) {
1012 throw new IllegalIcuArgumentException(e);
1016 public static String hex(byte[] o, int start, int end, String separator) {
1017 StringBuilder result = new StringBuilder();
1019 for (int i = start; i < end; ++i) {
1020 if (i != 0) result.append(separator);
1021 result.append(hex(o[i]));
1023 return result.toString();
1027 * Convert a string to comma-separated groups of 4 hex uppercase
1028 * digits. E.g., hex('ab') => "0041,0042".
1030 public static <S extends CharSequence> String hex(S s, int width, S separator) {
1031 return hex(s, width, separator, true, new StringBuilder()).toString();
1035 * Split a string into pieces based on the given divider character
1036 * @param s the string to split
1037 * @param divider the character on which to split. Occurrences of
1038 * this character are not included in the output
1039 * @param output an array to receive the substrings between
1040 * instances of divider. It must be large enough on entry to
1041 * accomodate all output. Adjacent instances of the divider
1042 * character will place empty strings into output. Before
1043 * returning, output is padded out with empty strings.
1045 public static void split(String s, char divider, String[] output) {
1049 for (i = 0; i < s.length(); ++i) {
1050 if (s.charAt(i) == divider) {
1051 output[current++] = s.substring(last,i);
1055 output[current++] = s.substring(last,i);
1056 while (current < output.length) {
1057 output[current++] = "";
1062 * Split a string into pieces based on the given divider character
1063 * @param s the string to split
1064 * @param divider the character on which to split. Occurrences of
1065 * this character are not included in the output
1066 * @return output an array to receive the substrings between
1067 * instances of divider. Adjacent instances of the divider
1068 * character will place empty strings into output.
1070 public static String[] split(String s, char divider) {
1073 ArrayList<String> output = new ArrayList<String>();
1074 for (i = 0; i < s.length(); ++i) {
1075 if (s.charAt(i) == divider) {
1076 output.add(s.substring(last,i));
1080 output.add( s.substring(last,i));
1081 return output.toArray(new String[output.size()]);
1085 * Look up a given string in a string array. Returns the index at
1086 * which the first occurrence of the string was found in the
1087 * array, or -1 if it was not found.
1088 * @param source the string to search for
1089 * @param target the array of zero or more strings in which to
1091 * @return the index of target at which source first occurs, or -1
1094 public static int lookup(String source, String[] target) {
1095 for (int i = 0; i < target.length; ++i) {
1096 if (source.equals(target[i])) return i;
1102 * Parse a single non-whitespace character 'ch', optionally
1103 * preceded by whitespace.
1104 * @param id the string to be parsed
1105 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
1106 * offset of the first character to be parsed. On output, pos[0]
1107 * is the index after the last parsed character. If the parse
1108 * fails, pos[0] will be unchanged.
1109 * @param ch the non-whitespace character to be parsed.
1110 * @return true if 'ch' is seen preceded by zero or more
1111 * whitespace characters.
1113 public static boolean parseChar(String id, int[] pos, char ch) {
1115 pos[0] = PatternProps.skipWhiteSpace(id, pos[0]);
1116 if (pos[0] == id.length() ||
1117 id.charAt(pos[0]) != ch) {
1126 * Parse a pattern string starting at offset pos. Keywords are
1127 * matched case-insensitively. Spaces may be skipped and may be
1128 * optional or required. Integer values may be parsed, and if
1129 * they are, they will be returned in the given array. If
1130 * successful, the offset of the next non-space character is
1131 * returned. On failure, -1 is returned.
1132 * @param pattern must only contain lowercase characters, which
1133 * will match their uppercase equivalents as well. A space
1134 * character matches one or more required spaces. A '~' character
1135 * matches zero or more optional spaces. A '#' character matches
1136 * an integer and stores it in parsedInts, which the caller must
1137 * ensure has enough capacity.
1138 * @param parsedInts array to receive parsed integers. Caller
1139 * must ensure that parsedInts.length is >= the number of '#'
1140 * signs in 'pattern'.
1141 * @return the position after the last character parsed, or -1 if
1144 @SuppressWarnings("fallthrough")
1145 public static int parsePattern(String rule, int pos, int limit,
1146 String pattern, int[] parsedInts) {
1147 // TODO Update this to handle surrogates
1148 int[] p = new int[1];
1149 int intCount = 0; // number of integers parsed
1150 for (int i=0; i<pattern.length(); ++i) {
1151 char cpat = pattern.charAt(i);
1158 c = rule.charAt(pos++);
1159 if (!PatternProps.isWhiteSpace(c)) {
1162 // FALL THROUGH to skipWhitespace
1164 pos = PatternProps.skipWhiteSpace(rule, pos);
1168 parsedInts[intCount++] = parseInteger(rule, p, limit);
1170 // Syntax error; failed to parse integer
1179 c = (char) UCharacter.toLowerCase(rule.charAt(pos++));
1190 * Parse a pattern string within the given Replaceable and a parsing
1191 * pattern. Characters are matched literally and case-sensitively
1192 * except for the following special characters:
1194 * ~ zero or more Pattern_White_Space chars
1196 * If end of pattern is reached with all matches along the way,
1197 * pos is advanced to the first unparsed index and returned.
1198 * Otherwise -1 is returned.
1199 * @param pat pattern that controls parsing
1200 * @param text text to be parsed, starting at index
1201 * @param index offset to first character to parse
1202 * @param limit offset after last character to parse
1203 * @return index after last parsed character, or -1 on parse failure.
1205 public static int parsePattern(String pat,
1211 // empty pattern matches immediately
1212 if (ipat == pat.length()) {
1216 int cpat = Character.codePointAt(pat, ipat);
1218 while (index < limit) {
1219 int c = text.char32At(index);
1223 if (PatternProps.isWhiteSpace(c)) {
1224 index += UTF16.getCharCount(c);
1227 if (++ipat == pat.length()) {
1228 return index; // success; c unparsed
1230 // fall thru; process c again with next cpat
1235 else if (c == cpat) {
1236 int n = UTF16.getCharCount(c);
1239 if (ipat == pat.length()) {
1240 return index; // success; c parsed
1242 // fall thru; get next cpat
1245 // match failure of literal
1250 cpat = UTF16.charAt(pat, ipat);
1253 return -1; // text ended before end of pat
1257 * Parse an integer at pos, either of the form \d+ or of the form
1258 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
1260 * @param pos INPUT-OUTPUT parameter. On input, the first
1261 * character to parse. On output, the character after the last
1264 public static int parseInteger(String rule, int[] pos, int limit) {
1270 if (rule.regionMatches(true, p, "0x", 0, 2)) {
1273 } else if (p < limit && rule.charAt(p) == '0') {
1280 int d = UCharacter.digit(rule.charAt(p++), radix);
1286 int v = (value * radix) + d;
1288 // If there are too many input digits, at some point
1289 // the value will go negative, e.g., if we have seen
1290 // "0x8000000" already and there is another '0', when
1291 // we parse the next 0 the value will go negative.
1303 * Parse a Unicode identifier from the given string at the given
1304 * position. Return the identifier, or null if there is no
1306 * @param str the string to parse
1307 * @param pos INPUT-OUPUT parameter. On INPUT, pos[0] is the
1308 * first character to examine. It must be less than str.length(),
1309 * and it must not point to a whitespace character. That is, must
1310 * have pos[0] < str.length(). On
1311 * OUTPUT, the position after the last parsed character.
1312 * @return the Unicode identifier, or null if there is no valid
1313 * identifier at pos[0].
1315 public static String parseUnicodeIdentifier(String str, int[] pos) {
1316 // assert(pos[0] < str.length());
1317 StringBuilder buf = new StringBuilder();
1319 while (p < str.length()) {
1320 int ch = Character.codePointAt(str, p);
1321 if (buf.length() == 0) {
1322 if (UCharacter.isUnicodeIdentifierStart(ch)) {
1323 buf.appendCodePoint(ch);
1328 if (UCharacter.isUnicodeIdentifierPart(ch)) {
1329 buf.appendCodePoint(ch);
1334 p += UTF16.getCharCount(ch);
1337 return buf.toString();
1340 static final char DIGITS[] = {
1341 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
1342 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
1343 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
1344 'U', 'V', 'W', 'X', 'Y', 'Z'
1348 * Append the digits of a positive integer to the given
1349 * <code>Appendable</code> in the given radix. This is
1350 * done recursively since it is easiest to generate the low-
1351 * order digit first, but it must be appended last.
1353 * @param result is the <code>Appendable</code> to append to
1354 * @param n is the positive integer
1355 * @param radix is the radix, from 2 to 36 inclusive
1356 * @param minDigits is the minimum number of digits to append.
1358 private static <T extends Appendable> void recursiveAppendNumber(T result, int n,
1359 int radix, int minDigits)
1362 int digit = n % radix;
1364 if (n >= radix || minDigits > 1) {
1365 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
1367 result.append(DIGITS[digit]);
1368 } catch (IOException e) {
1369 throw new IllegalIcuArgumentException(e);
1374 * Append a number to the given Appendable in the given radix.
1375 * Standard digits '0'-'9' are used and letters 'A'-'Z' for
1376 * radices 11 through 36.
1377 * @param result the digits of the number are appended here
1378 * @param n the number to be converted to digits; may be negative.
1379 * If negative, a '-' is prepended to the digits.
1380 * @param radix a radix from 2 to 36 inclusive.
1381 * @param minDigits the minimum number of digits, not including
1382 * any '-', to produce. Values less than 2 have no effect. One
1383 * digit is always emitted regardless of this parameter.
1384 * @return a reference to result
1386 public static <T extends Appendable> T appendNumber(T result, int n,
1387 int radix, int minDigits)
1390 if (radix < 2 || radix > 36) {
1391 throw new IllegalArgumentException("Illegal radix " + radix);
1402 recursiveAppendNumber(result, abs, radix, minDigits);
1405 } catch (IOException e) {
1406 throw new IllegalIcuArgumentException(e);
1412 * Parse an unsigned 31-bit integer at the given offset. Use
1413 * UCharacter.digit() to parse individual characters into digits.
1414 * @param text the text to be parsed
1415 * @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
1416 * offset within text at which to start parsing; it should point
1417 * to a valid digit. On exit, pos[0] is the offset after the last
1418 * parsed character. If the parse failed, it will be unchanged on
1419 * exit. Must be >= 0 on entry.
1420 * @param radix the radix in which to parse; must be >= 2 and <=
1422 * @return a non-negative parsed number, or -1 upon parse failure.
1423 * Parse fails if there are no digits, that is, if pos[0] does not
1424 * point to a valid digit on entry, or if the number to be parsed
1425 * does not fit into a 31-bit unsigned integer.
1427 public static int parseNumber(String text, int[] pos, int radix) {
1428 // assert(pos[0] >= 0);
1429 // assert(radix >= 2);
1430 // assert(radix <= 36);
1433 while (p < text.length()) {
1434 int ch = Character.codePointAt(text, p);
1435 int d = UCharacter.digit(ch, radix);
1440 // ASSUME that when a 32-bit integer overflows it becomes
1441 // negative. E.g., 214748364 * 10 + 8 => negative value.
1455 * Return true if the character is NOT printable ASCII. The tab,
1456 * newline and linefeed characters are considered unprintable.
1458 public static boolean isUnprintable(int c) {
1459 //0x20 = 32 and 0x7E = 126
1460 return !(c >= 0x20 && c <= 0x7E);
1464 * Escape unprintable characters using <backslash>uxxxx notation
1465 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
1466 * above. If the character is printable ASCII, then do nothing
1467 * and return FALSE. Otherwise, append the escaped notation and
1470 public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
1472 if (isUnprintable(c)) {
1473 result.append('\\');
1474 if ((c & ~0xFFFF) != 0) {
1476 result.append(DIGITS[0xF&(c>>28)]);
1477 result.append(DIGITS[0xF&(c>>24)]);
1478 result.append(DIGITS[0xF&(c>>20)]);
1479 result.append(DIGITS[0xF&(c>>16)]);
1483 result.append(DIGITS[0xF&(c>>12)]);
1484 result.append(DIGITS[0xF&(c>>8)]);
1485 result.append(DIGITS[0xF&(c>>4)]);
1486 result.append(DIGITS[0xF&c]);
1490 } catch (IOException e) {
1491 throw new IllegalIcuArgumentException(e);
1496 * Returns the index of the first character in a set, ignoring quoted text.
1497 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
1498 * found by a search for "h". Unlike String.indexOf(), this method searches
1499 * not for a single character, but for any character of the string
1500 * <code>setOfChars</code>.
1501 * @param text text to be searched
1502 * @param start the beginning index, inclusive; <code>0 <= start
1504 * @param limit the ending index, exclusive; <code>start <= limit
1505 * <= text.length()</code>.
1506 * @param setOfChars string with one or more distinct characters
1507 * @return Offset of the first character in <code>setOfChars</code>
1508 * found, or -1 if not found.
1509 * @see String#indexOf
1511 public static int quotedIndexOf(String text, int start, int limit,
1512 String setOfChars) {
1513 for (int i=start; i<limit; ++i) {
1514 char c = text.charAt(i);
1515 if (c == BACKSLASH) {
1517 } else if (c == APOSTROPHE) {
1519 && text.charAt(i) != APOSTROPHE) {}
1520 } else if (setOfChars.indexOf(c) >= 0) {
1528 * Append a character to a rule that is being built up. To flush
1529 * the quoteBuf to rule, make one final call with isLiteral == true.
1530 * If there is no final character, pass in (int)-1 as c.
1531 * @param rule the string to append the character to
1532 * @param c the character to append, or (int)-1 if none.
1533 * @param isLiteral if true, then the given character should not be
1534 * quoted or escaped. Usually this means it is a syntactic element
1536 * @param escapeUnprintable if true, then unprintable characters
1537 * should be escaped using escapeUnprintable(). These escapes will
1538 * appear outside of quotes.
1539 * @param quoteBuf a buffer which is used to build up quoted
1540 * substrings. The caller should initially supply an empty buffer,
1541 * and thereafter should not modify the buffer. The buffer should be
1542 * cleared out by, at the end, calling this method with a literal
1543 * character (which may be -1).
// Appends one code point c to the rule text being built, using quoteBuf to
// collect characters that have to be emitted inside a quoted run.
// A literal, or an unprintable character when escapeUnprintable is set, first
// flushes quoteBuf into rule and is then written outside of any quotes.
// NOTE(review): several lines of this method are absent from this listing; the
// added comments describe only the statements that are visible.
1545 public static void appendToRule(StringBuffer rule,
1548 boolean escapeUnprintable,
1549 StringBuffer quoteBuf) {
1550 // If we are escaping unprintables, then escape them outside
1551 // quotes. \\u and \\U are not recognized within quotes. The same
1552 // logic applies to literals, but literals are never escaped.
1554 (escapeUnprintable && Utility.isUnprintable(c))) {
1555 if (quoteBuf.length() > 0) {
1556 // We prefer backslash APOSTROPHE to double APOSTROPHE
1557 // (more readable, less similar to ") so if there are
1558 // double APOSTROPHEs at the ends, we pull them outside
1561 // If the first thing in the quoteBuf is APOSTROPHE
1562 // (doubled) then pull it out.
1563 while (quoteBuf.length() >= 2 &&
1564 quoteBuf.charAt(0) == APOSTROPHE &&
1565 quoteBuf.charAt(1) == APOSTROPHE) {
1566 rule.append(BACKSLASH).append(APOSTROPHE);
1567 quoteBuf.delete(0, 2);
1569 // If the last thing in the quoteBuf is APOSTROPHE
1570 // (doubled) then remove and count it and add it after.
1571 int trailingCount = 0;
1572 while (quoteBuf.length() >= 2 &&
1573 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
1574 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
1575 quoteBuf.setLength(quoteBuf.length()-2);
// Whatever is left in quoteBuf is written to rule wrapped in a pair of
// APOSTROPHEs, and the buffer is then emptied.
1578 if (quoteBuf.length() > 0) {
1579 rule.append(APOSTROPHE);
1580 rule.append(quoteBuf);
1581 rule.append(APOSTROPHE);
1582 quoteBuf.setLength(0);
// Write one BACKSLASH APOSTROPHE pair per count left in trailingCount.
1584 while (trailingCount-- > 0) {
1585 rule.append(BACKSLASH).append(APOSTROPHE);
1589 /* Since spaces are ignored during parsing, they are
1590 * emitted only for readability. We emit one here
1591 * only if there isn't already one at the end of the
1595 int len = rule.length();
1596 if (len > 0 && rule.charAt(len-1) != ' ') {
// Otherwise c is appended as is, unless escapeUnprintable is set and
// Utility.escapeUnprintable reports that it already wrote an escape for c.
1599 } else if (!escapeUnprintable || !Utility.escapeUnprintable(rule, c)) {
1600 rule.appendCodePoint(c);
1605 // Escape ' and '\' and don't begin a quote just for them
1606 else if (quoteBuf.length() == 0 &&
1607 (c == APOSTROPHE || c == BACKSLASH)) {
// The narrowing cast is lossless here: c equals one of the two char constants.
1608 rule.append(BACKSLASH).append((char)c);
1611 // Specials (printable ascii that isn't [0-9a-zA-Z]) and
1612 // whitespace need quoting. Also append stuff to quotes if we are
1613 // building up a quoted substring already.
1614 else if (quoteBuf.length() > 0 ||
1615 (c >= 0x0021 && c <= 0x007E &&
1616 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
1617 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
1618 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
1619 PatternProps.isWhiteSpace(c)) {
// Collect c in the pending quoted run instead of writing it to rule directly.
1620 quoteBuf.appendCodePoint(c);
1621 // Double ' within a quote
1622 if (c == APOSTROPHE) {
1623 quoteBuf.append((char)c);
1627 // Otherwise just append
1629 rule.appendCodePoint(c);
1634 * Append the given string to the rule. Calls the single-character
1635 * version of appendToRule for each character.
1637 public static void appendToRule(StringBuffer rule,
1640 boolean escapeUnprintable,
1641 StringBuffer quoteBuf) {
1642 for (int i=0; i<text.length(); ++i) {
1643 // Okay to process in 16-bit code units here
1644 appendToRule(rule, text.charAt(i), isLiteral, escapeUnprintable, quoteBuf);
1649 * Given a matcher reference, which may be null, append its
1650 * pattern as a literal to the given rule.
1652 public static void appendToRule(StringBuffer rule,
1653 UnicodeMatcher matcher,
1654 boolean escapeUnprintable,
1655 StringBuffer quoteBuf) {
1656 if (matcher != null) {
1657 appendToRule(rule, matcher.toPattern(escapeUnprintable),
1658 true, escapeUnprintable, quoteBuf);
1663 * Compares 2 unsigned integers
1664 * @param source 32 bit unsigned integer
1665 * @param target 32 bit unsigned integer
1666 * @return 0 if equals, 1 if source is greater than target and -1
1669 public static final int compareUnsigned(int source, int target)
1671 source += MAGIC_UNSIGNED;
1672 target += MAGIC_UNSIGNED;
1673 if (source < target) {
1676 else if (source > target) {
1683 * Find the highest bit in a positive integer. This is done
1684 * by doing a binary search through the bits.
1686 * @param n is the integer
1688 * @return the bit number of the highest bit, with 0 being
1689 * the low order bit, or -1 if <code>n</code> is not positive
1691 public static final byte highBit(int n)
1727 * Utility method to take a int[] containing codepoints and return
1728 * a string representation with code units.
1730 public static String valueOf(int[]source){
1731 // TODO: Investigate why this method is not on UTF16 class
1732 StringBuilder result = new StringBuilder(source.length);
1733 for(int i=0; i<source.length; i++){
1734 result.appendCodePoint(source[i]);
1736 return result.toString();
1741 * Utility to duplicate a string count times
1742 * @param s String to be duplicated.
1743 * @param count Number of times to duplicate a string.
1745 public static String repeat(String s, int count) {
1746 if (count <= 0) return "";
1747 if (count == 1) return s;
1748 StringBuilder result = new StringBuilder();
1749 for (int i = 0; i < count; ++i) {
1752 return result.toString();
1755 public static String[] splitString(String src, String target) {
1756 return src.split("\\Q" + target + "\\E");
1760 * Split the string at runs of ascii whitespace characters.
1762 public static String[] splitWhitespace(String src) {
1763 return src.split("\\s+");
1767 * Parse a list of hex numbers and return a string
1768 * @param string String of hex numbers.
1769 * @param minLength Minimal length.
1770 * @param separator Separator.
1771 * @return A string from hex numbers.
1773 public static String fromHex(String string, int minLength, String separator) {
1774 return fromHex(string, minLength, Pattern.compile(separator != null ? separator : "\\s+"));
1778 * Parse a list of hex numbers and return a string
1779 * @param string String of hex numbers.
1780 * @param minLength Minimal length.
1781 * @param separator Separator.
1782 * @return A string from hex numbers.
1784 public static String fromHex(String string, int minLength, Pattern separator) {
1785 StringBuilder buffer = new StringBuilder();
1786 String[] parts = separator.split(string);
1787 for (String part : parts) {
1788 if (part.length() < minLength) {
1789 throw new IllegalArgumentException("code point too short: " + part);
1791 int cp = Integer.parseInt(part, 16);
1792 buffer.appendCodePoint(cp);
1794 return buffer.toString();
1798 * Return a fallback class loader for loading ICU resource
1799 * @return A class loader
1801 public static ClassLoader getFallbackClassLoader() {
1802 ClassLoader cl = Thread.currentThread().getContextClassLoader();
1804 cl = ClassLoader.getSystemClassLoader();
1806 //TODO It is not guaranteed that we can get non-null class loader
1807 // by the Java specification.
1808 throw new RuntimeException("No accessible class loader is available.");