2 **********************************************************************
3 * Copyright (c) 2001-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
10 package com.ibm.icu.text;
11 import com.ibm.icu.impl.Utility;
12 import com.ibm.icu.lang.UCharacter;
15 * A transliterator that converts Unicode escape forms to the
16 * characters they represent. Escape forms have a prefix, a suffix, a
17 * radix, and minimum and maximum digit counts.
19 * <p>This class is package private. It registers several standard
20 * variants with the system which are then accessed via their IDs.
24 class UnescapeTransliterator extends Transliterator {
27 * The encoded pattern specification. The pattern consists of
28 * zero or more forms. Each form consists of a prefix, suffix,
29 * radix, minimum digit count, and maximum digit count. These
30 * values are stored as a five character header. That is, their
31 * numeric values are cast to 16-bit characters and stored in the
32 * string. Following these five characters, the prefix
33 * characters, then suffix characters are stored. Each form thus
34 * takes n+5 characters, where n is the total length of the prefix
35 * and suffix. The end is marked by a header of length one
36 * consisting of the character END.
41 * Special character marking the end of the spec[] array.
43 private static final char END = 0xFFFF; // sentinel: the loops that walk spec[] stop when the next header position holds END
46 * Registers standard variants with the system. Called by
47 * Transliterator during initialization.
49 static void register() {
// Each call below registers a factory under one transliterator ID. The
// factory's getInstance returns a new UnescapeTransliterator built from the
// same ID literal and a literal spec array.
//
// Every row of a spec array describes one escape form, laid out as:
//   prefixLen, suffixLen, radix, minDigits, maxDigits, prefix chars..., suffix chars...
// e.g. "3, 1, 16, 1, 6, '&', '#', 'x', ';'" is prefix "&#x", suffix ";",
// radix 16, between 1 and 6 digits.
50 // Unicode: "U+10FFFF" hex, min=4, max=6
51 Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
52 public Transliterator getInstance(String ID) {
53 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
// prefix "U+", no suffix
54 2, 0, 16, 4, 6, 'U', '+',
60 // Java: "\\uFFFF" hex, min=4, max=4
61 Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
62 public Transliterator getInstance(String ID) {
63 return new UnescapeTransliterator("Hex-Any/Java", new char[] {
// prefix backslash + 'u', no suffix
64 2, 0, 16, 4, 4, '\\', 'u',
70 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
71 Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
72 public Transliterator getInstance(String ID) {
73 return new UnescapeTransliterator("Hex-Any/C", new char[] {
// two forms: 4-digit backslash-u and 8-digit backslash-U
74 2, 0, 16, 4, 4, '\\', 'u',
75 2, 0, 16, 8, 8, '\\', 'U',
81 // XML: "&#x10FFFF;" hex, min=1, max=6
82 Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
83 public Transliterator getInstance(String ID) {
84 return new UnescapeTransliterator("Hex-Any/XML", new char[] {
// prefix "&#x", suffix ";"
85 3, 1, 16, 1, 6, '&', '#', 'x', ';',
91 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
92 Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
93 public Transliterator getInstance(String ID) {
94 return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
// prefix "&#", suffix ";", decimal digits
95 2, 1, 10, 1, 7, '&', '#', ';',
101 // Perl: "\\x{263A}" hex, min=1, max=6
102 Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
103 public Transliterator getInstance(String ID) {
104 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
// prefix backslash + "x{", suffix "}"
105 3, 1, 16, 1, 6, '\\', 'x', '{', '}',
111 // All: Unicode, Java, C, XML, XML10, Perl (listed in the order the forms appear in the array)
112 Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
113 public Transliterator getInstance(String ID) {
114 return new UnescapeTransliterator("Hex-Any", new char[] {
// The 4-digit backslash-u form is shared by Java and C, so it appears once.
115 2, 0, 16, 4, 6, 'U', '+', // Unicode
116 2, 0, 16, 4, 4, '\\', 'u', // Java
117 2, 0, 16, 8, 8, '\\', 'U', // C (surrogates)
118 3, 1, 16, 1, 6, '&', '#', 'x', ';', // XML
119 2, 1, 10, 1, 7, '&', '#', ';', // XML10
120 3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
128 * Package private constructor. Takes the encoded spec array.
130 UnescapeTransliterator(String ID, char spec[]) {
// ID:   the transliterator ID this instance is created under.
// spec: encoded forms, each a five-char header followed by prefix and suffix chars.
// NOTE(review): presumably this array is stored as the spec field read by
// handleTransliterate and addSourceTargetSet - confirm. Those methods walk
// spec[] until they see END and never consult the array length, so the
// caller must terminate the array with END.
136 * Implements {@link Transliterator#handleTransliterate}.
138 protected void handleTransliterate(Replaceable text,
139 Position pos, boolean isIncremental) {
// Scans text between pos.start and pos.limit. At each position every form
// in spec[] is tried in array order; when a form's prefix, digits and
// suffix all match, the escape is replaced by the character whose code
// point the digits spell.
//
// text:          the buffer edited in place via text.replace.
// pos:           supplies the start/limit of the range to process; its
//                contextLimit is shifted by the net length change.
// isIncremental: when true, an escape that is cut off by the limit is
//                treated as a partial match rather than a failed match.
140 int start = pos.start;
141 int limit = pos.limit;
145 while (start < limit) {
146 // Loop over the forms in spec[]. Exit this loop when we
147 // match one of the specs. Exit the outer loop if a
148 // partial match is detected and isIncremental is true.
// No increment clause: ipat is advanced by hand, first over the header and
// then over the prefix/suffix chars at the bottom of the loop.
149 for (ipat = 0; spec[ipat] != END;) {
// Read the five-char header; afterwards ipat indexes the first prefix char.
152 int prefixLen = spec[ipat++];
153 int suffixLen = spec[ipat++];
154 int radix = spec[ipat++];
155 int minDigits = spec[ipat++];
156 int maxDigits = spec[ipat++];
158 // s is a copy of start that is advanced over the
159 // characters as we parse them.
161 boolean match = true;
// Step 1: compare the input against the prefix chars one UTF-16 unit at a time.
163 for (i=0; i<prefixLen; ++i) {
166 // We've already matched a character. This is
167 // a partial match, so we return if in
168 // incremental mode. In non-incremental mode,
169 // go to the next spec.
177 char c = text.charAt(s++);
178 if (c != spec[ipat + i]) {
// Step 2: accumulate digits in the form's radix.
189 // Check for partial match in incremental mode.
190 if (s > start && isIncremental) {
// Digits are read as full code points, so s advances by one or two units.
195 int ch = text.char32At(s);
196 int digit = UCharacter.digit(ch, radix);
200 s += UTF16.getCharCount(ch);
// NOTE(review): u is accumulated with no range check in view. With the
// 8-digit form a value above 0x10FFFF (or an int overflow) looks possible
// before UTF16.valueOf(u) is called - confirm how that case is handled.
201 u = (u * radix) + digit;
// Stop consuming digits once the form's maximum has been reached.
202 if (++digitCount == maxDigits) {
// Too few digits means this form does not match.
207 match = (digitCount >= minDigits);
// Step 3: compare the suffix chars, which are stored directly after the prefix chars.
210 for (i=0; i<suffixLen; ++i) {
212 // Check for partial match in incremental mode.
213 if (s > start && isIncremental) {
219 char c = text.charAt(s++);
220 if (c != spec[ipat + prefixLen + i]) {
227 // At this point, we have a match
// Replace the whole escape [start, s) with the string form of code point u.
228 String str = UTF16.valueOf(u);
229 text.replace(start, s, str);
// The text shrank by (escape length - replacement length); move limit to match.
230 limit -= s - start - str.length();
231 // The following break statement leaves the
232 // loop that is traversing the forms in
233 // spec[]. We then parse the next input
// No match for this form: skip its prefix and suffix chars to reach the next header.
240 ipat += prefixLen + suffixLen;
// Advance the scan position by one whole code point.
244 start += UTF16.getCharCount(text.char32At(start));
// Shift contextLimit by the same net change that was applied to limit.
248 pos.contextLimit += limit - pos.limit;
254 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
257 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
// Collects characters from every form in spec[] (the digits for its radix,
// and the chars stored after its header), restricts them to the filter,
// and if any remain adds them to sourceSet and adds the whole code point
// range 0..0x10FFFF to targetSet.
258 // Each form consists of a prefix, suffix,
259 // * radix, minimum digit count, and maximum digit count. These
260 // * values are stored as a five character header. ...
261 UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
262 UnicodeSet items = new UnicodeSet();
263 StringBuilder buffer = new StringBuilder();
264 for (int i = 0; spec[i] != END;) {
265 // first 5 items are header
// end = index just past this form: header (5) + prefixLen + suffixLen.
266 int end = i + spec[i] + spec[i+1] + 5;
267 int radix = spec[i+2];
// Append one entry per digit value 0..radix-1.
// NOTE(review): presumably appendNumber writes j in the given radix. If it
// emits only upper-case letters, lower-case hex digits would be missing
// from the source set even though handleTransliterate parses digits with
// UCharacter.digit - confirm whether that function accepts lower-case.
268 for (int j = 0; j < radix; ++j) {
269 Utility.appendNumber(buffer, j, radix, 0);
271 // then add the characters
// Walks the prefix and suffix chars that follow the five-char header.
272 for (int j = i + 5; j < end; ++j) {
275 // and go to next block
// Presumably adds each char of the buffer to the set - confirm against UnicodeSet.addAll.
278 items.addAll(buffer.toString());
// Keep only the characters the filter allows.
279 items.retainAll(myFilter);
// Nothing is reported when the filter excludes every collected character.
281 if (items.size() > 0) {
282 sourceSet.addAll(items);
283 targetSet.addAll(0,0x10FFFF); // assume we can produce any character