2 **********************************************************************
\r
3 * Copyright (c) 2001-2010, International Business Machines
\r
4 * Corporation and others. All Rights Reserved.
\r
5 **********************************************************************
\r
6 * Date Name Description
\r
7 * 11/19/2001 aliu Creation.
\r
8 **********************************************************************
\r
10 package com.ibm.icu.text;
\r
11 import com.ibm.icu.lang.UCharacter;
\r
14 * A transliterator that converts Unicode escape forms to the
\r
15 * characters they represent. Escape forms have a prefix, a suffix, a
\r
16 * radix, and minimum and maximum digit counts.
\r
18 * <p>This class is package private. It registers several standard
\r
19 * variants with the system which are then accessed via their IDs.
\r
23 class UnescapeTransliterator extends Transliterator {
\r
26 * The encoded pattern specification. The pattern consists of
\r
27 * zero or more forms. Each form consists of a prefix, suffix,
\r
28 * radix, minimum digit count, and maximum digit count. These
\r
29 * values are stored as a five character header. That is, their
\r
30 * numeric values are cast to 16-bit characters and stored in the
\r
31 * string. Following these five characters, the prefix
\r
32 * characters, then suffix characters are stored. Each form thus
\r
33 * takes n+5 characters, where n is the total length of the prefix
\r
34 * and suffix. The end is marked by a header of length one
\r
35 * consisting of the character END.
\r
37 private char spec[];
\r
40 * Special character marking the end of the spec[] array.
\r
42 private static final char END = 0xFFFF;
\r
45 * Registers standard variants with the system. Called by
\r
46 * Transliterator during initialization.
\r
48 static void register() {
\r
49 // Unicode: "U+10FFFF" hex, min=4, max=6
\r
50 Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
\r
51 public Transliterator getInstance(String ID) {
\r
52 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
\r
53 2, 0, 16, 4, 6, 'U', '+',
\r
59 // Java: "\\uFFFF" hex, min=4, max=4
\r
60 Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
\r
61 public Transliterator getInstance(String ID) {
\r
62 return new UnescapeTransliterator("Hex-Any/Java", new char[] {
\r
63 2, 0, 16, 4, 4, '\\', 'u',
\r
69 // C: "\\uFFFF" hex, min=4, max=4; "\\U0010FFFF" hex, min=8, max=8
\r
70 Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
\r
71 public Transliterator getInstance(String ID) {
\r
72 return new UnescapeTransliterator("Hex-Any/C", new char[] {
\r
73 2, 0, 16, 4, 4, '\\', 'u',
\r
74 2, 0, 16, 8, 8, '\\', 'U',
\r
80 // XML: "&#x10FFFF;" hex, min=1, max=6
\r
81 Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
\r
82 public Transliterator getInstance(String ID) {
\r
83 return new UnescapeTransliterator("Hex-Any/XML", new char[] {
\r
84 3, 1, 16, 1, 6, '&', '#', 'x', ';',
\r
90 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Hex-Any")
\r
91 Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
\r
92 public Transliterator getInstance(String ID) {
\r
93 return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
\r
94 2, 1, 10, 1, 7, '&', '#', ';',
\r
100 // Perl: "\\x{263A}" hex, min=1, max=6
\r
101 Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
\r
102 public Transliterator getInstance(String ID) {
\r
103 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
\r
104 3, 1, 16, 1, 6, '\\', 'x', '{', '}',
\r
110 // All: Unicode, Java, C, XML, XML10, Perl (listed in the order the forms are tried)
\r
111 Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
\r
112 public Transliterator getInstance(String ID) {
\r
113 return new UnescapeTransliterator("Hex-Any", new char[] {
\r
114 2, 0, 16, 4, 6, 'U', '+', // Unicode
\r
115 2, 0, 16, 4, 4, '\\', 'u', // Java
\r
116 2, 0, 16, 8, 8, '\\', 'U', // C (surrogates)
\r
117 3, 1, 16, 1, 6, '&', '#', 'x', ';', // XML
\r
118 2, 1, 10, 1, 7, '&', '#', ';', // XML10
\r
119 3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
\r
127 * Package private constructor. Takes the encoded spec array.
\r
129 UnescapeTransliterator(String ID, char spec[]) {
\r
135 * Implements {@link Transliterator#handleTransliterate}.
\r
137 protected void handleTransliterate(Replaceable text,
\r
138 Position pos, boolean isIncremental) {
\r
139 int start = pos.start;
\r
140 int limit = pos.limit;
\r
144 while (start < limit) {
\r
145 // Loop over the forms in spec[]. Exit this loop when we
\r
146 // match one of the specs. Exit the outer loop if a
\r
147 // partial match is detected and isIncremental is true.
\r
148 for (j=0, ipat=0; spec[ipat] != END; ++j) {
\r
151 int prefixLen = spec[ipat++];
\r
152 int suffixLen = spec[ipat++];
\r
153 int radix = spec[ipat++];
\r
154 int minDigits = spec[ipat++];
\r
155 int maxDigits = spec[ipat++];
\r
157 // s is a copy of start that is advanced over the
\r
158 // characters as we parse them.
\r
160 boolean match = true;
\r
162 for (i=0; i<prefixLen; ++i) {
\r
165 // We've already matched a character. This is
\r
166 // a partial match, so we return if in
\r
167 // incremental mode. In non-incremental mode,
\r
168 // go to the next spec.
\r
169 if (isIncremental) {
\r
176 char c = text.charAt(s++);
\r
177 if (c != spec[ipat + i]) {
\r
185 int digitCount = 0;
\r
188 // Check for partial match in incremental mode.
\r
189 if (s > start && isIncremental) {
\r
194 int ch = text.char32At(s);
\r
195 int digit = UCharacter.digit(ch, radix);
\r
199 s += UTF16.getCharCount(ch);
\r
200 u = (u * radix) + digit;
\r
201 if (++digitCount == maxDigits) {
\r
206 match = (digitCount >= minDigits);
\r
209 for (i=0; i<suffixLen; ++i) {
\r
211 // Check for partial match in incremental mode.
\r
212 if (s > start && isIncremental) {
\r
218 char c = text.charAt(s++);
\r
219 if (c != spec[ipat + prefixLen + i]) {
\r
226 // At this point, we have a match
\r
227 String str = UTF16.valueOf(u);
\r
228 text.replace(start, s, str);
\r
229 limit -= s - start - str.length();
\r
230 // The following break statement leaves the
\r
231 // loop that is traversing the forms in
\r
232 // spec[]. We then parse the next input
\r
239 ipat += prefixLen + suffixLen;
\r
242 if (start < limit) {
\r
243 start += UTF16.getCharCount(text.char32At(start));
\r
247 pos.contextLimit += limit - pos.limit;
\r