2 **********************************************************************
3 * Copyright (c) 2001-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/19/2001 aliu Creation.
8 **********************************************************************
10 package com.ibm.icu.text;
11 import com.ibm.icu.impl.Utility;
14 * A transliterator that converts Unicode characters to an escape
15 * form. Examples of escape forms are "U+4E01" and "&#x10FFFF;".
16 * Escape forms have a prefix and suffix, either of which may be
17 * empty, a radix, typically 16 or 10, a minimum digit count,
18 * typically 1, 4, or 8, and a boolean that specifies whether
19 * supplemental characters are handled as 32-bit code points or as two
20 * 16-bit code units. Most escape forms handle 32-bit code points,
21 * but some, such as the Java form, intentionally break them into two
22 * surrogate code units, for backward compatibility.
24 * <p>Some escape forms actually have two different patterns, one for
25 * BMP characters (0..FFFF) and one for supplementals (>FFFF). To
26 * handle this, a second EscapeTransliterator may be defined that
27 * specifies the pattern to be produced for supplementals. An example
28 * of a form that requires this is the C form, which uses "\\uFFFF"
29 * for BMP characters and "\\U0010FFFF" for supplementals.
31 * <p>This class is package private. It registers several standard
32 * variants with the system which are then accessed via their IDs.
36 class EscapeTransliterator extends Transliterator {
39 * The prefix of the escape form; may be empty, but usually isn't.
42 private String prefix;
45 * The suffix of the escape form; often empty. May not be null.
47 private String suffix;
50 * The radix to display the number in. Typically 16 or 10. Must
51 * be in the range 2 to 36.
56 * The minimum number of digits. Typically 1, 4, or 8. Values
57 * less than 1 are equivalent to 1.
59 private int minDigits;
62 * If true, supplementals are handled as 32-bit code points. If
63 * false, they are handled as two 16-bit code units.
65 private boolean grokSupplementals;
68 * The form to be used for supplementals. If this is null then
69 * the same form is used for BMP characters and supplementals. If
70 * this is not null and if grokSupplementals is true then the
71 * prefix, suffix, radix, and minDigits of this object are used
74 private EscapeTransliterator supplementalHandler;
77 * Registers standard variants with the system. Called by
78 * Transliterator during initialization.
80 static void register() { // registers one factory per variant ID; every getInstance call builds a fresh EscapeTransliterator
81 // Unicode: "U+10FFFF" hex, min=4, max=6
82 Transliterator.registerFactory("Any-Hex/Unicode", new Transliterator.Factory() {
83 public Transliterator getInstance(String ID) {
84 return new EscapeTransliterator("Any-Hex/Unicode",
85 "U+", "", 16, 4, true, null); // args after the ID: prefix, suffix, radix, minDigits, grokSupplementals, supplementalHandler
89 // Java: "\\uFFFF" hex, min=4, max=4
90 Transliterator.registerFactory("Any-Hex/Java", new Transliterator.Factory() {
91 public Transliterator getInstance(String ID) {
92 return new EscapeTransliterator("Any-Hex/Java",
93 "\\u", "", 16, 4, false, null); // grokSupplementals=false: each 16-bit code unit is escaped on its own
97 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
98 Transliterator.registerFactory("Any-Hex/C", new Transliterator.Factory() {
99 public Transliterator getInstance(String ID) {
100 return new EscapeTransliterator("Any-Hex/C",
101 "\\u", "", 16, 4, true,
102 new EscapeTransliterator("", "\\U", "", 16, 8, true, null)); // nested handler: 8-digit uppercase-U form, used for code points above FFFF
106 // XML: "&#x10FFFF;" hex, min=1, max=6
107 Transliterator.registerFactory("Any-Hex/XML", new Transliterator.Factory() {
108 public Transliterator getInstance(String ID) {
109 return new EscapeTransliterator("Any-Hex/XML",
110 "&#x", ";", 16, 1, true, null);
114 // XML10: "&#1114111;" dec, min=1, max=7 (not really "Any-Hex")
115 Transliterator.registerFactory("Any-Hex/XML10", new Transliterator.Factory() {
116 public Transliterator getInstance(String ID) {
117 return new EscapeTransliterator("Any-Hex/XML10",
118 "&#", ";", 10, 1, true, null);
122 // Perl: "\\x{263A}" hex, min=1, max=6
123 Transliterator.registerFactory("Any-Hex/Perl", new Transliterator.Factory() {
124 public Transliterator getInstance(String ID) {
125 return new EscapeTransliterator("Any-Hex/Perl",
126 "\\x{", "}", 16, 1, true, null);
131 Transliterator.registerFactory("Any-Hex", new Transliterator.Factory() { // default variant: same parameters as Any-Hex/Java
132 public Transliterator getInstance(String ID) {
133 return new EscapeTransliterator("Any-Hex",
134 "\\u", "", 16, 4, false, null);
140 * Constructs an escape transliterator with the given ID and
141 * parameters. See the class member documentation for details.
143 EscapeTransliterator(String ID, String prefix, String suffix,
144 int radix, int minDigits,
145 boolean grokSupplementals,
146 EscapeTransliterator supplementalHandler) { // NOTE(review): line 147 is absent from this listing; presumably the superclass constructor call that consumes ID - confirm
148 this.prefix = prefix; // text emitted before the digits of each escape
149 this.suffix = suffix; // text emitted after the digits
// NOTE(review): no "this.radix = radix" is visible (line 150 is absent from this listing), yet radix is read in handleTransliterate - confirm it is assigned
151 this.minDigits = minDigits;
152 this.grokSupplementals = grokSupplementals;
153 this.supplementalHandler = supplementalHandler; // may be null; handleTransliterate checks for null before using it
157 * Implements {@link Transliterator#handleTransliterate}.
159 protected void handleTransliterate(Replaceable text,
160 Position pos, boolean incremental) { // rewrites text in place over [pos.start, pos.limit); incremental is not read in the visible lines
161 int start = pos.start;
162 int limit = pos.limit;
164 StringBuilder buf = new StringBuilder(prefix); // scratch buffer, seeded with this object's prefix
165 int prefixLen = prefix.length();
166 boolean redoPrefix = false; // NOTE(review): no later use of redoPrefix is visible in this listing (lines 178-184 are absent) - confirm
168 while (start < limit) {
169 int c = grokSupplementals ? text.char32At(start) : text.charAt(start); // 32-bit read when grokSupplementals, otherwise a single char
170 int charLen = grokSupplementals ? UTF16.getCharCount(c) : 1; // length of the span that will be replaced below
172 if ((c & 0xFFFF0000) != 0 && supplementalHandler != null) { // a bit above the low 16 is set, so c > FFFF: use the handler's format
174 buf.append(supplementalHandler.prefix);
175 Utility.appendNumber(buf, c, supplementalHandler.radix,
176 supplementalHandler.minDigits);
177 buf.append(supplementalHandler.suffix);
// NOTE(review): the else/redoPrefix structure between here and line 185 is absent from this listing - confirm against the full file
185 buf.setLength(prefixLen); // truncate buf to its first prefixLen chars, discarding what an earlier iteration appended
187 Utility.appendNumber(buf, c, radix, minDigits);
191 text.replace(start, start + charLen, buf.toString()); // overwrite the source character(s) with the escape text
192 start += buf.length(); // resume scanning just past the inserted escape
193 limit += buf.length() - charLen; // the end of the range moves by the same length delta as the text
196 pos.contextLimit += limit - pos.limit; // shift by the net length change; NOTE(review): updates to pos.limit and pos.start are not visible in this listing - confirm
202 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
205 public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
206 sourceSet.addAll(getFilterAsUnicodeSet(inputFilter));
207 for (EscapeTransliterator it = this; it != null ; it = it.supplementalHandler) {
208 if (inputFilter.size() != 0) {
209 targetSet.addAll(it.prefix);
210 targetSet.addAll(it.suffix);
211 StringBuilder buffer = new StringBuilder();
212 for (int i = 0; i < it.radix; ++i) {
213 Utility.appendNumber(buffer, i, it.radix, it.minDigits);
215 targetSet.addAll(buffer.toString()); // TODO drop once String is changed to CharSequence in UnicodeSet