jars/icu4j-52_1/main/classes/translit/src/com/ibm/icu/text/EscapeTransliterator.java

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2011, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   11/19/2001  aliu        Creation.
   8 **********************************************************************
   9 */
  10 package com.ibm.icu.text;
  11 import com.ibm.icu.impl.Utility;
  12
  13 /**
  14  * A transliterator that converts Unicode characters to an escape
  15  * form.  Examples of escape forms are "U+4E01" and "&#x10FFFF;".
  16  * Escape forms have a prefix and suffix, either of which may be
  17  * empty, a radix, typically 16 or 10, a minimum digit count,
  18  * typically 1, 4, or 8, and a boolean that specifies whether
  19  * supplemental characters are handled as 32-bit code points or as two
  20  * 16-bit code units.  Most escape forms handle 32-bit code points,
  21  * but some, such as the Java form, intentionally break them into two
  22  * surrogate code units, for backward compatibility.
  23  *
  24  * <p>Some escape forms actually have two different patterns, one for
  25  * BMP characters (0..FFFF) and one for supplementals (&gt;FFFF).  To
  26  * handle this, a second EscapeTransliterator may be defined that
  27  * specifies the pattern to be produced for supplementals.  An example
  28  * of a form that requires this is the C form, which uses "\\uFFFF"
  29  * for BMP characters and "\\U0010FFFF" for supplementals.
  30  *
  31  * <p>This class is package private.  It registers several standard
  32  * variants with the system which are then accessed via their IDs.
  33  *
  34  * @author Alan Liu
  35  */
  36 class EscapeTransliterator extends Transliterator {
  37
  38     /**
  39      * The prefix of the escape form; may be empty, but usually isn't.
  40      * May not be null.
  41      */
  42     private String prefix;
  43
  44     /**
  45      * The prefix of the escape form; often empty.  May not be null.
  46      */
  47     private String suffix;
  48
  49     /**
  50      * The radix to display the number in.  Typically 16 or 10.  Must
  51      * be in the range 2 to 36.
  52      */
  53     private int radix;
  54
  55     /**
  56      * The minimum number of digits.  Typically 1, 4, or 8.  Values
  57      * less than 1 are equivalent to 1.
  58      */
  59     private int minDigits;
  60
  61     /**
  62      * If true, supplementals are handled as 32-bit code points.  If
  63      * false, they are handled as two 16-bit code units.
  64      */
  65     private boolean grokSupplementals;
  66
  67     /**
  68      * The form to be used for supplementals.  If this is null then
  69      * the same form is used for BMP characters and supplementals.  If
  70      * this is not null and if grokSupplementals is true then the
  71      * prefix, suffix, radix, and minDigits of this object are used
  72      * for supplementals.
  73      */
  74     private EscapeTransliterator supplementalHandler;
  75
  76     /**
  77      * Registers standard variants with the system.  Called by
  78      * Transliterator during initialization.
  79      */
  80     static void register() {
  81         // Unicode: "U+10FFFF" hex, min=4, max=6
  82         Transliterator.registerFactory("Any-Hex/Unicode", new Transliterator.Factory() {
  83             public Transliterator getInstance(String ID) {
  84                 return new EscapeTransliterator("Any-Hex/Unicode",
  85                                                 "U+", "", 16, 4, true, null);
  86             }
  87         });
  88
  89         // Java: "\\uFFFF" hex, min=4, max=4
  90         Transliterator.registerFactory("Any-Hex/Java", new Transliterator.Factory() {
  91             public Transliterator getInstance(String ID) {
  92                 return new EscapeTransliterator("Any-Hex/Java",
  93                                                 "\\u", "", 16, 4, false, null);
  94             }
  95         });
  96
  97         // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  98         Transliterator.registerFactory("Any-Hex/C", new Transliterator.Factory() {
  99             public Transliterator getInstance(String ID) {
 100                 return new EscapeTransliterator("Any-Hex/C",
 101                                                 "\\u", "", 16, 4, true,
 102                        new EscapeTransliterator("", "\\U", "", 16, 8, true, null));
 103             }
 104         });
 105
 106         // XML: "&#x10FFFF;" hex, min=1, max=6
 107         Transliterator.registerFactory("Any-Hex/XML", new Transliterator.Factory() {
 108             public Transliterator getInstance(String ID) {
 109                 return new EscapeTransliterator("Any-Hex/XML",
 110                                                 "&#x", ";", 16, 1, true, null);
 111             }
 112         });
 113
 114         // XML10: "&1114111;" dec, min=1, max=7 (not really "Any-Hex")
 115         Transliterator.registerFactory("Any-Hex/XML10", new Transliterator.Factory() {
 116             public Transliterator getInstance(String ID) {
 117                 return new EscapeTransliterator("Any-Hex/XML10",
 118                                                 "&#", ";", 10, 1, true, null);
 119             }
 120         });
 121
 122         // Perl: "\\x{263A}" hex, min=1, max=6
 123         Transliterator.registerFactory("Any-Hex/Perl", new Transliterator.Factory() {
 124             public Transliterator getInstance(String ID) {
 125                 return new EscapeTransliterator("Any-Hex/Perl",
 126                                                 "\\x{", "}", 16, 1, true, null);
 127             }
 128         });
 129
 130         // Plain: "FFFF" hex, min=4, max=6
 131         Transliterator.registerFactory("Any-Hex/Plain", new Transliterator.Factory() {
 132             public Transliterator getInstance(String ID) {
 133                 return new EscapeTransliterator("Any-Hex/Plain",
 134                                                 "", "", 16, 4, true, null);
 135             }
 136         });
 137
 138         // Generic
 139         Transliterator.registerFactory("Any-Hex", new Transliterator.Factory() {
 140             public Transliterator getInstance(String ID) {
 141                 return new EscapeTransliterator("Any-Hex",
 142                                                 "\\u", "", 16, 4, false, null);
 143             }
 144         });
 145     }
 146
 147     /**
 148      * Constructs an escape transliterator with the given ID and
 149      * parameters.  See the class member documentation for details.
 150      */
 151     EscapeTransliterator(String ID, String prefix, String suffix,
 152                          int radix, int minDigits,
 153                          boolean grokSupplementals,
 154                          EscapeTransliterator supplementalHandler) {
 155         super(ID, null);
 156         this.prefix = prefix;
 157         this.suffix = suffix;
 158         this.radix = radix;
 159         this.minDigits = minDigits;
 160         this.grokSupplementals = grokSupplementals;
 161         this.supplementalHandler = supplementalHandler;
 162     }
 163
 164     /**
 165      * Implements {@link Transliterator#handleTransliterate}.
 166      */
 167     protected void handleTransliterate(Replaceable text,
 168                                        Position pos, boolean incremental) {
 169         int start = pos.start;
 170         int limit = pos.limit;
 171
 172         StringBuilder buf = new StringBuilder(prefix);
 173         int prefixLen = prefix.length();
 174         boolean redoPrefix = false;
 175
 176         while (start < limit) {
 177             int c = grokSupplementals ? text.char32At(start) : text.charAt(start);
 178             int charLen = grokSupplementals ? UTF16.getCharCount(c) : 1;
 179
 180             if ((c & 0xFFFF0000) != 0 && supplementalHandler != null) {
 181                 buf.setLength(0);
 182                 buf.append(supplementalHandler.prefix);
 183                 Utility.appendNumber(buf, c, supplementalHandler.radix,
 184                                      supplementalHandler.minDigits);
 185                 buf.append(supplementalHandler.suffix);
 186                 redoPrefix = true;
 187             } else {
 188                 if (redoPrefix) {
 189                     buf.setLength(0);
 190                     buf.append(prefix);
 191                     redoPrefix = false;
 192                 } else {
 193                     buf.setLength(prefixLen);
 194                 }
 195                 Utility.appendNumber(buf, c, radix, minDigits);
 196                 buf.append(suffix);
 197             }
 198
 199             text.replace(start, start + charLen, buf.toString());
 200             start += buf.length();
 201             limit += buf.length() - charLen;
 202         }
 203
 204         pos.contextLimit += limit - pos.limit;
 205         pos.limit = limit;
 206         pos.start = start;
 207     }
 208
 209     /* (non-Javadoc)
 210      * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 211      */
 212     @Override
 213     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
 214         sourceSet.addAll(getFilterAsUnicodeSet(inputFilter));
 215         for (EscapeTransliterator it = this; it != null ; it = it.supplementalHandler) {
 216             if (inputFilter.size() != 0) {
 217                 targetSet.addAll(it.prefix);
 218                 targetSet.addAll(it.suffix);
 219                 StringBuilder buffer = new StringBuilder();
 220                 for (int i = 0; i < it.radix; ++i) {
 221                     Utility.appendNumber(buffer, i, it.radix, it.minDigits);
 222                 }
 223                 targetSet.addAll(buffer.toString()); // TODO drop once String is changed to CharSequence in UnicodeSet
 224             }
 225         }
 226     }
 227 }