jars/icu4j-52_1/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2001-2011, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   11/19/2001  aliu        Creation.
   8 **********************************************************************
   9 */
  10 package com.ibm.icu.text;
  11 import com.ibm.icu.impl.Utility;
  12 import com.ibm.icu.lang.UCharacter;
  13
  14 /**
  15  * A transliterator that converts Unicode escape forms to the
  16  * characters they represent.  Escape forms have a prefix, a suffix, a
  17  * radix, and minimum and maximum digit counts.
  18  *
  19  * <p>This class is package private.  It registers several standard
  20  * variants with the system which are then accessed via their IDs.
  21  *
  22  * @author Alan Liu
  23  */
  24 class UnescapeTransliterator extends Transliterator {
  25
  26     /**
  27      * The encoded pattern specification.  The pattern consists of
  28      * zero or more forms.  Each form consists of a prefix, suffix,
  29      * radix, minimum digit count, and maximum digit count.  These
  30      * values are stored as a five character header.  That is, their
  31      * numeric values are cast to 16-bit characters and stored in the
  32      * string.  Following these five characters, the prefix
  33      * characters, then suffix characters are stored.  Each form thus
  34      * takes n+5 characters, where n is the total length of the prefix
  35      * and suffix.  The end is marked by a header of length one
  36      * consisting of the character END.
  37      */
  38     private char spec[];
  39
  40     /**
  41      * Special character marking the end of the spec[] array.
  42      */
  43     private static final char END = 0xFFFF;
  44
  45     /**
  46      * Registers standard variants with the system.  Called by
  47      * Transliterator during initialization.
  48      */
  49     static void register() {
  50         // Unicode: "U+10FFFF" hex, min=4, max=6
  51         Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
  52             public Transliterator getInstance(String ID) {
  53                 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
  54                     2, 0, 16, 4, 6, 'U', '+',
  55                     END
  56                 });
  57             }
  58         });
  59
  60         // Java: "\\uFFFF" hex, min=4, max=4
  61         Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
  62             public Transliterator getInstance(String ID) {
  63                 return new UnescapeTransliterator("Hex-Any/Java", new char[] {
  64                     2, 0, 16, 4, 4, '\\', 'u',
  65                     END
  66                 });
  67             }
  68         });
  69
  70         // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
  71         Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
  72             public Transliterator getInstance(String ID) {
  73                 return new UnescapeTransliterator("Hex-Any/C", new char[] {
  74                     2, 0, 16, 4, 4, '\\', 'u',
  75                     2, 0, 16, 8, 8, '\\', 'U',
  76                     END
  77                 });
  78             }
  79         });
  80
  81         // XML: "&#x10FFFF;" hex, min=1, max=6
  82         Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
  83             public Transliterator getInstance(String ID) {
  84                 return new UnescapeTransliterator("Hex-Any/XML", new char[] {
  85                     3, 1, 16, 1, 6, '&', '#', 'x', ';',
  86                     END
  87                 });
  88             }
  89         });
  90
  91         // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")
  92         Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
  93             public Transliterator getInstance(String ID) {
  94                 return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
  95                     2, 1, 10, 1, 7, '&', '#', ';',
  96                     END
  97                 });
  98             }
  99         });
 100
 101         // Perl: "\\x{263A}" hex, min=1, max=6
 102         Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
 103             public Transliterator getInstance(String ID) {
 104                 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
 105                     3, 1, 16, 1, 6, '\\', 'x', '{', '}',
 106                     END
 107                 });
 108             }
 109         });
 110
 111         // All: Java, C, Perl, XML, XML10, Unicode
 112         Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
 113             public Transliterator getInstance(String ID) {
 114                 return new UnescapeTransliterator("Hex-Any", new char[] {
 115                     2, 0, 16, 4, 6, 'U', '+',            // Unicode
 116                     2, 0, 16, 4, 4, '\\', 'u',           // Java
 117                     2, 0, 16, 8, 8, '\\', 'U',           // C (surrogates)
 118                     3, 1, 16, 1, 6, '&', '#', 'x', ';',  // XML
 119                     2, 1, 10, 1, 7, '&', '#', ';',       // XML10
 120                     3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
 121                     END
 122                 });
 123             }
 124         });
 125     }
 126
 127     /**
 128      * Package private constructor.  Takes the encoded spec array.
 129      */
 130     UnescapeTransliterator(String ID, char spec[]) {
 131         super(ID, null);
 132         this.spec = spec;
 133     }
 134
 135     /**
 136      * Implements {@link Transliterator#handleTransliterate}.
 137      */
 138     protected void handleTransliterate(Replaceable text,
 139                                        Position pos, boolean isIncremental) {
 140         int start = pos.start;
 141         int limit = pos.limit;
 142         int i, ipat;
 143
 144       loop:
 145         while (start < limit) {
 146             // Loop over the forms in spec[].  Exit this loop when we
 147             // match one of the specs.  Exit the outer loop if a
 148             // partial match is detected and isIncremental is true.
 149             for (ipat = 0; spec[ipat] != END;) {
 150
 151                 // Read the header
 152                 int prefixLen = spec[ipat++];
 153                 int suffixLen = spec[ipat++];
 154                 int radix     = spec[ipat++];
 155                 int minDigits = spec[ipat++];
 156                 int maxDigits = spec[ipat++];
 157
 158                 // s is a copy of start that is advanced over the
 159                 // characters as we parse them.
 160                 int s = start;
 161                 boolean match = true;
 162
 163                 for (i=0; i<prefixLen; ++i) {
 164                     if (s >= limit) {
 165                         if (i > 0) {
 166                             // We've already matched a character.  This is
 167                             // a partial match, so we return if in
 168                             // incremental mode.  In non-incremental mode,
 169                             // go to the next spec.
 170                             if (isIncremental) {
 171                                 break loop;
 172                             }
 173                             match = false;
 174                             break;
 175                         }
 176                     }
 177                     char c = text.charAt(s++);
 178                     if (c != spec[ipat + i]) {
 179                         match = false;
 180                         break;
 181                     }
 182                 }
 183
 184                 if (match) {
 185                     int u = 0;
 186                     int digitCount = 0;
 187                     for (;;) {
 188                         if (s >= limit) {
 189                             // Check for partial match in incremental mode.
 190                             if (s > start && isIncremental) {
 191                                 break loop;
 192                             }
 193                             break;
 194                         }
 195                         int ch = text.char32At(s);
 196                         int digit = UCharacter.digit(ch, radix);
 197                         if (digit < 0) {
 198                             break;
 199                         }
 200                         s += UTF16.getCharCount(ch);
 201                         u = (u * radix) + digit;
 202                         if (++digitCount == maxDigits) {
 203                             break;
 204                         }
 205                     }
 206
 207                     match = (digitCount >= minDigits);
 208
 209                     if (match) {
 210                         for (i=0; i<suffixLen; ++i) {
 211                             if (s >= limit) {
 212                                 // Check for partial match in incremental mode.
 213                                 if (s > start && isIncremental) {
 214                                     break loop;
 215                                 }
 216                                 match = false;
 217                                 break;
 218                             }
 219                             char c = text.charAt(s++);
 220                             if (c != spec[ipat + prefixLen + i]) {
 221                                 match = false;
 222                                 break;
 223                             }
 224                         }
 225
 226                         if (match) {
 227                             // At this point, we have a match
 228                             String str = UTF16.valueOf(u);
 229                             text.replace(start, s, str);
 230                             limit -= s - start - str.length();
 231                             // The following break statement leaves the
 232                             // loop that is traversing the forms in
 233                             // spec[].  We then parse the next input
 234                             // character.
 235                             break;
 236                         }
 237                     }
 238                 }
 239
 240                 ipat += prefixLen + suffixLen;
 241             }
 242
 243             if (start < limit) {
 244                 start += UTF16.getCharCount(text.char32At(start));
 245             }
 246         }
 247
 248         pos.contextLimit += limit - pos.limit;
 249         pos.limit = limit;
 250         pos.start = start;
 251     }
 252
 253     /* (non-Javadoc)
 254      * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 255      */
 256     @Override
 257     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
 258         // Each form consists of a prefix, suffix,
 259         // * radix, minimum digit count, and maximum digit count.  These
 260         // * values are stored as a five character header. ...
 261         UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
 262         UnicodeSet items = new UnicodeSet();
 263         StringBuilder buffer = new StringBuilder();
 264         for (int i = 0; spec[i] != END;) {
 265             // first 5 items are header
 266             int end = i + spec[i] + spec[i+1] + 5;
 267             int radix = spec[i+2];
 268             for (int j = 0; j < radix; ++j) {
 269                 Utility.appendNumber(buffer, j, radix, 0);
 270             }
 271             // then add the characters
 272             for (int j = i + 5; j < end; ++j) {
 273                 items.add(spec[j]);
 274             }
 275             // and go to next block
 276             i = end;
 277         }
 278         items.addAll(buffer.toString());
 279         items.retainAll(myFilter);
 280
 281         if (items.size() > 0) {
 282             sourceSet.addAll(items);
 283             targetSet.addAll(0,0x10FFFF); // assume we can produce any character
 284         }
 285     }
 286 }