jars/icu4j-4_4_2-src/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java

   1 /*\r
   2 **********************************************************************\r
   3 *   Copyright (c) 2001-2010, International Business Machines\r
   4 *   Corporation and others.  All Rights Reserved.\r
   5 **********************************************************************\r
   6 *   Date        Name        Description\r
   7 *   11/19/2001  aliu        Creation.\r
   8 **********************************************************************\r
   9 */\r
  10 package com.ibm.icu.text;\r
  11 import com.ibm.icu.lang.UCharacter;\r
  12 \r
  13 /**\r
  14  * A transliterator that converts Unicode escape forms to the\r
  15  * characters they represent.  Escape forms have a prefix, a suffix, a\r
  16  * radix, and minimum and maximum digit counts.\r
  17  *\r
  18  * <p>This class is package private.  It registers several standard\r
  19  * variants with the system which are then accessed via their IDs.\r
  20  *\r
  21  * @author Alan Liu\r
  22  */\r
  23 class UnescapeTransliterator extends Transliterator {\r
  24 \r
  25     /**\r
  26      * The encoded pattern specification.  The pattern consists of\r
  27      * zero or more forms.  Each form consists of a prefix, suffix,\r
  28      * radix, minimum digit count, and maximum digit count.  These\r
  29      * values are stored as a five character header.  That is, their\r
  30      * numeric values are cast to 16-bit characters and stored in the\r
  31      * string.  Following these five characters, the prefix\r
  32      * characters, then suffix characters are stored.  Each form thus\r
  33      * takes n+5 characters, where n is the total length of the prefix\r
  34      * and suffix.  The end is marked by a header of length one\r
  35      * consisting of the character END.\r
  36      */\r
  37     private char spec[];\r
  38 \r
  39     /**\r
  40      * Special character marking the end of the spec[] array.\r
  41      */\r
  42     private static final char END = 0xFFFF;\r
  43 \r
  44     /**\r
  45      * Registers standard variants with the system.  Called by\r
  46      * Transliterator during initialization.\r
  47      */\r
  48     static void register() {\r
  49         // Unicode: "U+10FFFF" hex, min=4, max=6\r
  50         Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {\r
  51             public Transliterator getInstance(String ID) {\r
  52                 return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {\r
  53                     2, 0, 16, 4, 6, 'U', '+',\r
  54                     END\r
  55                 });\r
  56             }\r
  57         });\r
  58         \r
  59         // Java: "\\uFFFF" hex, min=4, max=4\r
  60         Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {\r
  61             public Transliterator getInstance(String ID) {\r
  62                 return new UnescapeTransliterator("Hex-Any/Java", new char[] {\r
  63                     2, 0, 16, 4, 4, '\\', 'u',\r
  64                     END\r
  65                 });\r
  66             }\r
  67         });\r
  68         \r
  69         // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8\r
  70         Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {\r
  71             public Transliterator getInstance(String ID) {\r
  72                 return new UnescapeTransliterator("Hex-Any/C", new char[] {\r
  73                     2, 0, 16, 4, 4, '\\', 'u',\r
  74                     2, 0, 16, 8, 8, '\\', 'U',\r
  75                     END\r
  76                 });\r
  77             }\r
  78         });\r
  79         \r
  80         // XML: "&#x10FFFF;" hex, min=1, max=6\r
  81         Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {\r
  82             public Transliterator getInstance(String ID) {\r
  83                 return new UnescapeTransliterator("Hex-Any/XML", new char[] {\r
  84                     3, 1, 16, 1, 6, '&', '#', 'x', ';',\r
  85                     END\r
  86                 });\r
  87             }\r
  88         });\r
  89 \r
  90         // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")\r
  91         Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {\r
  92             public Transliterator getInstance(String ID) {\r
  93                 return new UnescapeTransliterator("Hex-Any/XML10", new char[] {\r
  94                     2, 1, 10, 1, 7, '&', '#', ';',\r
  95                     END\r
  96                 });\r
  97             }\r
  98         });\r
  99 \r
 100         // Perl: "\\x{263A}" hex, min=1, max=6\r
 101         Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {\r
 102             public Transliterator getInstance(String ID) {\r
 103                 return new UnescapeTransliterator("Hex-Any/Perl", new char[] {\r
 104                     3, 1, 16, 1, 6, '\\', 'x', '{', '}',\r
 105                     END\r
 106                 });\r
 107             }\r
 108         });\r
 109 \r
 110         // All: Java, C, Perl, XML, XML10, Unicode\r
 111         Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {\r
 112             public Transliterator getInstance(String ID) {\r
 113                 return new UnescapeTransliterator("Hex-Any", new char[] {\r
 114                     2, 0, 16, 4, 6, 'U', '+',            // Unicode\r
 115                     2, 0, 16, 4, 4, '\\', 'u',           // Java\r
 116                     2, 0, 16, 8, 8, '\\', 'U',           // C (surrogates)\r
 117                     3, 1, 16, 1, 6, '&', '#', 'x', ';',  // XML\r
 118                     2, 1, 10, 1, 7, '&', '#', ';',       // XML10\r
 119                     3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl\r
 120                     END\r
 121                 });\r
 122             }\r
 123         });\r
 124     }\r
 125 \r
 126     /**\r
 127      * Package private constructor.  Takes the encoded spec array.\r
 128      */\r
 129     UnescapeTransliterator(String ID, char spec[]) {\r
 130         super(ID, null);\r
 131         this.spec = spec;\r
 132     }\r
 133 \r
 134     /**\r
 135      * Implements {@link Transliterator#handleTransliterate}.\r
 136      */\r
 137     protected void handleTransliterate(Replaceable text,\r
 138                                        Position pos, boolean isIncremental) {\r
 139         int start = pos.start;\r
 140         int limit = pos.limit;\r
 141         int i, j, ipat;\r
 142 \r
 143       loop:\r
 144         while (start < limit) {\r
 145             // Loop over the forms in spec[].  Exit this loop when we\r
 146             // match one of the specs.  Exit the outer loop if a\r
 147             // partial match is detected and isIncremental is true.\r
 148             for (j=0, ipat=0; spec[ipat] != END; ++j) {\r
 149 \r
 150                 // Read the header\r
 151                 int prefixLen = spec[ipat++];\r
 152                 int suffixLen = spec[ipat++];\r
 153                 int radix     = spec[ipat++];\r
 154                 int minDigits = spec[ipat++];\r
 155                 int maxDigits = spec[ipat++];\r
 156 \r
 157                 // s is a copy of start that is advanced over the\r
 158                 // characters as we parse them.\r
 159                 int s = start;\r
 160                 boolean match = true;\r
 161 \r
 162                 for (i=0; i<prefixLen; ++i) {\r
 163                     if (s >= limit) {\r
 164                         if (i > 0) {\r
 165                             // We've already matched a character.  This is\r
 166                             // a partial match, so we return if in\r
 167                             // incremental mode.  In non-incremental mode,\r
 168                             // go to the next spec.\r
 169                             if (isIncremental) {\r
 170                                 break loop;\r
 171                             }\r
 172                             match = false;\r
 173                             break;\r
 174                         }\r
 175                     }\r
 176                     char c = text.charAt(s++);\r
 177                     if (c != spec[ipat + i]) {\r
 178                         match = false;\r
 179                         break;\r
 180                     }\r
 181                 }\r
 182 \r
 183                 if (match) {\r
 184                     int u = 0;\r
 185                     int digitCount = 0;\r
 186                     for (;;) {\r
 187                         if (s >= limit) {\r
 188                             // Check for partial match in incremental mode.\r
 189                             if (s > start && isIncremental) {\r
 190                                 break loop;\r
 191                             }\r
 192                             break;\r
 193                         }\r
 194                         int ch = text.char32At(s);\r
 195                         int digit = UCharacter.digit(ch, radix);\r
 196                         if (digit < 0) {\r
 197                             break;\r
 198                         }\r
 199                         s += UTF16.getCharCount(ch);\r
 200                         u = (u * radix) + digit;\r
 201                         if (++digitCount == maxDigits) {\r
 202                             break;\r
 203                         }\r
 204                     }\r
 205 \r
 206                     match = (digitCount >= minDigits);\r
 207 \r
 208                     if (match) {\r
 209                         for (i=0; i<suffixLen; ++i) {\r
 210                             if (s >= limit) {\r
 211                                 // Check for partial match in incremental mode.\r
 212                                 if (s > start && isIncremental) {\r
 213                                     break loop;\r
 214                                 }\r
 215                                 match = false;\r
 216                                 break;\r
 217                             }\r
 218                             char c = text.charAt(s++);\r
 219                             if (c != spec[ipat + prefixLen + i]) {\r
 220                                 match = false;\r
 221                                 break;\r
 222                             }\r
 223                         }\r
 224 \r
 225                         if (match) {\r
 226                             // At this point, we have a match\r
 227                             String str = UTF16.valueOf(u);\r
 228                             text.replace(start, s, str);\r
 229                             limit -= s - start - str.length();\r
 230                             // The following break statement leaves the\r
 231                             // loop that is traversing the forms in\r
 232                             // spec[].  We then parse the next input\r
 233                             // character.\r
 234                             break;\r
 235                         }\r
 236                     }\r
 237                 }\r
 238 \r
 239                 ipat += prefixLen + suffixLen;\r
 240             }\r
 241 \r
 242             if (start < limit) {\r
 243                 start += UTF16.getCharCount(text.char32At(start));\r
 244             }\r
 245         }\r
 246 \r
 247         pos.contextLimit += limit - pos.limit;\r
 248         pos.limit = limit;\r
 249         pos.start = start;\r
 250     }\r
 251 }\r