jars/icu4j-4_4_2-src/main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt

   1 # Copyright (C) 2006-2009, Google, International Business Machines Corporation and others. All Rights Reserved.\r
   2 # Regex for recognizing RFC 4646 well-formed tags\r
   3 # http://www.rfc-editor.org/rfc/rfc4646.txt\r
   4 # http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21\r
   5 \r
   6 # The structure requires no forward references, so it reverses the order.\r
   7 # It uses Java/Perl syntax instead of the old ABNF\r
   8 # The uppercase comments are fragments copied from RFC 4646\r
   9 \r
  10 # Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.\r
  11 \r
  12 $alpha  = [a-z] ;   # ALPHA -- lowercase only here, uppercase is accepted via the (?i) flag in $root\r
  13 $digit  = [0-9] ;   # DIGIT\r
  14 $alphanum   = [a-z 0-9] ;   # ALPHA / DIGIT -- same lowercase-only convention as $alpha\r
  15 $x  = x ;   # private use singleton -- the one letter excluded from $singleton\r
  16 $singleton = [a-w y-z 0-9] ; # other singleton: any letter except x, or a digit (the RFC 4646 singleton rule includes DIGIT)\r
  17 $s  = [-_] ; # subtag separator -- this file is lenient and accepts [-_], a strict parser would use [-] only\r
  18 \r
  19 # Now do the components. The structure is slightly different to allow for capturing the right components.\r
  20 # The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.\r
  21 \r
  22 $language   = $alpha{2,8} | $alpha{2,3} $s $alpha{3};\r
  23             \r
  24    # ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA, folded into the single range $alpha{2,8}. Note: regex | tries alternatives left to right, so don't write $alpha{2,3} | $alpha{4,8} \r
  25    # The second alternative is a 2-3 letter language plus one 3-letter extlang. The general (repeated) extlang case is not needed, because there can be only one extlang (except for zh-min-nan, which is listed in $grandfathered).\r
  26 \r
  27 # Note: extlang is invalid in Unicode language tags\r
  28 \r
  29 $script = $alpha{4} ;   # 4ALPHA -- script subtag, exactly four letters \r
  30 \r
  31 $region = $alpha{2} | $digit{3} ;    # 2ALPHA / 3DIGIT -- bare alternation, used only inside ( ... ) in $langtag\r
  32 \r
  33 $variant    = (?: $alphanum{5,8} | $digit $alphanum{3} ) ;  # 5*8alphanum / (DIGIT 3alphanum) -- self-grouped with (?: ), unlike $region, because $variantList concatenates it directly\r
  34 \r
  35 $extension  = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alphanum)) -- one singleton followed by one or more 2-8 character subtags\r
  36 \r
  37 $privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum)) -- like $extension, but introduced by x and allowing 1-character subtags\r
  38 \r
  39 # Define certain grandfathered codes, since otherwise the regex is pretty useless.\r
  40 # Since these are limited, this is safe even with later changes to the registry --\r
  41 # the only oddity is that it might change the type of the tag, and thus\r
  42 # the results from the capturing groups.\r
  43 # http://www.iana.org/assignments/language-subtag-registry\r
  44 # Note that these have to be compared case insensitively, requiring (?i) below.\r
  45 \r
  46 $grandfathered  = en $s GB $s oed\r
  47       | i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )\r
  48       | no $s (?: bok | nyn )\r
  49       | sgn $s (?: BE $s (?: fr | nl) | CH $s de )\r
  50       | zh $s min $s nan;\r
  51 \r
  52 # old:         | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );\r
  53 # For well-formedness, we don't need to list grandfathered tags that would already match without being listed here.\r
  54 # For validity (as opposed to well-formedness), those tags still need to be checked separately.\r
  55 \r
  56 # $grandfatheredWellFormed = (?:\r
  57 #         art $s lojban\r
  58 #     | cel $s gaulish\r
  59 #     | zh $s (?: guoyu | hakka | xiang )\r
  60 # );\r
  61 \r
  62 # Unicode locales: but we are shifting to a compatible form\r
  63 # $keyvalue = (?: $alphanum+ \= $alphanum+);\r
  64 # $keywords = ($keyvalue (?: \; $keyvalue)*);\r
  65 \r
  66 # Runs of repeatable subtags get their own variable, so that $langtag can capture each run as a single group\r
  67 \r
  68 $variantList   = $variant (?: $s $variant )* ; # one or more variants joined by $s\r
  69 $extensionList = $extension (?: $s $extension )* ;   # one or more extensions joined by $s\r
  70 \r
  71 $langtag = (?: ( $language )\r
  72       (?: $s ( $script ) )? 40%\r
  73       (?: $s ( $region ) )? 40%\r
  74       (?: $s ( $variantList ) )? 10%\r
  75       (?: $s ( $extensionList ) )? 5%\r
  76       (?: $s ( $privateUse ) )? 5%); # capturing groups in order: language, script, region, variantList, extensionList, privateUse. NOTE(review): the trailing N% tokens look like weights for the tool rather than regex text -- confirm\r
  77 \r
  78 # Here is the final breakdown: a tag is a $langtag, a standalone private-use tag, or a grandfathered tag, with capturing groups for each of these components\r
  79 # The variants, extensions, grandfathered, and private-use groups may contain interior separators ('-', or '_' when parsed leniently)\r
  80  \r
  81 $root = (?i) # case-insensitive: the character classes are lowercase-only and the grandfathered literals are mixed-case\r
  82   (?:\r
  83       $langtag 90%\r
  84     | ( $privateUse ) 5%\r
  85     | ( $grandfathered ) 5%)\r
  86 #    (?: \@ $keywords )? 5%\r
  87     ;