1 # Copyright (C) 2006-2009, Google, International Business Machines Corporation and others. All Rights Reserved.
\r
2 # Regex for recognizing RFC 4646 well-formed tags
\r
3 # http://www.rfc-editor.org/rfc/rfc4646.txt
\r
4 # http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21
\r
6 # The structure requires no forward references, so the definitions appear in the reverse of the RFC's ABNF order (smallest pieces first).
\r
7 # It uses Java/Perl syntax instead of the old ABNF
\r
8 # The uppercase comments are fragments copied from RFC 4646
\r
10 # Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.
\r
12 $alpha = [a-z] ; # ALPHA -- only lowercase is listed, $root turns on (?i) so uppercase matches as well
\r
13 $digit = [0-9] ; # DIGIT
\r
14 $alphanum = [a-z 0-9] ; # ALPHA / DIGIT -- NOTE(review): the blank inside the class is presumably ignored by the tool, confirm
\r
15 $x = x ; # private use singleton -- the one letter that $singleton leaves out
\r
16 $singleton = [a-w y-z 0-9] ; # other singleton -- any letter except x, or a digit (RFC 4646 ABNF lists DIGIT as a singleton too)
\r
17 $s = [-_] ; # separator between subtags -- this file takes the lenient form [-_], a strict parser would use [-] only
\r
19 # Now do the components. The structure is slightly different to allow for capturing the right components.
\r
20 # The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.
\r
22 $language = $alpha{2,8} | $alpha{2,3} $s $alpha{3}; # a language of 2 to 8 letters, or a 2-3 letter language followed by one 3-letter extlang
\r
24 # ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8}
\r
25 # We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).
\r
27 # Note: extlang invalid in Unicode language tags
\r
29 $script = $alpha{4} ; # 4ALPHA -- script subtag, exactly four letters
\r
31 $region = $alpha{2} | $digit{3} ; # 2ALPHA / 3DIGIT -- region subtag, two letters or three digits
\r
33 $variant = (?: $alphanum{5,8} | $digit $alphanum{3} ) ; # 5*8alphanum / (DIGIT 3alphanum) -- one variant subtag, the (?: ) presumably keeps the alternation self-contained where $variantList repeats it
\r
35 $extension = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alphanum)) -- the ABNF's literal "-" is written as $s here
\r
37 $privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum)) -- same shape as $extension, but subtags may be a single character
\r
39 # Define certain grandfathered codes, since otherwise the regex is pretty useless.
\r
40 # Since these are limited, this is safe even with later changes to the registry --
\r
41 # the only oddity is that it might change the type of the tag, and thus
\r
42 # the results from the capturing groups.
\r
43 # http://www.iana.org/assignments/language-subtag-registry
\r
44 # Note that these have to be compared case insensitively, requiring (?i) below.
\r
46 $grandfathered = en $s GB $s oed # literal registry tags, written in mixed case because $root sets (?i)
\r
47 | i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )
\r
48 | no $s (?: bok | nyn )
\r
49 | sgn $s (?: BE $s (?: fr | nl) | CH $s de )
\r
50 | zh $s min $s nan ; # zh-min-nan -- its 3-letter third subtag fits neither $script, $region nor $variant, so it has to be listed here
\r
52 # old: | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );
\r
53 # For well-formedness, we don't need the ones that would otherwise pass.
\r
54 # For validity, they need to be checked.
\r
56 # $grandfatheredWellFormed = (?:
\r
59 # | zh $s (?: guoyu | hakka | xiang )
\r
62 # Unicode locales: but we are shifting to a compatible form
\r
63 # $keyvalue = (?: $alphanum+ \= $alphanum+);
\r
64 # $keywords = ($keyvalue (?: \; $keyvalue)*);
\r
66 # We separate items that we want to capture as a single group
\r
68 $variantList = $variant (?: $s $variant )* ; # one or more variants joined by $s, kept as one unit so $langtag can capture the whole run in a single group
\r
69 $extensionList = $extension (?: $s $extension )* ; # one or more extensions joined by $s, likewise captured as a single group in $langtag
\r
71 $langtag = (?: ( $language ) # required language, first capturing group of this definition
\r
72 (?: $s ( $script ) )? 40% # optional script -- NOTE(review): the N% suffixes look like weights read by the tool rather than regex syntax, confirm
\r
73 (?: $s ( $region ) )? 40% # optional region
\r
74 (?: $s ( $variantList ) )? 10% # optional run of variants, captured as one group
\r
75 (?: $s ( $extensionList ) )? 5% # optional run of extensions, captured as one group
\r
76 (?: $s ( $privateUse ) )? 5%); # optional trailing private-use sequence, the last ) closes the outer (?: group
\r
78 # Here is the final breakdown, with capturing groups for each of these components
\r
79 # The variants, extensions, grandfathered, and private-use may have interior '-'
\r
81 $root = (?i) # case-insensitive
\r
82 (?: $langtag 90% # ordinary tag, components are captured by the groups inside $langtag -- NOTE(review): 90% chosen so the three weights total 100, confirm against the tool
\r
84 | ( $privateUse ) 5% # tag that consists only of a private-use sequence
\r
85 | ( $grandfathered ) 5%) # one of the listed grandfathered tags, this ) closes the (?: opened on the $langtag line
\r
86 # (?: \@ $keywords )? 5%
\r