Skip to content

Commit

Permalink
language: allow variable number of types per key in -u- extension
Browse files Browse the repository at this point in the history
This also fixes CVE-2020-28851. This was an off-by one
error, but is fixed by handling all cases according to the spec.

These valid case seem to be not used in practice much,
if at all, but the main benefit is that it makes all valid BCP 47
language tags also valid -u extensions. Fixing the code
to handle BCP 47 results in cleaner and seemingly more
robust code.

The main difference is as follows. The old impementation
assumed a -u- extension of the form:

    <tag> "-u"  { "-" <attr> } { "-" <key> "-" <type> } [ <otherExtensions> ]

where <attr> and <type> are of length 3-8 and a <key> is of length 2.

According to the spec, though, the format is

    <tag> "-u"  { "-" <attr> } { "-" <key> { "-" <type> } } [ <otherExtensions> ]

So every key may be associated with zero or more types, instead of
exactly one.

The new code now handles this.

The language.Tag.TypeForKey method is now defined to only
return the first entry or nothing at all. This is for backwards
compatibilty reasons.

Fixes golang/go#42535

Change-Id: I23aec4e1c4d8807fc2ffc0eb3a08de2d8150219f
Reviewed-on: https://go-review.googlesource.com/c/text/+/293549
Trust: Marcel van Lohuizen <[email protected]>
Run-TryBot: Marcel van Lohuizen <[email protected]>
TryBot-Result: Go Bot <[email protected]>
Reviewed-by: Russ Cox <[email protected]>
  • Loading branch information
mpvl committed Feb 27, 2021
1 parent 8f690f2 commit e3aa4ad
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 86 deletions.
90 changes: 43 additions & 47 deletions internal/language/language.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,17 @@ func (t Tag) Extensions() []string {
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
//
// If there are multiple types associated with a key, only the first will be
// returned. If there is no type associated with a key, it returns the empty
// string.
func (t Tag) TypeForKey(key string) string {
if start, end, _ := t.findTypeForKey(key); end != start {
return t.str[start:end]
if _, start, end, _ := t.findTypeForKey(key); end != start {
s := t.str[start:end]
if p := strings.IndexByte(s, '-'); p >= 0 {
s = s[:p]
}
return s
}
return ""
}
Expand All @@ -329,13 +337,13 @@ func (t Tag) SetTypeForKey(key, value string) (Tag, error) {

// Remove the setting if value is "".
if value == "" {
start, end, _ := t.findTypeForKey(key)
if start != end {
// Remove key tag and leading '-'.
start -= 4

start, sep, end, _ := t.findTypeForKey(key)
if start != sep {
// Remove a possible empty extension.
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
switch {
case t.str[start-2] != '-': // has previous elements.
case end == len(t.str), // end of string
end+2 < len(t.str) && t.str[end+2] == '-': // end of extension
start -= 2
}
if start == int(t.pVariant) && end == len(t.str) {
Expand Down Expand Up @@ -381,14 +389,14 @@ func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
t.str = string(buf[:uStart+len(b)])
} else {
s := t.str
start, end, hasExt := t.findTypeForKey(key)
if start == end {
start, sep, end, hasExt := t.findTypeForKey(key)
if start == sep {
if hasExt {
b = b[2:]
}
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
t.str = fmt.Sprintf("%s-%s%s", s[:sep], b, s[end:])
} else {
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
t.str = fmt.Sprintf("%s-%s%s", s[:start+3], value, s[end:])
}
}
return t, nil
Expand All @@ -399,21 +407,21 @@ func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
func (t Tag) findTypeForKey(key string) (start, sep, end int, hasExt bool) {
p := int(t.pExt)
if len(key) != 2 || p == len(t.str) || p == 0 {
return p, p, false
return p, p, p, false
}
s := t.str

// Find the correct extension.
for p++; s[p] != 'u'; p++ {
if s[p] > 'u' {
p--
return p, p, false
return p, p, p, false
}
if p = nextExtension(s, p); p == len(s) {
return len(s), len(s), false
return len(s), len(s), len(s), false
}
}
// Proceed to the hyphen following the extension name.
Expand All @@ -424,40 +432,28 @@ func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {

// Iterate over keys until we get the end of a section.
for {
// p points to the hyphen preceding the current token.
if p3 := p + 3; s[p3] == '-' {
// Found a key.
// Check whether we just processed the key that was requested.
if curKey == key {
return start, p, true
}
// Set to the next key and continue scanning type tokens.
curKey = s[p+1 : p3]
if curKey > key {
return p, p, true
}
// Start of the type token sequence.
start = p + 4
// A type is at least 3 characters long.
p += 7 // 4 + 3
} else {
// Attribute or type, which is at least 3 characters long.
p += 4
}
// p points past the third character of a type or attribute.
max := p + 5 // maximum length of token plus hyphen.
if len(s) < max {
max = len(s)
end = p
for p++; p < len(s) && s[p] != '-'; p++ {
}
for ; p < max && s[p] != '-'; p++ {
n := p - end - 1
if n <= 2 && curKey == key {
if sep < end {
sep++
}
return start, sep, end, true
}
// Bail if we have exhausted all tokens or if the next token starts
// a new extension.
if p == len(s) || s[p+2] == '-' {
if curKey == key {
return start, p, true
switch n {
case 0, // invalid string
1: // next extension
return end, end, end, true
case 2:
// next key
curKey = s[end+1 : p]
if curKey > key {
return end, end, end, true
}
return p, p, true
start = end
sep = p
}
}
}
Expand Down
14 changes: 11 additions & 3 deletions internal/language/language_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,9 @@ func TestSetTypeForKey(t *testing.T) {
{"co", "pinyin", "en-u-co-phonebk-cu-xau", "en-u-co-pinyin-cu-xau", false},
{"co", "pinyin", "en-u-co-phonebk-v-xx", "en-u-co-pinyin-v-xx", false},
{"co", "pinyin", "en-u-co-phonebk-x-x", "en-u-co-pinyin-x-x", false},
{"co", "pinyin", "en-u-co-x-x", "en-u-co-pinyin-x-x", false},
{"nu", "arabic", "en-u-co-phonebk-nu-vaai", "en-u-co-phonebk-nu-arabic", false},
{"nu", "arabic", "en-u-co-phonebk-nu", "en-u-co-phonebk-nu-arabic", false},
// add to existing -u extension
{"co", "pinyin", "en-u-ca-gregory", "en-u-ca-gregory-co-pinyin", false},
{"co", "pinyin", "en-u-ca-gregory-nu-vaai", "en-u-ca-gregory-co-pinyin-nu-vaai", false},
Expand All @@ -441,8 +443,12 @@ func TestSetTypeForKey(t *testing.T) {
{"ca", "gregory", "en-u-co-pinyin", "en-u-ca-gregory-co-pinyin", false},
// remove pair
{"co", "", "en-u-co-phonebk", "en", false},
{"co", "", "en-u-co", "en", false},
{"co", "", "en-u-co-v", "en", false},
{"co", "", "en-u-co-v-", "en", false},
{"co", "", "en-u-ca-gregory-co-phonebk", "en-u-ca-gregory", false},
{"co", "", "en-u-co-phonebk-nu-arabic", "en-u-nu-arabic", false},
{"co", "", "en-u-co-nu-arabic", "en-u-nu-arabic", false},
{"co", "", "en", "en", false},
// add -u extension
{"co", "pinyin", "en", "en-u-co-pinyin", false},
Expand Down Expand Up @@ -504,6 +510,8 @@ func TestFindKeyAndType(t *testing.T) {
{"cu", false, "en-a-va-v-va", "en-a-va"},
{"cu", false, "en-x-a", "en"},
// Tags with the -u extension.
{"nu", true, "en-u-cu-nu", "en-u-cu"},
{"cu", true, "en-u-cu-nu", "en-u"},
{"co", true, "en-u-co-standard", "standard"},
{"co", true, "yue-u-co-pinyin", "pinyin"},
{"co", true, "en-u-co-abc", "abc"},
Expand All @@ -519,9 +527,9 @@ func TestFindKeyAndType(t *testing.T) {
{"cu", true, "en-u-co-abc-def-nu-arabic", "en-u-co-abc-def"},
}
for i, tt := range tests {
start, end, hasExt := Make(tt.in).findTypeForKey(tt.key)
if start != end {
res := tt.in[start:end]
start, sep, end, hasExt := Make(tt.in).findTypeForKey(tt.key)
if sep != end {
res := tt.in[sep:end]
if res != tt.out {
t.Errorf("%d:%s: was %q; want %q", i, tt.in, res, tt.out)
}
Expand Down
28 changes: 15 additions & 13 deletions internal/language/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
b = make([]byte, n)
copy(b, s.b[:oldStart])
} else {
b = s.b[:n:n]
b = s.b[:n]
}
copy(b[end:], s.b[oldEnd:])
s.b = b
Expand Down Expand Up @@ -483,7 +483,7 @@ func parseExtensions(scan *scanner) int {
func parseExtension(scan *scanner) int {
start, end := scan.start, scan.end
switch scan.token[0] {
case 'u':
case 'u': // https://www.ietf.org/rfc/rfc6067.txt
attrStart := end
scan.scan()
for last := []byte{}; len(scan.token) > 2; scan.scan() {
Expand All @@ -503,27 +503,29 @@ func parseExtension(scan *scanner) int {
last = scan.token
end = scan.end
}
// Scan key-type sequences. A key is of length 2 and may be followed
// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
var last, key []byte
for attrEnd := end; len(scan.token) == 2; last = key {
key = scan.token
keyEnd := scan.end
end = scan.acceptMinSize(3)
end = scan.end
for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
end = scan.end
}
// TODO: check key value validity
if keyEnd == end || bytes.Compare(key, last) != 1 {
if bytes.Compare(key, last) != 1 || scan.err != nil {
// We have an invalid key or the keys are not sorted.
// Start scanning keys from scratch and reorder.
p := attrEnd + 1
scan.next = p
keys := [][]byte{}
for scan.scan(); len(scan.token) == 2; {
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
if keyEnd != end {
keys = append(keys, scan.b[keyStart:end])
} else {
scan.setError(ErrSyntax)
end = keyStart
keyStart := scan.start
end = scan.end
for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
end = scan.end
}
keys = append(keys, scan.b[keyStart:end])
}
sort.Stable(bytesSort{keys, 2})
if n := len(keys); n > 0 {
Expand All @@ -547,7 +549,7 @@ func parseExtension(scan *scanner) int {
break
}
}
case 't':
case 't': // https://www.ietf.org/rfc/rfc6497.txt
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
Expand Down
23 changes: 11 additions & 12 deletions internal/language/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,13 @@ func parseTests() []parseTest {
{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
{in: "en-u-c", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-co-phonebk", invalid: true},
{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-co-phonebook", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-cu-xau", invalid: true, changed: true},
{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
Expand All @@ -179,9 +179,8 @@ func parseTests() []parseTest {
{in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
// Invalid "u" extension. Drop invalid parts.
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, changed: true},
// LDML spec is not specific about it, but remove duplicates and return an error if the values differ.
{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
// No change as the result is a substring of the original!
Expand Down Expand Up @@ -351,8 +350,8 @@ func TestErrors(t *testing.T) {
{"aa-AB", mkInvalid("AB")},
// ill-formed wins over invalid.
{"ac-u", ErrSyntax},
{"ac-u-ca", ErrSyntax},
{"ac-u-ca-co-pinyin", ErrSyntax},
{"ac-u-ca", mkInvalid("ac")},
{"ac-u-ca-co-pinyin", mkInvalid("ac")},
{"noob", ErrSyntax},
}
for _, tt := range tests {
Expand Down
4 changes: 4 additions & 0 deletions language/language.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,10 @@ func (t Tag) Extensions() []Extension {
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
//
// If there are multiple types associated with a key, only the first will be
// returned. If there is no type associated with a key, it returns the empty
// string.
func (t Tag) TypeForKey(key string) string {
if !compact.Tag(t).MayHaveExtensions() {
if key != "rg" && key != "va" {
Expand Down
13 changes: 13 additions & 0 deletions language/language_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,13 @@ func TestCanonicalize(t *testing.T) {
{"en-GB-u-rg-usz", "en-GB-u-rg-usz", Raw},
{"en-GB-u-rg-usz-va-posix", "en-GB-u-rg-usz-va-posix", Raw},
{"en-GB-u-rg-usz-co-phonebk", "en-GB-u-co-phonebk-rg-usz", Raw},

// CVE-2020-28851
// invalid key-value pair of -u- extension.
{"ES-u-000-00", "es-u-000-00", Raw},
{"ES-u-000-00-v-00", "es-u-000-00-v-00", Raw},
// reordered and unknown extension.
{"ES-v-00-u-000-00", "es-u-000-00-v-00", Raw},
}
for i, tt := range tests {
in, _ := Raw.Parse(tt.in)
Expand Down Expand Up @@ -553,6 +560,12 @@ func TestTypeForKey(t *testing.T) {
{"rg", "en-u-rg-gbzzzz", "gbzzzz"},
{"nu", "en-u-co-phonebk-nu-arabic", "arabic"},
{"kc", "cmn-u-co-stroke", ""},
{"rg", "cmn-u-rg", ""},
{"rg", "cmn-u-rg-co-stroke", ""},
{"co", "cmn-u-rg-co-stroke", "stroke"},
{"co", "cmn-u-co-rg-gbzzzz", ""},
{"rg", "cmn-u-co-rg-gbzzzz", "gbzzzz"},
{"rg", "cmn-u-rg-gbzzzz-nlzzzz", "gbzzzz"},
}
for _, tt := range tests {
if v := Make(tt.in).TypeForKey(tt.key); v != tt.out {
Expand Down
22 changes: 11 additions & 11 deletions language/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,13 @@ func parseTests() []parseTest {
{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
{in: "en-u-c", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-co-phonebk", invalid: true},
{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-co-phonebook", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-cu-xau", invalid: true, changed: true},
{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", invalid: true},
{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
Expand All @@ -117,8 +117,8 @@ func parseTests() []parseTest {
{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
// Invalid "u" extension. Drop invalid parts.
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk"}, invalid: true, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-cu-xau"}, invalid: true},
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, invalid: true, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, invalid: true},
// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
// TODO: Consider eliminating duplicates and returning an error.
{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
Expand Down Expand Up @@ -219,8 +219,8 @@ func TestErrors(t *testing.T) {
{"aa-AB", mkInvalid("AB")},
// ill-formed wins over invalid.
{"ac-u", errSyntax},
{"ac-u-ca", errSyntax},
{"ac-u-ca-co-pinyin", errSyntax},
{"ac-u-ca", mkInvalid("ac")},
{"ac-u-ca-co-pinyin", mkInvalid("ac")},
{"noob", errSyntax},
}
for _, tt := range tests {
Expand Down

0 comments on commit e3aa4ad

Please sign in to comment.