diff --git a/src/support/utf8.c b/src/support/utf8.c index 28c779b73b58b..ea7e970be6b51 100644 --- a/src/support/utf8.c +++ b/src/support/utf8.c @@ -570,6 +570,8 @@ int u8_isvalid(const char *str, size_t len) return 0; // Check for surrogate chars if (byt == 0xed && *pnt > 0x9f) return 0; + // Check for overlong encoding + if (byt == 0xe0 && *pnt < 0xa0) return 0; pnt += 2; } else { // 4-byte sequence // Must have 3 valid continuation characters diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 521dfa6d52b99..b2e14ac2f5451 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -467,9 +467,17 @@ end end end end + # Check for short three-byte sequences + @test isvalid(String, UInt8[0xe0]) == false + for (rng, flg) in ((0x00:0x9f, false), (0xa0:0xbf, true), (0xc0:0xff, false)) + for cont in rng + @test isvalid(String, UInt8[0xe0, cont]) == false + @test isvalid(String, UInt8[0xe0, cont, 0x80]) == flg + end + end # Check three-byte sequences - for r1 in (0xe0:0xec, 0xee:0xef) - for byt = r1 + for r1 in (0xe1:0xec, 0xee:0xef) + for byt in r1 # Check for short sequence @test isvalid(String, UInt8[byt]) == false for (rng,flg) in ((0x00:0x7f, false), (0x80:0xbf, true), (0xc0:0xff, false))