support parsing SQL with encodings other than utf8 (#1312)

pingcap · Sep 13, 2021 · d551970 · d551970
1 parent ea70ab7
commit d551970
Show file tree

Hide file tree

Showing 10 changed files with 335 additions and 44 deletions.
diff --git a/charset/charset.go b/charset/charset.go
@@ -107,6 +107,16 @@ func ValidCharsetAndCollation(cs string, co string) bool {
 	return ok
 }
 
+// GetDefaultCollationLegacy is compatible with the charset support in old version parser.
+// The charset name is lower-cased before matching. Only CharsetUTF8, CharsetUTF8MB4,
+// CharsetASCII, CharsetLatin1 and CharsetBin are accepted; for those the result of
+// GetDefaultCollation is returned. Any other name yields an "Unknown charset" error.
+func GetDefaultCollationLegacy(charset string) (string, error) {
+	switch strings.ToLower(charset) {
+	case CharsetUTF8, CharsetUTF8MB4, CharsetASCII, CharsetLatin1, CharsetBin:
+		return GetDefaultCollation(charset)
+	default:
+		return "", errors.Errorf("Unknown charset %s", charset)
+	}
+}
+
 // GetDefaultCollation returns the default collation for charset.
 func GetDefaultCollation(charset string) (string, error) {
 	cs, err := GetCharsetInfo(charset)

diff --git a/charset/encoding.go b/charset/encoding.go
@@ -0,0 +1,137 @@
+// Copyright 2021 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package charset
+
+import (
+	"strings"
+
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/transform"
+)
+
+const (
+	// encodingBufferSizeDefault is the size in bytes of a newly allocated scratch buffer.
+	encodingBufferSizeDefault          = 1024
+	// encodingBufferSizeRecycleThreshold is the buffer size above which transform
+	// swaps the scratch buffer for a fresh default-sized one before returning.
+	encodingBufferSizeRecycleThreshold = 4 * 1024
+
+	// encodingDefault is the encoding name for which NewEncoding and
+	// UpdateEncoding do not install a converter.
+	encodingDefault = "utf-8"
+)
+
+// EncodingLabel is an encoding name used as the key when looking up an encoding.
+// Use Format or Formatted to obtain one from a plain string.
+type EncodingLabel string
+
+// Format trims leading and trailing whitespace (tab, newline, carriage return,
+// form feed and space) from label and converts it to lowercase.
+func Format(label string) EncodingLabel {
+	return EncodingLabel(strings.ToLower(strings.Trim(label, "\t\n\r\f ")))
+}
+
+// Formatted converts label to an EncodingLabel without normalizing it.
+// Use it only when the label is already trimmed and lowercase.
+func Formatted(label string) EncodingLabel {
+	return EncodingLabel(label)
+}
+
+// Encoding provides an interface to encode/decode a string with a specific encoding.
+type Encoding struct {
+	enc        encoding.Encoding // underlying converter; nil when no conversion is configured
+	name       string            // encoding name as returned by lookup
+	charLength func([]byte) int  // reports the byte length of the next character; may be nil
+	buffer     []byte            // scratch output buffer reused across transform calls
+}
+
+// Enabled reports whether a non-utf8 encoding is in use, i.e. both the
+// underlying converter and its character-length function are set.
+func (e *Encoding) Enabled() bool {
+	return e.enc != nil && e.charLength != nil
+}
+
+// Name returns the name of the current encoding as resolved by lookup.
+// It is empty for a zero-value Encoding.
+func (e *Encoding) Name() string {
+	return e.name
+}
+
+// NewEncoding creates a new Encoding for label.
+// An empty label yields a zero-value Encoding. When lookup finds an encoding
+// whose name is not encodingDefault, the result carries that converter, its
+// character-length function (nil if none is registered for the name) and a
+// default-sized buffer. Otherwise only the looked-up name is set and no
+// converter is installed.
+func NewEncoding(label EncodingLabel) *Encoding {
+	if len(label) == 0 {
+		return &Encoding{}
+	}
+	e, name := lookup(label)
+	if e != nil && name != encodingDefault {
+		return &Encoding{
+			enc:        e,
+			name:       name,
+			charLength: FindNextCharacterLength(name),
+			buffer:     make([]byte, encodingBufferSizeDefault),
+		}
+	}
+	// Unknown label or the default encoding: keep the name, leave enc nil.
+	return &Encoding{name: name}
+}
+
+// UpdateEncoding switches e to the encoding identified by label, keeping the
+// existing scratch buffer when there is one.
+//
+// The converter and its character-length function are always replaced
+// together. When label is unknown or resolves to encodingDefault, both are
+// cleared so that Name and Enabled cannot disagree about the active encoding.
+func (e *Encoding) UpdateEncoding(label EncodingLabel) {
+	enc, name := lookup(label)
+	e.name = name
+	if enc != nil && name != encodingDefault {
+		e.enc = enc
+		e.charLength = FindNextCharacterLength(name)
+	} else {
+		// Drop any converter left over from a previous encoding; otherwise
+		// e would keep converting with it while reporting the new name.
+		e.enc = nil
+		e.charLength = nil
+	}
+	if len(e.buffer) == 0 {
+		e.buffer = make([]byte, encodingBufferSizeDefault)
+	}
+}
+
+// Encode converts src from UTF-8 to the current encoding and returns the
+// result as a string. The boolean is false when at least one character could
+// not be converted and was replaced with '?'. When no converter is installed
+// (e.enc is nil) src is returned unchanged.
+func (e *Encoding) Encode(src []byte) (string, bool) {
+	if e.enc == nil {
+		return string(src), true
+	}
+	// The input is UTF-8, so it has to be split on UTF-8 character boundaries.
+	// e.charLength describes the target encoding and would cut runes apart.
+	return e.transform(e.enc.NewEncoder(), src, FindNextCharacterLength(encodingDefault))
+}
+
+// Decode converts src from the current encoding to UTF-8 and returns the
+// result as a string. The boolean is false when at least one character could
+// not be converted and was replaced with '?'. When no converter is installed
+// (e.enc is nil) src is returned unchanged.
+func (e *Encoding) Decode(src []byte) (string, bool) {
+	if e.enc == nil {
+		return string(src), true
+	}
+	return e.transform(e.enc.NewDecoder(), src, e.charLength)
+}
+
+// transform feeds src to transformer one character at a time and collects the
+// output in e.buffer. charLength reports the byte length of the next character
+// of src; when it is nil, chunks of up to 4 bytes are used. A chunk that the
+// transformer rejects is replaced with a single '?' and one source byte is
+// skipped; the returned boolean is false if that happened at least once.
+func (e *Encoding) transform(transformer transform.Transformer, src []byte, charLength func([]byte) int) (string, bool) {
+	if len(e.buffer) < len(src) {
+		e.buffer = make([]byte, len(src)*2)
+	}
+	// grow doubles the scratch buffer while keeping what was already written.
+	grow := func() {
+		grown := make([]byte, len(e.buffer)*2+1)
+		copy(grown, e.buffer)
+		e.buffer = grown
+	}
+	var destOffset, srcOffset int
+	ok := true
+	for {
+		nextLen := 4
+		if charLength != nil {
+			nextLen = charLength(src[srcOffset:])
+		}
+		srcEnd := srcOffset + nextLen
+		if srcEnd > len(src) {
+			srcEnd = len(src)
+		}
+		nDest, nSrc, err := transformer.Transform(e.buffer[destOffset:], src[srcOffset:srcEnd], false)
+		destOffset += nDest
+		srcOffset += nSrc
+		switch err {
+		case nil:
+			if srcOffset >= len(src) {
+				result := string(e.buffer[:destOffset])
+				if len(e.buffer) > encodingBufferSizeRecycleThreshold {
+					// This prevents Encoding from holding too much memory.
+					e.buffer = make([]byte, encodingBufferSizeDefault)
+				}
+				return result, ok
+			}
+		case transform.ErrShortDst:
+			grow()
+		default:
+			// The buffer may be exactly full at this point; make room for the
+			// replacement byte instead of indexing past the end.
+			if destOffset >= len(e.buffer) {
+				grow()
+			}
+			e.buffer[destOffset] = byte('?')
+			destOffset++
+			srcOffset++
+			if srcOffset > len(src) {
+				// Keep the next src[srcOffset:] slice expression in range.
+				srcOffset = len(src)
+			}
+			ok = false
+		}
+	}
+}
diff --git a/charset/encoding_table.go b/charset/encoding_table.go
@@ -31,7 +31,11 @@ import (
 // leading and trailing whitespace.
 func Lookup(label string) (e encoding.Encoding, name string) {
 	label = strings.ToLower(strings.Trim(label, "\t\n\r\f "))
-	enc := encodings[label]
+	return lookup(Formatted(label))
+}
+
+// lookup returns the encoding registered under label in the encodings table,
+// together with its name. label is used as-is, without trimming or
+// lower-casing; a label with no entry yields the zero values (nil, "").
+func lookup(label EncodingLabel) (e encoding.Encoding, name string) {
+	enc := encodings[string(label)]
 	return enc.e, enc.name
 }
 
@@ -258,3 +262,32 @@ var encodings = map[string]struct {
 	"utf-16le":            {unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), "utf-16le"},
 	"x-user-defined":      {charmap.XUserDefined, "x-user-defined"},
 }
+
+// FindNextCharacterLength is used in lexer.peek() to determine the next character length.
+// It returns the function registered for label in encodingNextCharacterLength,
+// or nil when label has no entry. label is matched exactly.
+func FindNextCharacterLength(label string) func([]byte) int {
+	if f, ok := encodingNextCharacterLength[label]; ok {
+		return f
+	}
+	return nil
+}
+
+// encodingNextCharacterLength maps an encoding name to a function reporting
+// how many bytes the character at the start of bs occupies. The functions look
+// only at the first byte and do not validate the bytes that follow; an empty
+// slice reports 1.
+var encodingNextCharacterLength = map[string]func([]byte) int{
+	// https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram
+	"gbk": func(bs []byte) int {
+		if len(bs) == 0 || bs[0] < 0x80 {
+			// A byte in the range 00–7F is a single byte that means the same thing as it does in ASCII.
+			return 1
+		}
+		// Any other first byte is treated as the start of a two-byte character.
+		return 2
+	},
+	"utf-8": func(bs []byte) int {
+		if len(bs) == 0 || bs[0] < 0x80 {
+			return 1 // ASCII byte, or empty input
+		} else if bs[0] < 0xe0 {
+			return 2 // first byte in 0x80–0xDF
+		} else if bs[0] < 0xf0 {
+			return 3 // first byte in 0xE0–0xEF
+		}
+		return 4 // first byte 0xF0 or above
+	},
+}
diff --git a/hintparserimpl.go b/hintparserimpl.go
@@ -129,11 +129,11 @@ func (hp *hintParser) parse(input string, sqlMode mysql.SQLMode, initPos Pos) ([
 	hp.result = nil
 	hp.lexer.reset(input[3:])
 	hp.lexer.SetSQLMode(sqlMode)
-	hp.lexer.r.p = Pos{
+	hp.lexer.r.updatePos(Pos{
 		Line:   initPos.Line,
 		Col:    initPos.Col + 3, // skipped the initial '/*+'
 		Offset: 0,
-	}
+	})
 	hp.lexer.inBangComment = true // skip the final '*/' (we need the '*/' for reporting warnings)
 
 	yyhintParse(&hp.lexer, hp)