Skip to content

Commit

Permalink
support parsing SQL with encodings other than utf8 (#1312)
Browse files Browse the repository at this point in the history
  • Loading branch information
tangenta authored Sep 13, 2021
1 parent ea70ab7 commit d551970
Show file tree
Hide file tree
Showing 10 changed files with 335 additions and 44 deletions.
10 changes: 10 additions & 0 deletions charset/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ func ValidCharsetAndCollation(cs string, co string) bool {
return ok
}

// GetDefaultCollationLegacy returns the default collation of charset, but only
// for the charsets the old version of the parser supported (utf8, utf8mb4,
// ascii, latin1 and binary). The charset name is matched case-insensitively;
// any other charset yields an "Unknown charset" error.
func GetDefaultCollationLegacy(charset string) (string, error) {
	lowered := strings.ToLower(charset)
	legacy := lowered == CharsetUTF8 ||
		lowered == CharsetUTF8MB4 ||
		lowered == CharsetASCII ||
		lowered == CharsetLatin1 ||
		lowered == CharsetBin
	if !legacy {
		return "", errors.Errorf("Unknown charset %s", charset)
	}
	// The original (non-lowered) name is forwarded on purpose.
	return GetDefaultCollation(charset)
}

// GetDefaultCollation returns the default collation for charset.
func GetDefaultCollation(charset string) (string, error) {
cs, err := GetCharsetInfo(charset)
Expand Down
137 changes: 137 additions & 0 deletions charset/encoding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package charset

import (
"strings"

"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)

const (
// encodingBufferSizeDefault is the size in bytes of the scratch buffer that
// NewEncoding and UpdateEncoding allocate, and that transform falls back to
// after recycling an oversized buffer.
encodingBufferSizeDefault = 1024
// encodingBufferSizeRecycleThreshold is the buffer size above which transform
// replaces the scratch buffer with a default-sized one once a conversion is done.
encodingBufferSizeRecycleThreshold = 4 * 1024

// encodingDefault is the encoding name for which no transcoding is set up:
// NewEncoding and UpdateEncoding do not install an encoder for it.
encodingDefault = "utf-8"
)

// EncodingLabel is an encoding label that is expected to be normalized
// already (trimmed and lowercase). Use Format to build one from raw input.
type EncodingLabel string

// Format trims leading and trailing whitespace (tab, newline, carriage return,
// form feed and space) from label and converts it to lowercase.
func Format(label string) EncodingLabel {
	const whitespace = "\t\n\r\f "
	trimmed := strings.Trim(label, whitespace)
	lowered := strings.ToLower(trimmed)
	return EncodingLabel(lowered)
}

// Formatted converts label to an EncodingLabel without normalizing it.
// Use it only when the label is already trimmed and lowercase; otherwise
// use Format.
func Formatted(label string) EncodingLabel {
return EncodingLabel(label)
}

// Encoding provides an interface to encode/decode a string with a specific encoding.
type Encoding struct {
// enc is the underlying encoding used to build encoders and decoders.
// It is nil when no transcoding has been set up.
enc encoding.Encoding
// name is the encoding name returned by lookup for the requested label.
name string
// charLength reports the byte length of the character at the start of the
// given slice; it is obtained from FindNextCharacterLength and may be nil.
charLength func([]byte) int
// buffer is the scratch destination buffer reused across transform calls.
buffer []byte
}

// Enabled reports whether a non-utf8 encoding is in use, i.e. whether both an
// underlying encoding and a character-length function are set.
func (e *Encoding) Enabled() bool {
	if e.enc == nil || e.charLength == nil {
		return false
	}
	return true
}

// Name returns the name of the current encoding as resolved by lookup when
// the Encoding was created or last updated. It is empty when no name was
// resolved.
func (e *Encoding) Name() string {
return e.name
}

// NewEncoding creates a new Encoding for label.
//
// An empty label yields a zero Encoding. When the label resolves to an
// encoding other than the default (utf-8), the result carries the underlying
// encoding, its character-length function and a freshly allocated scratch
// buffer. In every other case only the resolved name is recorded.
func NewEncoding(label EncodingLabel) *Encoding {
	result := &Encoding{}
	if label == "" {
		return result
	}
	impl, canonical := lookup(label)
	result.name = canonical
	if impl == nil || canonical == encodingDefault {
		// Unknown label or utf-8: nothing to transcode.
		return result
	}
	result.enc = impl
	result.charLength = FindNextCharacterLength(canonical)
	result.buffer = make([]byte, encodingBufferSizeDefault)
	return result
}

// UpdateEncoding switches e to the encoding registered under label while
// keeping the already allocated scratch buffer.
//
// The name, the underlying encoding and the character-length function are
// updated together so that they always describe the same encoding, exactly as
// NewEncoding would set them up. When label is unknown or resolves to the
// default encoding (utf-8), the underlying encoding and the character-length
// function are cleared, so Enabled reports false instead of silently keeping
// the previously configured encoding.
func (e *Encoding) UpdateEncoding(label EncodingLabel) {
	enc, name := lookup(label)
	e.name = name
	if enc != nil && name != encodingDefault {
		e.enc = enc
		e.charLength = FindNextCharacterLength(name)
	} else {
		e.enc = nil
		e.charLength = nil
	}
	// Make sure a scratch buffer exists even if e started as a zero value.
	if len(e.buffer) == 0 {
		e.buffer = make([]byte, encodingBufferSizeDefault)
	}
}

// Encode converts src with the encoder of the underlying encoding and returns
// the result as a string. The boolean is false when part of src could not be
// converted. The underlying encoding must be set (see Enabled); calling
// Encode without one panics.
func (e *Encoding) Encode(src []byte) (string, bool) {
	encoder := e.enc.NewEncoder()
	return e.transform(encoder, src)
}

// Decode converts src with the decoder of the underlying encoding and returns
// the result as a string. The boolean is false when part of src could not be
// converted. The underlying encoding must be set (see Enabled); calling
// Decode without one panics.
func (e *Encoding) Decode(src []byte) (string, bool) {
	decoder := e.enc.NewDecoder()
	return e.transform(decoder, src)
}

// transform runs src through transformer chunk by chunk, writing into the
// reusable scratch buffer, and returns the converted text.
//
// Chunks are sized by e.charLength (4 bytes when it is nil). The boolean
// result is false when at least one position could not be converted; every
// such position is rendered as '?' and one source byte is skipped.
//
// Because each chunk is passed with atEOF == false, the transformer may
// answer transform.ErrShortSrc merely because the chunk boundary cut a
// character in half (for example when the chunk size was computed for a
// different encoding than the one src is written in). That is not invalid
// input: as long as more source bytes exist, the chunk is retried, widened by
// one byte if nothing was consumed. Only a short source at the very end of
// src is treated as unconvertible.
func (e *Encoding) transform(transformer transform.Transformer, src []byte) (string, bool) {
	if len(e.buffer) < len(src) {
		e.buffer = make([]byte, len(src)*2)
	}
	// growBuffer doubles the scratch buffer while keeping what was written.
	growBuffer := func() {
		bigger := make([]byte, len(e.buffer)*2)
		copy(bigger, e.buffer)
		e.buffer = bigger
	}
	// widen is the number of extra bytes appended to the next chunk after the
	// transformer reported that the current chunk ends inside a character.
	var destOffset, srcOffset, widen int
	ok := true
	for {
		nextLen := 4
		if e.charLength != nil {
			nextLen = e.charLength(src[srcOffset:])
		}
		srcEnd := srcOffset + nextLen + widen
		if srcEnd > len(src) {
			srcEnd = len(src)
		}
		nDest, nSrc, err := transformer.Transform(e.buffer[destOffset:], src[srcOffset:srcEnd], false)
		destOffset += nDest
		srcOffset += nSrc
		if nSrc > 0 {
			// Progress was made; the next chunk starts from a fresh position.
			widen = 0
		}
		switch {
		case err == nil:
			if srcOffset >= len(src) {
				result := string(e.buffer[:destOffset])
				if len(e.buffer) > encodingBufferSizeRecycleThreshold {
					// This prevents Encoding from holding too much memory.
					e.buffer = make([]byte, encodingBufferSizeDefault)
				}
				return result, ok
			}
		case err == transform.ErrShortDst:
			growBuffer()
		case err == transform.ErrShortSrc && srcEnd < len(src):
			// The chunk boundary split a character; retry with more input.
			if nSrc == 0 {
				widen++
			}
		default:
			// Unconvertible input: emit a placeholder and skip one byte.
			// The buffer may be exactly full here, so make room first.
			if destOffset >= len(e.buffer) {
				growBuffer()
			}
			e.buffer[destOffset] = byte('?')
			destOffset++
			srcOffset++
			widen = 0
			ok = false
		}
	}
}
35 changes: 34 additions & 1 deletion charset/encoding_table.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ import (
// leading and trailing whitespace.
func Lookup(label string) (e encoding.Encoding, name string) {
	// Format trims surrounding whitespace and lowercases the label, which is
	// the normalization the encodings table expects.
	return lookup(Format(label))
}

// lookup returns the encoding and name registered in the encodings table
// under label. The label is used as is; an unknown label yields a nil
// encoding and an empty name.
func lookup(label EncodingLabel) (e encoding.Encoding, name string) {
	entry := encodings[string(label)]
	e, name = entry.e, entry.name
	return e, name
}

Expand Down Expand Up @@ -258,3 +262,32 @@ var encodings = map[string]struct {
"utf-16le": {unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), "utf-16le"},
"x-user-defined": {charmap.XUserDefined, "x-user-defined"},
}

// FindNextCharacterLength returns the function that reports the byte length
// of the next character for the encoding named label, or nil when no such
// function is registered. It is used in lexer.peek().
func FindNextCharacterLength(label string) func([]byte) int {
	// Indexing a map with a missing key yields the zero value, i.e. a nil func.
	return encodingNextCharacterLength[label]
}

// encodingNextCharacterLength maps an encoding name to a function reporting
// how many bytes the character at the start of its argument occupies, judged
// from the first byte only. An empty argument is reported as length 1.
var encodingNextCharacterLength = map[string]func([]byte) int{
	// https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram
	"gbk": func(bs []byte) int {
		// A byte in the range 00–7F is a single byte that means the same
		// thing as it does in ASCII; anything else starts a two-byte character.
		if len(bs) > 0 && bs[0] >= 0x80 {
			return 2
		}
		return 1
	},
	"utf-8": func(bs []byte) int {
		if len(bs) == 0 {
			return 1
		}
		lead := bs[0]
		switch {
		case lead < 0x80:
			return 1
		case lead < 0xe0:
			return 2
		case lead < 0xf0:
			return 3
		default:
			return 4
		}
	},
}
4 changes: 2 additions & 2 deletions hintparserimpl.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,11 @@ func (hp *hintParser) parse(input string, sqlMode mysql.SQLMode, initPos Pos) ([
hp.result = nil
hp.lexer.reset(input[3:])
hp.lexer.SetSQLMode(sqlMode)
hp.lexer.r.p = Pos{
hp.lexer.r.updatePos(Pos{
Line: initPos.Line,
Col: initPos.Col + 3, // skipped the initial '/*+'
Offset: 0,
}
})
hp.lexer.inBangComment = true // skip the final '*/' (we need the '*/' for reporting warnings)

yyhintParse(&hp.lexer, hp)
Expand Down
Loading

0 comments on commit d551970

Please sign in to comment.