Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync lexer punctuations with ZetaSQL #182

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ func (l *Lexer) consumeToken() {
}

switch l.peek(0) {
case '(', ')', '{', '}', ';', ',', '[', ']', '~', '*', '/', '&', '^', '+', '-':
case '(', ')', '{', '}', ';', ',', '[', ']', '~', '*', '/', '&', '^', '%', ':',
// The following are not yet used in Spanner.
'?', '\\', '$':
l.Token.Kind = token.TokenKind([]byte{l.skip()})
return
case '.':
Expand Down Expand Up @@ -151,6 +153,32 @@ func (l *Lexer) consumeToken() {
l.Token.Kind = ">"
}
return
case '+':
switch {
// KW_ADD_ASSIGN in ZetaSQL
case l.peekIs(1, '='):
l.skipN(2)
l.Token.Kind = "+="
default:
l.skip()
l.Token.Kind = "+"
}
return
case '-':
switch {
// KW_SUB_ASSIGN in ZetaSQL
case l.peekIs(1, '='):
l.skipN(2)
l.Token.Kind = "-="
// KW_LAMBDA_ARROW in ZetaSQL
case l.peekIs(1, '>'):
l.skipN(2)
l.Token.Kind = "->"
default:
l.skip()
l.Token.Kind = "-"
}
return
case '=':
switch {
case l.peekIs(1, '>'):
Expand All @@ -163,6 +191,9 @@ func (l *Lexer) consumeToken() {
return
case '|':
switch {
case l.peekIs(1, '>'):
l.skipN(2)
l.Token.Kind = "|>"
case l.peekIs(1, '|'):
l.skipN(2)
l.Token.Kind = "||"
Expand All @@ -177,7 +208,16 @@ func (l *Lexer) consumeToken() {
l.Token.Kind = "!="
return
}
l.skip()
l.Token.Kind = "!"
return
case '@':
// KW_DOUBLE_AT is not yet used in Cloud Spanner, but used in BigQuery.
if l.peekIs(1, '@') {
l.skipN(2)
l.Token.Kind = "@@"
return
}
if l.peekOk(1) && char.IsIdentStart(l.peek(1)) {
i := 1
for l.peekOk(i) && char.IsIdentPart(l.peek(i)) {
Expand Down
55 changes: 34 additions & 21 deletions lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,48 @@ import (
. "github.com/cloudspannerecosystem/memefish/token"
)

// Keep same order https://github.com/google/zetasql/blob/master/zetasql/parser/flex_tokenizer.l
var symbols = []string{
".",
",",
";",
"(",
")",
"{",
"}",
"[",
"{",
")",
"]",
"@",
"~",
"+",
"-",
"}",
"*",
"/",
"&",
"^",
"|",
"||",
",",
"=",
"<",
"<<",
"+=",
"-=",
"!=",
"<=",
"<>",
"<<",
"=>",
"->",
"<",
">",
">>",
">=",
"!=",
"||",
"|",
"^",
"&",
"+",
"-",
"/",
"~",
"?",
"!",
"%",
"|>",
"@",
"@@",
".",
":",
"\\",
";",
"$",
"<>", // <> is not a valid token in ZetaSQL, but it is a token in memefish
">>", // >> is not a valid token in ZetaSQL, but it is a token in memefish.
}

var lexerTestCases = []struct {
Expand Down Expand Up @@ -132,7 +145,7 @@ var lexerWrongTestCase = []struct {
pos Pos
message string
}{
{"?", 0, "illegal input character: '?'"},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it hard to rewrite this test instead of removing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I think there is no ASCII printable character remaining.
If we want to keep an illegal-character test case, we need to use an unprintable character or some Unicode characters.

I would try to use \b (the backspace character)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

{"\b", 0, "illegal input character: '\\b'"},
{`"foo`, 0, "unclosed string literal"},
{`R"foo`, 1, "unclosed raw string literal"},
{"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"},
Expand Down
2 changes: 0 additions & 2 deletions split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ func TestSplitRawStatements(t *testing.T) {
want: []*memefish.RawStatement{
{Statement: "SELECT `1;2;3`", End: token.Pos(14)},
}},
// $` may become a valid token in the future, but it's reasonable to check its current behavior.
{desc: "unknown token", input: "SELECT $;", errRe: regexp.MustCompile(`illegal input character: '\$'`)},
} {
t.Run(test.desc, func(t *testing.T) {
stmts, err := memefish.SplitRawStatements("", test.input)
Expand Down
Loading