diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..d1601b1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,25 @@ +version: 2 +updates: + - package-ecosystem: "gomod" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 10 + labels: + - "dependencies" + - "go" + commit-message: + prefix: "chore" + include: "scope" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 10 + labels: + - "dependencies" + - "github-actions" + commit-message: + prefix: "chore" + include: "scope" diff --git a/.gitignore b/.gitignore index 4819e79..9e2b550 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .vscode *.iml __debug_bin* +.DS_Store \ No newline at end of file diff --git a/.golangci.yaml b/.golangci.yaml index 59c5e3d..fbb3ae5 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -19,6 +19,7 @@ output: linters: disable: - godot + - gci presets: - bugs - comment @@ -74,6 +75,7 @@ issues: - lll - gocognit - maintidx + - dupl test: "cognitive complexity 55 of func `Test_parseMatchExpr` is high (> 30)" - path: parser/token.go diff --git a/README.md b/README.md index fc41d3f..850331b 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,142 @@ -# KQL(kibana query language) Parser -![GitHub CI](https://github.com/laojianzi/kql-go/actions/workflows/ci.yaml/badge.svg) [![Go Report Card](https://goreportcard.com/badge/github.com/laojianzi/kql-go)](https://goreportcard.com/report/github.com/laojianzi/kql-go) [![LICENSE](https://img.shields.io/github/license/laojianzi/kql-go.svg)](https://github.com/laojianzi/kql-go/blob/master/LICENSE) [![GoDoc](https://img.shields.io/badge/Godoc-reference-blue.svg)](https://pkg.go.dev/github.com/laojianzi/kql-go) [![DeepSource](https://app.deepsource.com/gh/laojianzi/kql-go.svg/?label=code+coverage&show_trend=false&token=BgPgeWYICSssJGgLh2UosQw7)](https://app.deepsource.com/gh/laojianzi/kql-go/) +# KQL (Kibana Query Language) Parser -The goal of this project is to build a KQL(kibana query language) parser in Go with the following key features: +![GitHub CI](https://github.com/laojianzi/kql-go/actions/workflows/ci.yaml/badge.svg) +[![Go Report Card](https://goreportcard.com/badge/github.com/laojianzi/kql-go)](https://goreportcard.com/report/github.com/laojianzi/kql-go) +[![LICENSE](https://img.shields.io/github/license/laojianzi/kql-go.svg)](https://github.com/laojianzi/kql-go/blob/master/LICENSE) +[![GoDoc](https://img.shields.io/badge/Godoc-reference-blue.svg)](https://pkg.go.dev/github.com/laojianzi/kql-go) +[![DeepSource](https://app.deepsource.com/gh/laojianzi/kql-go.svg/?label=code+coverage&show_trend=false&token=BgPgeWYICSssJGgLh2UosQw7)](https://app.deepsource.com/gh/laojianzi/kql-go/) -- Parse KQL(kibana query language) query into AST -- output AST to KQL(kibana query language) query +A Kibana Query Language (KQL) parser implemented in Go. -This project is inspired by [github.com/AfterShip/clickhouse-sql-parser] and [https://github.com/cloudspannerecosystem/memefish]. Both of these are SQL parsers implemented in Go. +## Features -## How to use +- Escaped character handling +- Wildcard patterns +- Parentheses grouping +- AND/OR/NOT operators +- Field:value pairs +- String literals with quotes -Playground: https://go.dev/play/p/m36hkz43PQL +## Installation -```Go +```bash +go get github.com/laojianzi/kql-go +``` + +## Quick Start + +```go package main import ( + "fmt" + "github.com/laojianzi/kql-go/parser" ) -query := `(service_name: "redis" OR service_name: "mysql") AND level: ("error" OR "warn") and start_time > 1723286863 anD latency >= 1.5` -// Parse query into AST -stmt, err := parser.New(query).Stmt() -if err != nil { - panic(err) +func main() { + query := `(service_name: "redis" OR service_name: "mysql") AND level: ("error" OR "warn") and start_time > 1723286863 anD latency >= 1.5` + // Parse query into AST + stmt, err := parser.New(query).Stmt() + if err != nil { + panic(err) + } + + // output AST to KQL(kibana query language) query + fmt.Println(stmt.String()) + // output: + // (service_name: "redis" OR service_name: "mysql") AND level: ("error" OR "warn") AND start_time > 1723286863 AND latency >= 1.5 } +``` + +## Performance + +Recent benchmark results: -// output AST to KQL(kibana query language) query -fmt.Println(stmt.String()) -// output: -// (service_name: "redis" OR service_name: "mysql") AND level: ("error" OR "warn") AND start_time > 1723286863 AND latency >= 1.5 ``` +goos: darwin +goarch: amd64 +cpu: Intel(R) Core(TM) i5-10500 CPU @ 3.10GHz + +BenchmarkParser/simple_field-12 459882 2500 ns/op 1280 B/op 34 allocs/op +BenchmarkParser/numeric_comparison-12 728577 1646 ns/op 688 B/op 19 allocs/op +BenchmarkParser/multiple_conditions-12 211783 5966 ns/op 2385 B/op 62 allocs/op +BenchmarkParser/complex_query-12 63580 18675 ns/op 7235 B/op 168 allocs/op +BenchmarkParser/escaped_chars-12 108622 10926 ns/op 5416 B/op 131 allocs/op +BenchmarkParser/many_conditions-12 35870 34985 ns/op 12454 B/op 257 allocs/op +BenchmarkParserParallel/simple_field-12 1582999 773.8 ns/op 1280 B/op 34 allocs/op +BenchmarkParserParallel/numeric_comparison-12 2465758 468.9 ns/op 688 B/op 19 allocs/op +BenchmarkParserParallel/multiple_conditions-12 743210 1661 ns/op 2386 B/op 62 allocs/op +BenchmarkParserParallel/complex_query-12 219790 5692 ns/op 7238 B/op 168 allocs/op +BenchmarkParserParallel/escaped_chars-12 331581 3735 ns/op 5416 B/op 131 allocs/op +BenchmarkParserParallel/many_conditions-12 125736 9812 ns/op 12459 B/op 257 allocs/op +BenchmarkLexer/simple_field-12 572068 1947 ns/op 832 B/op 25 allocs/op +BenchmarkLexer/numeric_comparison-12 1000000 1082 ns/op 264 B/op 11 allocs/op +BenchmarkLexer/multiple_conditions-12 278456 4342 ns/op 1360 B/op 42 allocs/op +BenchmarkLexer/complex_query-12 77738 16504 ns/op 4768 B/op 119 allocs/op +BenchmarkLexer/escaped_chars-12 129708 8450 ns/op 3768 B/op 96 allocs/op +BenchmarkLexer/many_conditions-12 39974 29785 ns/op 8944 B/op 192 allocs/op +BenchmarkEscapeSequence/no_escape-12 581481 2017 ns/op 720 B/op 26 allocs/op +BenchmarkEscapeSequence/single_escape-12 487568 2400 ns/op 936 B/op 32 allocs/op +BenchmarkEscapeSequence/multiple_escapes-12 432496 2645 ns/op 1152 B/op 38 allocs/op +BenchmarkEscapeSequence/mixed_escapes-12 129600 9215 ns/op 3672 B/op 100 allocs/op +``` + +## Contributing + +Contributions are welcome! Please feel free to submit a Pull Request. + +### Development Requirements + +- Go 1.16 or higher +- golangci-lint for code quality checks + +### Running Tests + +```bash +# Run unit tests +go test -v -count=1 -race ./... + +# Run benchmarks +go test -bench=. -benchmem ./... +``` + +## Examples + +### Basic Queries +```go +// Simple field value query +query := `status: "active"` + +// Numeric comparison +query := `age >= 18` + +// Multiple conditions +query := `status: "active" AND age >= 18` +``` + +### Advanced Queries +```go +// Complex grouping with wildcards +query := `(status: "active" OR status: "pending") AND name: "john*"` + +// Escaped characters +query := `message: "Hello \"World\"" AND path: "C:\\Program Files\\*"` + +// Multiple conditions with various operators +query := `status: "active" AND age >= 18 AND name: "john*" AND city: "New York"` +``` + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + +## Acknowledgments + +This project is inspired by: +- [github.com/AfterShip/clickhouse-sql-parser](https://github.com/AfterShip/clickhouse-sql-parser) +- [github.com/cloudspannerecosystem/memefish](https://github.com/cloudspannerecosystem/memefish) -## Contact us +## References -Feel free to open an issue or discussion if you have any issues or questions. +- [Kibana Query Language Documentation](https://www.elastic.co/guide/en/kibana/current/kuery-query.html) diff --git a/ast/binary_test.go b/ast/binary_test.go index 7beef8a..54a64fa 100644 --- a/ast/binary_test.go +++ b/ast/binary_test.go @@ -3,9 +3,10 @@ package ast_test import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/laojianzi/kql-go/ast" "github.com/laojianzi/kql-go/token" - "github.com/stretchr/testify/assert" ) func TestBinaryExpr(t *testing.T) { @@ -28,7 +29,7 @@ func TestBinaryExpr(t *testing.T) { name: `"v1"`, args: args{ pos: 0, - value: ast.NewLiteral(0, 4, token.TokenKindString, "v1"), + value: ast.NewLiteral(0, 4, token.TokenKindString, "v1", nil), hasNot: false, }, wantEnd: 4, @@ -38,7 +39,7 @@ func TestBinaryExpr(t *testing.T) { name: `NOT "v1"`, args: args{ pos: 0, - value: ast.NewLiteral(4, 8, token.TokenKindString, "v1"), + value: ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), hasNot: true, }, wantEnd: 8, @@ -49,7 +50,7 @@ func TestBinaryExpr(t *testing.T) { args: args{ field: "f1", operator: token.TokenKindOperatorEql, - value: ast.NewLiteral(4, 8, token.TokenKindString, "v1"), + value: ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), hasNot: false, }, wantEnd: 8, @@ -61,7 +62,7 @@ func TestBinaryExpr(t *testing.T) { pos: 0, field: "f1", operator: token.TokenKindOperatorEql, - value: ast.NewLiteral(8, 12, token.TokenKindString, "v1"), + value: ast.NewLiteral(8, 12, token.TokenKindString, "v1", nil), hasNot: true, }, wantEnd: 12, diff --git a/ast/combine_test.go b/ast/combine_test.go index 4334a97..3a7d396 100644 --- a/ast/combine_test.go +++ b/ast/combine_test.go @@ -3,9 +3,10 @@ package ast_test import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/laojianzi/kql-go/ast" "github.com/laojianzi/kql-go/token" - "github.com/stretchr/testify/assert" ) func TestCombineExpr(t *testing.T) { @@ -25,9 +26,9 @@ func TestCombineExpr(t *testing.T) { { name: `f1: "v1" OR NOT f1: "v2"`, args: args{ - leftExpr: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1"), false), + leftExpr: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), false), keyword: token.TokenKindKeywordOr, - rightExpr: ast.NewBinaryExpr(12, "f1", token.TokenKindOperatorEql, ast.NewLiteral(20, 24, token.TokenKindString, "v2"), true), + rightExpr: ast.NewBinaryExpr(12, "f1", token.TokenKindOperatorEql, ast.NewLiteral(20, 24, token.TokenKindString, "v2", nil), true), }, wantEnd: 24, wantString: `f1: "v1" OR NOT f1: "v2"`, @@ -43,15 +44,15 @@ func TestCombineExpr(t *testing.T) { 8, 22, ast.NewCombineExpr( - ast.NewBinaryExpr(9, "", 0, ast.NewLiteral(9, 13, token.TokenKindString, "v1"), false), + ast.NewBinaryExpr(9, "", 0, ast.NewLiteral(9, 13, token.TokenKindString, "v1", nil), false), token.TokenKindKeywordOr, - ast.NewBinaryExpr(17, "", 0, ast.NewLiteral(17, 21, token.TokenKindString, "v2"), false), + ast.NewBinaryExpr(17, "", 0, ast.NewLiteral(17, 21, token.TokenKindString, "v2", nil), false), ), ), true, ), keyword: token.TokenKindKeywordAnd, - rightExpr: ast.NewBinaryExpr(27, "f3", token.TokenKindOperatorEql, ast.NewLiteral(31, 35, token.TokenKindString, "v3"), false), + rightExpr: ast.NewBinaryExpr(27, "f3", token.TokenKindOperatorEql, ast.NewLiteral(31, 35, token.TokenKindString, "v3", nil), false), }, wantEnd: 35, wantString: `NOT f1: ("v1" OR "v2") AND f3: "v3"`, diff --git a/ast/literal.go b/ast/literal.go index 758ac70..b6a6ad9 100644 --- a/ast/literal.go +++ b/ast/literal.go @@ -4,21 +4,24 @@ import "github.com/laojianzi/kql-go/token" // Literal is a literal(int, float, string or identifier) value. type Literal struct { - pos int - end int + pos int + end int + escapeIndexes []int + Kind token.Kind // int, float, string or identifier Value string WithDoubleQuote bool } // NewLiteral creates a new literal value. -func NewLiteral(pos, end int, kind token.Kind, value string) *Literal { +func NewLiteral(pos, end int, kind token.Kind, value string, escapeIndexes []int) *Literal { return &Literal{ pos: pos, end: end, Kind: kind, Value: value, WithDoubleQuote: kind == token.TokenKindString, + escapeIndexes: escapeIndexes, } } @@ -34,9 +37,27 @@ func (e *Literal) End() int { // String returns the string representation of the literal value. func (e *Literal) String() string { + value := e.Value + + if len(e.escapeIndexes) > 0 { + var ( + runes = []rune(value) + newValue []rune + lastIndex int + ) + + for _, escapeIndex := range e.escapeIndexes { + newValue = append(newValue, runes[lastIndex:escapeIndex]...) + newValue = append(newValue, '\\') + lastIndex = escapeIndex + } + + value = string(append(newValue, runes[lastIndex:]...)) + } + if e.WithDoubleQuote { - return `"` + e.Value + `"` + return `"` + value + `"` } - return e.Value + return value } diff --git a/ast/literal_test.go b/ast/literal_test.go index e30cab3..4b1dd57 100644 --- a/ast/literal_test.go +++ b/ast/literal_test.go @@ -3,9 +3,10 @@ package ast_test import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/laojianzi/kql-go/ast" "github.com/laojianzi/kql-go/token" - "github.com/stretchr/testify/assert" ) func TestLiteral(t *testing.T) { @@ -73,7 +74,7 @@ func TestLiteral(t *testing.T) { for _, c := range cases { t.Run(c.name, func(t *testing.T) { - expr := ast.NewLiteral(c.args.pos, c.args.end, c.args.kind, c.args.value) + expr := ast.NewLiteral(c.args.pos, c.args.end, c.args.kind, c.args.value, nil) assert.Equal(t, c.wantPos, expr.Pos()) assert.Equal(t, c.wantEnd, expr.End()) assert.Equal(t, c.wantString, expr.String()) diff --git a/ast/paren_test.go b/ast/paren_test.go index 76cf2a4..b8057fd 100644 --- a/ast/paren_test.go +++ b/ast/paren_test.go @@ -3,9 +3,10 @@ package ast_test import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/laojianzi/kql-go/ast" "github.com/laojianzi/kql-go/token" - "github.com/stretchr/testify/assert" ) func TestParenExpr(t *testing.T) { @@ -25,7 +26,7 @@ func TestParenExpr(t *testing.T) { name: `(f1: "v1")`, args: args{ R: 10, - Expr: ast.NewBinaryExpr(1, "f1", token.TokenKindOperatorEql, ast.NewLiteral(5, 9, token.TokenKindString, "v1"), false), + Expr: ast.NewBinaryExpr(1, "f1", token.TokenKindOperatorEql, ast.NewLiteral(5, 9, token.TokenKindString, "v1", nil), false), }, wantEnd: 10, wantString: `(f1: "v1")`, @@ -35,9 +36,9 @@ func TestParenExpr(t *testing.T) { args: args{ R: 14, Expr: ast.NewCombineExpr( - ast.NewLiteral(1, 5, token.TokenKindString, "v1"), + ast.NewLiteral(1, 5, token.TokenKindString, "v1", nil), token.TokenKindKeywordOr, - ast.NewLiteral(9, 13, token.TokenKindString, "v2"), + ast.NewLiteral(9, 13, token.TokenKindString, "v2", nil), ), }, wantEnd: 14, diff --git a/ast/wildcard_test.go b/ast/wildcard_test.go index 0140a5e..a55eab2 100644 --- a/ast/wildcard_test.go +++ b/ast/wildcard_test.go @@ -3,9 +3,10 @@ package ast_test import ( "testing" + "github.com/stretchr/testify/assert" + "github.com/laojianzi/kql-go/ast" "github.com/laojianzi/kql-go/token" - "github.com/stretchr/testify/assert" ) func TestWildcard(t *testing.T) { @@ -145,7 +146,7 @@ func TestWildcard(t *testing.T) { for _, c := range cases { t.Run(c.name, func(t *testing.T) { - expr := ast.NewLiteral(c.args.pos, c.args.end, c.args.kind, c.args.value) + expr := ast.NewLiteral(c.args.pos, c.args.end, c.args.kind, c.args.value, nil) assert.Equal(t, c.wantPos, expr.Pos()) assert.Equal(t, c.wantEnd, expr.End()) assert.Equal(t, c.wantString, expr.String()) diff --git a/doc.go b/doc.go index eff4217..520310f 100644 --- a/doc.go +++ b/doc.go @@ -1,12 +1,36 @@ /* -The goal of this project is to build a KQL(kibana query language) parser in Go with the following key features: +Package kql-go provides a robust parser for Kibana Query Language (KQL). - - Parse KQL(kibana query language) query into AST +KQL is a simple yet powerful query language used in Kibana for filtering and searching data. +This package implements a complete parser that converts KQL queries into an Abstract Syntax Tree (AST). - - output AST to KQL(kibana query language) query +Basic Usage: -This project is inspired by [github.com/AfterShip/clickhouse-sql-parser] -and [https://github.com/cloudspannerecosystem/memefish]. -Both of these are SQL parsers implemented in Go. + import "github.com/laojianzi/kql-go" + + query := "response:200 AND (method:GET OR method:POST)" + ast, err := kql.Parse(query) + if err != nil { + log.Fatal(err) + } + +Features: + - Escaped character handling + - Wildcard patterns + - Parentheses grouping + - AND/OR/NOT operators + - Field:value pairs + - String literals with quotes + +Thread Safety: +The parser is designed to be thread-safe. Each Parse call creates a new parser instance, +making it safe to use across multiple goroutines. + +Error Handling: +The parser provides detailed error messages with position information, +making it easy to identify and fix syntax errors in queries. + +For more information about KQL syntax, visit: +https://www.elastic.co/guide/en/kibana/current/kuery-query.html */ package kql diff --git a/parser/bench_test.go b/parser/bench_test.go new file mode 100644 index 0000000..7e8dd29 --- /dev/null +++ b/parser/bench_test.go @@ -0,0 +1,129 @@ +package parser + +import ( + "testing" +) + +var benchmarkQueries = []struct { + name string + query string +}{ + { + name: "simple_field", + query: `status: "active"`, + }, + { + name: "numeric_comparison", + query: `age >= 18`, + }, + { + name: "multiple_conditions", + query: `status: "active" AND age >= 18`, + }, + { + name: "complex_query", + query: `(status: "active" OR status: "pending") AND age >= 18 AND name: "john*"`, + }, + { + name: "escaped_chars", + query: `message: "Hello \"World\"" AND path: "C:\\Program Files\\*"`, + }, + { + name: "many_conditions", + query: `status: "active" AND age >= 18 AND name: "john*" AND city: "New York" AND country: "USA" AND role: "admin"`, + }, +} + +func BenchmarkParser(b *testing.B) { + for _, bq := range benchmarkQueries { + b.Run(bq.name, func(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + stmt, err := New(bq.query).Stmt() + if err != nil { + b.Fatal(err) + } + + _ = stmt.String() + } + }) + } +} + +func BenchmarkParserParallel(b *testing.B) { + for _, bq := range benchmarkQueries { + b.Run(bq.name, func(b *testing.B) { + b.ReportAllocs() + + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + stmt, err := New(bq.query).Stmt() + if err != nil { + b.Fatal(err) + } + + _ = stmt.String() + } + }) + }) + } +} + +func BenchmarkLexer(b *testing.B) { + for _, bq := range benchmarkQueries { + b.Run(bq.name, func(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + l := newLexer(bq.query) + + for { + if l.nextToken(); l.eof() { + break + } + } + } + }) + } +} + +func BenchmarkEscapeSequence(b *testing.B) { + tests := []struct { + name string + input string + }{ + { + name: "no_escape", + input: `hello world`, + }, + { + name: "single_escape", + input: `hello \"world\"`, + }, + { + name: "multiple_escapes", + input: `\"hello\" \"world\"`, + }, + { + name: "mixed_escapes", + input: `path: "C:\\Program Files\\*" AND message: \"Hello World\"`, + }, + } + + for _, tt := range tests { + b.Run(tt.name, func(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + l := newLexer(tt.input) + + for { + if l.nextToken(); l.eof() { + break + } + } + } + }) + } +} diff --git a/parser/escape.go b/parser/escape.go new file mode 100644 index 0000000..c74bdbf --- /dev/null +++ b/parser/escape.go @@ -0,0 +1,118 @@ +package parser + +import ( + "bytes" + "errors" + "fmt" + + "github.com/laojianzi/kql-go/token" +) + +// CharProcResult represents the result of character processing in token lexing +type CharProcResult struct { + // Position represents the next position to process + Position int + // IsEscaped indicates if the next character should be treated as escaped + IsEscaped bool + // EscapeIndexes stores the positions of escape characters + EscapeIndexes []int +} + +// NewCharProcResult creates a new CharProcResult with the given values +func NewCharProcResult(position int, isEscaped bool, escapeIndexes []int) *CharProcResult { + return &CharProcResult{ + Position: position, + IsEscaped: isEscaped, + EscapeIndexes: escapeIndexes, + } +} + +// String returns a string representation of CharProcResult +func (r CharProcResult) String() string { + return fmt.Sprintf("CharProcResult{pos:%d, escaped:%v, indexes:%v}", + r.Position, r.IsEscaped, r.EscapeIndexes) +} + +// handleNonEscaped processes a non-escaped character and updates the token state +func (l *defaultLexer) handleNonEscaped(pos int, k token.Kind, buf *bytes.Buffer, indexes []int) *CharProcResult { + ch := l.peek(pos) + if ch == '\\' { + nextPos, newIndexes := l.handleBackslash(pos, buf, k == token.TokenKindString, indexes) + + return NewCharProcResult(nextPos, true, newIndexes) + } + + if k != token.TokenKindString && !l.processNonEscaped(pos, k) { + return NewCharProcResult(pos, false, indexes) + } + + buf.WriteRune(ch) + + return NewCharProcResult(pos+1, false, indexes) +} + +// handleEscaped processes an escaped character and updates the token state +func (l *defaultLexer) handleEscaped(pos int, k token.Kind, buf *bytes.Buffer, indexes []int) (*CharProcResult, error) { + valid, err := l.handleEscapeSequence(pos, k) + if err != nil { + return NewCharProcResult(pos, true, indexes), err + } + + if !valid { + return NewCharProcResult(pos, false, indexes), nil + } + + buf.WriteRune(l.peek(pos)) + + return NewCharProcResult(pos+1, false, indexes), nil +} + +// handleBackslash processes a backslash character and updates the token state +func (l *defaultLexer) handleBackslash(pos int, buf *bytes.Buffer, isString bool, indexes []int) (int, []int) { + l.Token.Value += buf.String() + buf.Reset() + + offset := 0 + if isString { + offset = -1 // adjust for opening quote + } + + return pos + 1, append(indexes, pos+offset-len(indexes)) +} + +// handleEscapeSequence validates and processes an escape sequence +func (l *defaultLexer) handleEscapeSequence(pos int, k token.Kind) (bool, error) { + if !l.peekOk(pos) { + return false, nil + } + + ch := l.peek(pos) + + // Handle string literals specially + if k == token.TokenKindString { + switch ch { + case '"', '\\', '*': + return true, nil + default: + return false, nil + } + } + + // Handle special cases first + if ch == '*' || token.RequireEscape(string(ch), k) { + return true, nil + } + + // Check if it's part of a keyword or operator + nextToken := l.collectNextToken(pos) + if token.IsKeyword(nextToken) || token.IsOperator(nextToken) { + return true, nil + } + + // If it's not a keyword or operator, check if it's a valid special character + if !token.IsSpecialChar(string(ch)) { + return false, errors.New("unexpected escapes") + } + + return true, nil +} diff --git a/parser/lexer.go b/parser/lexer.go index d28ecad..d674cc6 100644 --- a/parser/lexer.go +++ b/parser/lexer.go @@ -1,14 +1,17 @@ package parser import ( + "bytes" "errors" "fmt" + "strings" "unicode" "unicode/utf8" "github.com/laojianzi/kql-go/token" ) +// defaultLexer is a lexer implementation type defaultLexer struct { Value []rune Token Token @@ -18,6 +21,12 @@ type defaultLexer struct { dotIdent bool } +// newLexer creates a new lexer +func newLexer(input string) *defaultLexer { + return &defaultLexer{Value: []rune(strings.TrimSpace(input))} +} + +// nextToken returns the next token from the input stream func (l *defaultLexer) nextToken() error { l.lastTokenKind = l.Token.Kind @@ -31,6 +40,12 @@ func (l *defaultLexer) nextToken() error { } l.Token = Token{Pos: l.pos} + if l.eof() { + l.Token.Kind = token.TokenKindEof + + return nil + } + defer func() { l.Token.End = l.pos if l.Token.Kind == token.TokenKindString { // skip the double quote " @@ -48,13 +63,8 @@ func (l *defaultLexer) nextToken() error { return l.consumeFieldToken() } +// consumeToken consumes the next token from the input stream func (l *defaultLexer) consumeToken() error { - if l.eof() { - l.Token.Kind = token.TokenKindEof - - return nil - } - switch l.peek(0) { case ':', '<', '>': // operator return l.consumeOperator() @@ -72,6 +82,7 @@ func (l *defaultLexer) consumeToken() error { return l.consumeIdent() } +// consumeFieldToken consumes a field token func (l *defaultLexer) consumeFieldToken() error { if l.peekOk(0) && !unicode.IsSpace(rune(l.peek(0))) && !l.eof() { i := 0 @@ -89,27 +100,103 @@ func (l *defaultLexer) consumeFieldToken() error { return l.consumeToken() } -func (l *defaultLexer) consumeIdent() error { - var i int - for l.peekOk(i) && !unicode.IsSpace(rune(l.peek(i))) && !l.eof() { - if l.peek(i) == '\\' && l.peekOk(i+1) && requireEscape(l.peek(i+1)) { - i += 2 +// shouldBreak checks if token collection should stop +func (l *defaultLexer) shouldBreak(i int, isString, withEscape bool, endChar rune) bool { + if isString && !withEscape && l.peek(i) == endChar { + return true + } - continue + if !isString && !withEscape { + if unicode.IsSpace(l.peek(i)) || l.peek(i) == ')' { + return true } + } + + return false +} + +// collectNextToken collects the next complete token starting from the given position +func (l *defaultLexer) collectNextToken(start int) string { + buf := &bytes.Buffer{} + buf.WriteRune(l.peek(start)) - if requireEscape(l.peek(i)) { + for j := start; l.peekOk(j + 1); j++ { + nextRune := l.peek(j + 1) + if unicode.IsSpace(nextRune) || nextRune == ')' { break } - i++ + buf.WriteRune(nextRune) + } + + return buf.String() +} + +// processNonEscaped checks if a non-escaped character should break token collection +func (l *defaultLexer) processNonEscaped(i int, kind token.Kind) bool { + return !token.RequireEscape(string(l.peek(i)), kind) +} + +// consumeEscapedToken consumes a token that may contain escape sequences +// Returns the number of characters consumed, the positions of escape characters, and any error +func (l *defaultLexer) consumeEscapedToken(kind token.Kind, endChar rune) (i int, indexes []int, err error) { + escape, buf := false, &bytes.Buffer{} + + isString := kind == token.TokenKindString + if isString { + i = 1 // skip opening quote + } + + for l.peekOk(i) && !l.eof() { + if l.shouldBreak(i, isString, escape, endChar) { + break + } + + var result *CharProcResult + if escape { + result, err = l.handleEscaped(i, kind, buf, indexes) + } else { + result = l.handleNonEscaped(i, kind, buf, indexes) + } + + if err != nil { + return 0, nil, err + } + + if result.Position == i && !result.IsEscaped { + break + } + + i, escape, indexes = result.Position, result.IsEscaped, result.EscapeIndexes + } + + if escape { + return 0, nil, errors.New("unexpected escapes") + } + + if buf.Len() > 0 { + l.Token.Value += buf.String() + } + + return i, indexes, nil +} + +// consumeIdent consumes an identifier token +func (l *defaultLexer) consumeIdent() error { + i, escapeIndexes, err := l.consumeEscapedToken(token.TokenKindIdent, 0) + if err != nil { + return err } l.Token.Kind = token.TokenKindIdent - l.Token.Value = string(l.Value[l.pos : l.pos+i]) + l.Token.EscapeIndexes = escapeIndexes - if token.IsKeyword(l.Token.Value) { - l.Token.Kind = token.ToKeyword(l.Token.Value) + if !strings.Contains(l.slice(0, i), "\\") { + if token.IsKeyword(l.Token.Value) { + l.Token.Kind = token.ToKeyword(l.Token.Value) + } else if token.IsOperator(l.Token.Value) { + l.Token.Kind = token.ToOperator(l.Token.Value) + } } l.skipN(i) @@ -117,10 +204,11 @@ func (l *defaultLexer) consumeIdent() error { return nil } +// consumeString consumes a string token func (l *defaultLexer) consumeString() error { - i, endChar := 1, rune('"') - for l.peekOk(i) && l.peek(i) != endChar { - i++ + i, escapeIndexes, err := l.consumeEscapedToken(token.TokenKindString, '"') + if err != nil { + return err } if !l.peekOk(i) { @@ -128,13 +216,14 @@ func (l *defaultLexer) consumeString() error { } l.Token.Kind = token.TokenKindString - l.Token.Value = l.slice(1, i) + l.Token.EscapeIndexes = escapeIndexes l.skipN(i + 1) return nil } +// consumeNumber consumes a number token func (l *defaultLexer) consumeNumber() error { var i int if l.peek(0) == '+' || l.peek(0) == '-' { // skip sign @@ -158,6 +247,10 @@ func (l *defaultLexer) consumeNumber() error { } if !unicode.IsDigit(rune(b)) && b != '.' { + if b == '*' { + return l.consumeIdent() + } + return fmt.Errorf("expected digit or decimal point, but got %q", string(b)) } @@ -183,6 +276,7 @@ func (l *defaultLexer) consumeNumber() error { return nil } +// consumeOperator consumes an operator token func (l *defaultLexer) consumeOperator() error { length := 1 if (l.peek(0) == '<' || l.peek(0) == '>') && l.peekOk(1) && l.peek(1) == '=' { // <= or >= @@ -197,6 +291,7 @@ func (l *defaultLexer) consumeOperator() error { return nil } +// consumeParen consumes a parenthesis token func (l *defaultLexer) consumeParen() error { l.Token.Value = l.slice(0, 1) @@ -214,6 +309,7 @@ func (l *defaultLexer) consumeParen() error { return nil } +// skipSpaces skips whitespace characters func (l *defaultLexer) skipSpaces() { for !l.eof() { r, size := utf8.DecodeRuneInString(string(l.Value[l.pos:])) @@ -225,18 +321,22 @@ func (l *defaultLexer) skipSpaces() { } } +// skipN skips n characters func (l *defaultLexer) skipN(n int) { l.pos += n } +// peek returns the character at the given position func (l *defaultLexer) peek(i int) rune { return l.Value[l.pos+i] } +// peekOk checks if the character at the given position is valid func (l *defaultLexer) peekOk(i int) bool { return l.pos+i < len(l.Value) } +// slice returns a substring of the input string func (l *defaultLexer) slice(start, end int) string { if len(l.Value) < l.pos+end { end = len(l.Value) - l.pos @@ -245,10 +345,7 @@ func (l *defaultLexer) slice(start, end int) string { return string(l.Value[l.pos+start : l.pos+end]) } +// eof checks if the end of the input stream has been reached func (l *defaultLexer) eof() bool { return l.pos >= len(l.Value) } - -func requireEscape(r rune) bool { - return r == '"' || token.IsSpecialChar(string(r)) || r == '\\' -} diff --git a/parser/parser.go b/parser/parser.go index 4bdcfa3..c63b06b 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -16,7 +16,7 @@ type defaultParser struct { // New creates a new KQL parser. func New(input string) kql.Parser { - return &defaultParser{lexer: &defaultLexer{Value: []rune(strings.TrimSpace(input))}} + return &defaultParser{lexer: newLexer(input)} } // Stmt parses a statement from the input. @@ -120,6 +120,14 @@ func (p *defaultParser) parseBinary() (ast.Expr, error) { return nil, err } + // check >=, >, <=, < operator must be followed by Int|Float|Ident + switch op { + case token.TokenKindOperatorGeq, token.TokenKindOperatorGtr, token.TokenKindOperatorLeq, token.TokenKindOperatorLss: + if n := strings.ReplaceAll(right.String(), token.TokenKindWildcard.String(), ""); n != "" && !token.IsNumber(n) { + return nil, fmt.Errorf("expected number or number with wildcard, but got %q", n) + } + } + return ast.NewBinaryExpr(pos, expr.String(), op, right, hasNot), nil } @@ -171,14 +179,14 @@ func (p *defaultParser) parseWildcard() (ast.Expr, error) { end += 1 } - lit := ast.NewLiteral(pos, end, kind, tok.Value) + lit := ast.NewLiteral(pos, end, kind, tok.Value, tok.EscapeIndexes) if kind != token.TokenKindIdent && kind != token.TokenKindString { return lit, nil } var indexes []int - runes := []rune(tok.Value) + runes := []rune(lit.String()) for i := range runes { if runes[i] == '*' && (i == 0 || runes[i-1] != '\\') { // skip escaped wildcard indexes = append(indexes, i) diff --git a/parser/parser_test.go b/parser/parser_test.go index 0ced35e..dbe8d86 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -4,11 +4,12 @@ import ( "errors" "testing" + "github.com/stretchr/testify/assert" + "github.com/laojianzi/kql-go" "github.com/laojianzi/kql-go/ast" "github.com/laojianzi/kql-go/parser" "github.com/laojianzi/kql-go/token" - "github.com/stretchr/testify/assert" ) func Test_defaultParser_Stmt(t *testing.T) { @@ -19,39 +20,39 @@ func Test_defaultParser_Stmt(t *testing.T) { }{ { input: "foo", - want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo"), false), + want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo", nil), false), }, { input: "1", - want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 1, token.TokenKindInt, "1"), false), + want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 1, token.TokenKindInt, "1", nil), false), }, { input: "0.1", - want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindFloat, "0.1"), false), + want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindFloat, "0.1", nil), false), }, { input: `"0.1"`, - want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 5, token.TokenKindString, "0.1"), false), + want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 5, token.TokenKindString, "0.1", nil), false), }, { input: `f1: "v1"`, - want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1"), false), + want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), false), }, { input: `f1 > 1`, - want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorGtr, ast.NewLiteral(5, 6, token.TokenKindInt, "1"), false), + want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorGtr, ast.NewLiteral(5, 6, token.TokenKindInt, "1", nil), false), }, { input: `f1 < 1.1`, - want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorLss, ast.NewLiteral(5, 8, token.TokenKindFloat, "1.1"), false), + want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorLss, ast.NewLiteral(5, 8, token.TokenKindFloat, "1.1", nil), false), }, { input: `f1 >= 1000.0001`, - want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorGeq, ast.NewLiteral(6, 15, token.TokenKindFloat, "1000.0001"), false), + want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorGeq, ast.NewLiteral(6, 15, token.TokenKindFloat, "1000.0001", nil), false), }, { input: `f1 <= 100000011`, - want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorLeq, ast.NewLiteral(6, 15, token.TokenKindInt, "100000011"), false), + want: ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorLeq, ast.NewLiteral(6, 15, token.TokenKindInt, "100000011", nil), false), }, } @@ -72,22 +73,22 @@ func Test_defaultParser_Stmt(t *testing.T) { }{ { input: "NOT bar", - want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(4, 7, token.TokenKindIdent, "bar"), true), + want: ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(4, 7, token.TokenKindIdent, "bar", nil), true), }, { input: "foo AND bar", want: ast.NewCombineExpr( - ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo"), false), + ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(8, "", 0, ast.NewLiteral(8, 11, token.TokenKindIdent, "bar"), false), + ast.NewBinaryExpr(8, "", 0, ast.NewLiteral(8, 11, token.TokenKindIdent, "bar", nil), false), ), }, { input: "foo AND NOT bar", want: ast.NewCombineExpr( - ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo"), false), + ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(8, "", 0, ast.NewLiteral(12, 15, token.TokenKindIdent, "bar"), true), + ast.NewBinaryExpr(8, "", 0, ast.NewLiteral(12, 15, token.TokenKindIdent, "bar", nil), true), ), }, { @@ -96,38 +97,38 @@ func Test_defaultParser_Stmt(t *testing.T) { ast.NewCombineExpr( ast.NewCombineExpr( ast.NewCombineExpr( - ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 2, token.TokenKindIdent, "v1"), false), + ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 2, token.TokenKindIdent, "v1", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(7, "", 0, ast.NewLiteral(7, 8, token.TokenKindInt, "2"), false), + ast.NewBinaryExpr(7, "", 0, ast.NewLiteral(7, 8, token.TokenKindInt, "2", nil), false), ), token.TokenKindKeywordOr, - ast.NewBinaryExpr(12, "", 0, ast.NewLiteral(12, 15, token.TokenKindFloat, "0.3"), false), + ast.NewBinaryExpr(12, "", 0, ast.NewLiteral(12, 15, token.TokenKindFloat, "0.3", nil), false), ), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(20, "", 0, ast.NewLiteral(24, 28, token.TokenKindString, "v4"), true), + ast.NewBinaryExpr(20, "", 0, ast.NewLiteral(24, 28, token.TokenKindString, "v4", nil), true), ), token.TokenKindKeywordOr, - ast.NewBinaryExpr(32, "", 0, ast.NewLiteral(36, 39, token.TokenKindFloat, "5.0"), true), + ast.NewBinaryExpr(32, "", 0, ast.NewLiteral(36, 39, token.TokenKindFloat, "5.0", nil), true), ), }, { input: "NOT f: v", - want: ast.NewBinaryExpr(0, "f", token.TokenKindOperatorEql, ast.NewLiteral(7, 8, token.TokenKindIdent, "v"), true), + want: ast.NewBinaryExpr(0, "f", token.TokenKindOperatorEql, ast.NewLiteral(7, 8, token.TokenKindIdent, "v", nil), true), }, { input: `f1: "v1" AND f2 > 2`, want: ast.NewCombineExpr( - ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1"), false), + ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(13, "f2", token.TokenKindOperatorGtr, ast.NewLiteral(18, 19, token.TokenKindInt, "2"), false), + ast.NewBinaryExpr(13, "f2", token.TokenKindOperatorGtr, ast.NewLiteral(18, 19, token.TokenKindInt, "2", nil), false), ), }, { input: `f1: "v1" AND NOT f2 > 2`, want: ast.NewCombineExpr( - ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1"), false), + ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(13, "f2", token.TokenKindOperatorGtr, ast.NewLiteral(22, 23, token.TokenKindInt, "2"), true), + ast.NewBinaryExpr(13, "f2", token.TokenKindOperatorGtr, ast.NewLiteral(22, 23, token.TokenKindInt, "2", nil), true), ), }, { @@ -136,18 +137,18 @@ func Test_defaultParser_Stmt(t *testing.T) { ast.NewCombineExpr( ast.NewCombineExpr( ast.NewCombineExpr( - ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1"), false), + ast.NewBinaryExpr(0, "f1", token.TokenKindOperatorEql, ast.NewLiteral(4, 8, token.TokenKindString, "v1", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(13, "f2", token.TokenKindOperatorGtr, ast.NewLiteral(18, 19, token.TokenKindInt, "2"), false), + ast.NewBinaryExpr(13, "f2", token.TokenKindOperatorGtr, ast.NewLiteral(18, 19, token.TokenKindInt, "2", nil), false), ), token.TokenKindKeywordOr, - ast.NewBinaryExpr(23, "f3", token.TokenKindOperatorLss, ast.NewLiteral(28, 31, token.TokenKindFloat, "0.3"), false), + ast.NewBinaryExpr(23, "f3", token.TokenKindOperatorLss, ast.NewLiteral(28, 31, token.TokenKindFloat, "0.3", nil), false), ), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(36, "f4", token.TokenKindOperatorGeq, ast.NewLiteral(46, 47, token.TokenKindInt, "4"), true), + ast.NewBinaryExpr(36, "f4", token.TokenKindOperatorGeq, ast.NewLiteral(46, 47, token.TokenKindInt, "4", nil), true), ), token.TokenKindKeywordOr, - ast.NewBinaryExpr(51, "f5", token.TokenKindOperatorLeq, ast.NewLiteral(61, 64, token.TokenKindFloat, "5.0"), true), + ast.NewBinaryExpr(51, "f5", token.TokenKindOperatorLeq, ast.NewLiteral(61, 64, token.TokenKindFloat, "5.0", nil), true), ), }, } @@ -170,9 +171,75 @@ func Test_defaultParser_Stmt(t *testing.T) { { input: "foo AND (NOT bar)", want: ast.NewCombineExpr( - ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo"), false), + ast.NewBinaryExpr(0, "", 0, ast.NewLiteral(0, 3, token.TokenKindIdent, "foo", nil), false), token.TokenKindKeywordAnd, - ast.NewBinaryExpr(8, "", 0, ast.NewParenExpr(8, 17, ast.NewBinaryExpr(9, "", 0, ast.NewLiteral(13, 16, token.TokenKindIdent, "bar"), true)), false), + ast.NewBinaryExpr(8, "", 0, ast.NewParenExpr(8, 17, ast.NewBinaryExpr(9, "", 0, ast.NewLiteral(13, 16, token.TokenKindIdent, "bar", nil), true)), false), + ), + }, + } + + for _, c := range cases { + t.Run(c.input, func(t *testing.T) { + stmt, err := parser.New(c.input).Stmt() + assert.NoError(t, err) + assert.EqualValues(t, c.want, stmt) + assert.Equal(t, c.input, stmt.String()) + }) + } + }) + + t.Run("with wildcard", func(t *testing.T) { + cases := []struct { + input string + want ast.Expr + }{ + { + input: "foo: *", + want: ast.NewBinaryExpr(0, "foo", token.TokenKindOperatorEql, ast.NewWildcardExpr( + ast.NewLiteral(5, 6, token.TokenKindIdent, "*", nil), + []int{0}, + ), false), + }, + { + input: `foo: * AND bar: *v2`, + want: ast.NewCombineExpr( + ast.NewBinaryExpr(0, "foo", token.TokenKindOperatorEql, ast.NewWildcardExpr( + ast.NewLiteral(5, 6, token.TokenKindIdent, "*", nil), + []int{0}, + ), false), + token.TokenKindKeywordAnd, + ast.NewBinaryExpr(11, "bar", token.TokenKindOperatorEql, ast.NewWildcardExpr( + ast.NewLiteral(16, 19, token.TokenKindIdent, "*v2", nil), + []int{0}, + ), false), + ), + }, + { + input: "foo: v*1", + want: ast.NewBinaryExpr(0, "foo", token.TokenKindOperatorEql, ast.NewWildcardExpr( + ast.NewLiteral(5, 8, token.TokenKindIdent, "v*1", nil), + []int{1}, + ), false), + }, + { + input: "foo: *0 AND bar: 1* AND 2*0", + want: ast.NewCombineExpr( + ast.NewCombineExpr( + ast.NewBinaryExpr(0, "foo", token.TokenKindOperatorEql, ast.NewWildcardExpr( + ast.NewLiteral(5, 7, token.TokenKindIdent, "*0", nil), + []int{0}, + ), false), + token.TokenKindKeywordAnd, + ast.NewBinaryExpr(12, "bar", token.TokenKindOperatorEql, ast.NewWildcardExpr( + ast.NewLiteral(17, 19, token.TokenKindIdent, "1*", nil), + []int{1}, + ), false), + ), + token.TokenKindKeywordAnd, + ast.NewBinaryExpr(24, "", 0, ast.NewWildcardExpr( + ast.NewLiteral(24, 27, token.TokenKindIdent, "2*0", nil), + []int{1}, + ), false), ), }, } @@ -223,3 +290,453 @@ func Test_defaultParser_Stmt(t *testing.T) { } }) } + +func TestParser_EscapedKeywords(t *testing.T) { + tests := []struct { + name string + input string + expected string + wantErr bool + }{ + { + name: "escaped AND keyword", + input: `app: \AND`, + expected: `app: \AND`, + wantErr: false, + }, + { + name: "escaped OR keyword", + input: `app: \OR`, + expected: `app: \OR`, + wantErr: false, + }, + { + name: "escaped NOT keyword", + input: `app: \NOT`, + expected: `app: \NOT`, + wantErr: false, + }, + { + name: "mix of escaped and normal keywords", + input: `app: foo AND msg: \OR`, + expected: `app: foo AND msg: \OR`, + wantErr: false, + }, + { + name: "multiple escaped keywords", + input: `app: \AND AND msg: \OR`, + expected: `app: \AND AND msg: \OR`, + wantErr: false, + }, + { + name: "escaped keyword in parentheses", + input: `(app: \AND)`, + expected: `(app: \AND)`, + wantErr: false, + }, + { + name: "escaped backslash before keyword", + input: `app: \\ AND msg: foo`, + expected: `app: \\ AND msg: foo`, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.input) + expr, err := p.Stmt() + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, expr.String()) + } + + assert.Equal(t, tt.wantErr, err != nil) + }) + } +} + +func TestParser_EscapedOperators(t *testing.T) { + tests := []struct { + name string + input string + expected string + wantErr bool + }{ + { + name: "escaped equality operator", + input: `app: \:`, + expected: `app: \:`, + wantErr: false, + }, + { + name: "escaped less than operator", + input: `app: \>`, + expected: `app: \>`, + wantErr: false, + }, + { + name: "escaped greater than operator", + input: `app: \<`, + expected: `app: \<`, + wantErr: false, + }, + { + name: "escaped less than or equal operator", + input: `app: \<=`, + expected: `app: \<=`, + wantErr: false, + }, + { + name: "escaped greater than or equal operator", + input: `app: \>=`, + expected: `app: \>=`, + wantErr: false, + }, + { + name: "mix of escaped operators and normal keywords", + input: `app: foo AND msg: \>`, + expected: `app: foo AND msg: \>`, + wantErr: false, + }, + { + name: "escaped operator in parentheses", + input: `(app: \>)`, + expected: `(app: \>)`, + wantErr: false, + }, + { + name: "escaped backslash before operator", + input: `app: \: AND msg: foo`, + expected: `app: \: AND msg: foo`, + wantErr: false, + }, + { + name: "escaped backslash before operator 2", + input: `app: \`, + expected: ``, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.input) + expr, err := p.Stmt() + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, expr.String()) + } + + assert.Equal(t, tt.wantErr, err != nil) + }) + } +} + +func TestParser_EscapedWildcard(t *testing.T) { + tests := []struct { + name string + input string + expected string + wantErr bool + }{ + { + name: "escaped wildcard", + input: `app: \*`, + expected: `app: \*`, + wantErr: false, + }, + { + name: "escaped wildcard in parentheses", + input: `(app: \*)`, + expected: `(app: \*)`, + wantErr: false, + }, + { + name: "escaped wildcard with AND", + input: `app: \* AND msg: 5*0`, + expected: `app: \* AND msg: 5*0`, + wantErr: false, + }, + { + name: "multiple escaped wildcards", + input: `\*escaped\*`, + expected: `\*escaped\*`, + wantErr: false, + }, + { + name: "escaped wildcard in string", + input: `message: "Hello \* World"`, + expected: `message: "Hello \* World"`, + wantErr: false, + }, + { + name: "escaped wildcard at string boundaries", + input: `message: "\*Hello World\*"`, + expected: `message: "\*Hello World\*"`, + wantErr: false, + }, + { + name: "escaped wildcard with other escapes", + input: `message: "\\* \"Hello\" \* World"`, + expected: `message: "\\* \"Hello\" \* World"`, + wantErr: false, + }, + { + name: "invalid escape sequence", + input: `message: \a`, + expected: ``, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.input) + expr, err := p.Stmt() + + if tt.wantErr { + assert.Error(t, err) + + return + } + + assert.NoError(t, err) + assert.Equal(t, tt.expected, expr.String()) + }) + } +} + +func TestParser_EscapedDoubleQuote(t *testing.T) { + tests := []struct { + name string + input string + expected string + wantErr bool + }{ + { + name: "escaped a double quote in ident", + input: `app: \"`, + expected: `app: \"`, + wantErr: false, + }, + { + name: "escaped double quote warped in ident", + input: `app: \"v1\"`, + expected: `app: \"v1\"`, + wantErr: false, + }, + { + name: "escaped double quote in ident", + input: `foo: b\"ar\"`, + expected: `foo: b\"ar\"`, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.input) + expr, err := p.Stmt() + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, expr.String()) + } + + assert.Equal(t, tt.wantErr, err != nil) + }) + } +} + +func TestParser_EscapedString(t *testing.T) { + tests := []struct { + name string + input string + expected string + wantErr bool + }{ + { + name: "escaped double quote in string", + input: `message: "hello \"world\""`, + expected: `message: "hello \"world\""`, + wantErr: false, + }, + { + name: "escaped backslash in string", + input: `message: "C:\\Program Files"`, + expected: `message: "C:\\Program Files"`, + wantErr: false, + }, + { + name: "escaped asterisk in string", + input: `message: "hello \* world"`, + expected: `message: "hello \* world"`, + wantErr: false, + }, + { + name: "multiple escaped characters", + input: `message: "path: \"C:\\Program Files\\*\""`, + expected: `message: "path: \"C:\\Program Files\\*\""`, + wantErr: false, + }, + { + name: "escaped characters in complex query", + input: `field1: "value with \"quotes\"" AND field2: "\*wildcard\*" OR field3: "back\\slash"`, + expected: `field1: "value with \"quotes\"" AND field2: "\*wildcard\*" OR field3: "back\\slash"`, + wantErr: false, + }, + { + name: "escaped characters in complex query with escaped double quotes", + input: `foo:\*bar AND field:\"value\"`, + expected: `foo: \*bar AND field: \"value\"`, + wantErr: false, + }, + { + name: "invalid escape sequence", + input: `message: "hello \n world"`, + wantErr: true, + }, + { + name: "unclosed string", + input: `message: "unclosed`, + wantErr: true, + }, + { + name: "invalid escape at end", + input: `message: "test\`, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.input) + expr, err := p.Stmt() + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expected, expr.String()) + } + + assert.Equal(t, tt.wantErr, err != nil) + }) + } +} + +func TestParser_EdgeCases(t *testing.T) { + tests := []struct { + name string + query string + }{ + { + name: "empty query", + query: "", + }, + { + name: "only whitespace", + query: " \t\n", + }, + { + name: "only operator", + query: "AND", + }, + { + name: "incomplete field value", + query: "field:", + }, + { + name: "missing value after operator", + query: "age >", + }, + { + name: "invalid numeric value", + query: "age > abc", + }, + { + name: "unmatched parenthesis", + query: "(field: value", + }, + { + name: "extra parenthesis", + query: "field: value)", + }, + { + name: "consecutive operators", + query: "field: value AND OR value2", + }, + { + name: "invalid field name", + query: "field space: value", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.query) + _, err := p.Stmt() + assert.Error(t, err) + }) + } +} + +func TestParser_ComplexQueries(t *testing.T) { + tests := []struct { + name string + query string + wantStr string + wantErr bool + }{ + { + name: "nested parentheses", + query: `((field1: value1 OR field2: value2) AND (field3: value3 OR field4: value4))`, + wantStr: `((field1: value1 OR field2: value2) AND (field3: value3 OR field4: value4))`, + }, + { + name: "mixed operators", + query: `field1: value1 AND NOT (field2: value2 OR field3: value3)`, + wantStr: `field1: value1 AND NOT (field2: value2 OR field3: value3)`, + }, + { + name: "multiple wildcards", + query: `field1: *val* AND field2: val?ue*`, + wantStr: `field1: *val* AND field2: val?ue*`, + }, + { + name: "mixed comparisons", + query: `age >= 18 AND score > 90 OR rank <= 3`, + wantStr: `age >= 18 AND score > 90 OR rank <= 3`, + }, + { + name: "complex escapes", + query: `message: "Hello \"World\"" AND path: "C:\\Program Files\\*" OR command: "\"quoted\""`, + wantStr: `message: "Hello \"World\"" AND path: "C:\\Program Files\\*" OR command: "\"quoted\""`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := parser.New(tt.query) + stmt, err := p.Stmt() + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.wantStr, stmt.String()) + } + + assert.Equal(t, tt.wantErr, err != nil) + }) + } +} diff --git a/parser/token.go b/parser/token.go index 2ecd500..f52a8b2 100644 --- a/parser/token.go +++ b/parser/token.go @@ -4,10 +4,11 @@ import "github.com/laojianzi/kql-go/token" // Token is a token parsed from lexer. type Token struct { - Pos int - End int - Kind token.Kind - Value string + Pos int + End int + Kind token.Kind + Value string + EscapeIndexes []int } // Clone returns a copy of the token. diff --git a/token/token.go b/token/token.go index 2684d4b..26d4840 100644 --- a/token/token.go +++ b/token/token.go @@ -2,7 +2,9 @@ package token import ( "fmt" + "regexp" "strings" + "unicode/utf8" ) // Kind represents token kind. @@ -125,6 +127,13 @@ func IsSpecialChar(s string) bool { return IsOperator(s) || s == TokenKindLparen.String() || s == TokenKindRparen.String() } +var numberRegex = regexp.MustCompile(`^[+-]?\d+(\.\d+)?$`) + +// IsNumber checks if the string is a number. +func IsNumber(s string) bool { + return numberRegex.MatchString(s) +} + // ToKeyword converts the string to a keyword Kind type. func ToKeyword(s string) Kind { kind, ok := keywords[strings.ToUpper(s)] @@ -164,3 +173,21 @@ func OperatorsExpected(got string) error { return fmt.Errorf("expected operator %s, but got %q", strings.Join(expectedList, "|"), got) } + +// RequireEscape checks if a character requires escaping in the given context +func RequireEscape(s string, kind Kind) bool { + if s == "" { + return false + } + + if r, _ := utf8.DecodeRuneInString(s); r == '"' || r == '\\' { + return true + } + + if kind == TokenKindString { + return false + } + + // only kind TokenKindIdent + return IsSpecialChar(s) || IsKeyword(s) +}