Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pat: correct numeric matching #329

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions PATTERNS.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,19 @@ Thus, the following Pattern would match both JSON events above:
{"alpha": {"beta": [1]}}
```

### Numeric Values

It would be convenient if Quamina knew, for matching purposes, that 35,
35.00, and 3.5e1 were all the same number.

In many cases, Quamina can manage this. Specifically, for numbers that:

* are between -5.0e9 and 5.0e9 inclusive.
* have five or fewer fractional digits.

Numbers which do not meet these criteria will be treated as strings, which
usually produces good results.

## Extended Patterns
An **Extended Pattern** **MUST** be a JSON object containing
a single field whose name is called the **Pattern Type**.
Expand Down
9 changes: 2 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,8 @@ The `"exists":true` and `"exists":false` patterns
have corner cases; details are covered in
[Patterns in Quamina](PATTERNS.md).

Number matching is weak - the number has to appear
exactly the same in the Pattern and the Event. I.e.,
Quamina doesn't know that 35, 35.000, and 3.5e1 are the
same number. There's a fix for this in the code which
is not yet activated because it causes a
significant performance penalty, so the API needs to
be enhanced to only ask for it when you need it.
Quamina can match numeric values correctly, subject to
certain limits; details are in [Patterns in Quamina](PATTERNS.md).

## Flattening and Matching

Expand Down
128 changes: 64 additions & 64 deletions cl2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,127 +16,93 @@ var (
cl2LineCount int
)

/* This test adopted, with thanks, from aws/event-ruler */

func getCL2Lines(t *testing.T) [][]byte {
t.Helper()

cl2Lock.Lock()
defer cl2Lock.Unlock()
if cl2Lines != nil {
return cl2Lines
}
file, err := os.Open("testdata/citylots2.json.gz")
if err != nil {
t.Fatalf("Can't open citylots2.json.gz: %v", err.Error())
}
defer func(file *os.File) {
_ = file.Close()
}(file)
zr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("Can't open zip reader: %v", err.Error())
}

scanner := bufio.NewScanner(zr)
buf := make([]byte, oneMeg)
scanner.Buffer(buf, oneMeg)
for scanner.Scan() {
cl2LineCount++
cl2Lines = append(cl2Lines, []byte(scanner.Text()))
}
return cl2Lines
}

func TestRulerCl2(t *testing.T) {
exactRules := []string{
// test rules pulled out so that it's easy to write test funcs focusing in on one set of htem.
var (
anythingButRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"1430022\" ]\n" +
" }" +
" \"STREET\": [ { \"anything-but\": [ \"FULTON\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"2607117\" ]\n" +
" \"STREET\": [ { \"anything-but\": [ \"MASON\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"2607218\" ]\n" +
" \"ST_TYPE\": [ { \"anything-but\": [ \"ST\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"3745012\" ]\n" +
" \"geometry\": {\n" +
" \"type\": [ {\"anything-but\": [ \"Polygon\" ] } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ \"VACSTWIL\" ]\n" +
" \"FROM_ST\": [ { \"anything-but\": [ \"441\" ] } ]\n" +
" }\n" +
"}",
}
exactMatches := []int{1, 101, 35, 655, 1}

prefixRules := []string{
anythingButMatches = []int{211158, 210411, 96682, 120, 210615}
exactRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"AC\" } ]\n" +
" }\n" +
" \"MAPBLKLOT\": [ \"1430022\" ]\n" +
" }" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"BL\" } ]\n" +
" \"MAPBLKLOT\": [ \"2607117\" ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"DR\" } ]\n" +
" \"MAPBLKLOT\": [ \"2607218\" ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"FU\" } ]\n" +
" \"MAPBLKLOT\": [ \"3745012\" ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"RH\" } ]\n" +
" \"MAPBLKLOT\": [ \"VACSTWIL\" ]\n" +
" }\n" +
"}",
}
prefixMatches := []int{24, 442, 38, 2387, 328}

anythingButRules := []string{
exactMatches = []int{1, 101, 35, 655, 1}
prefixRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"anything-but\": [ \"FULTON\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"AC\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"anything-but\": [ \"MASON\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"BL\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"ST_TYPE\": [ { \"anything-but\": [ \"ST\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"DR\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"geometry\": {\n" +
" \"type\": [ {\"anything-but\": [ \"Polygon\" ] } ]\n" +
" \"properties\": {\n" +
" \"STREET\": [ { \"prefix\": \"FU\" } ]\n" +
" }\n" +
"}",
"{\n" +
" \"properties\": {\n" +
" \"FROM_ST\": [ { \"anything-but\": [ \"441\" ] } ]\n" +
" \"STREET\": [ { \"prefix\": \"RH\" } ]\n" +
" }\n" +
"}",
}
anythingButMatches := []int{211158, 210411, 96682, 120, 210615}

shellstyleRules := []string{
prefixMatches = []int{24, 442, 38, 2387, 328}
shellstyleRules = []string{
"{\n" +
" \"properties\": {\n" +
" \"MAPBLKLOT\": [ { \"shellstyle\": \"143*\" } ]\n" +
Expand All @@ -163,8 +129,7 @@ func TestRulerCl2(t *testing.T) {
" }\n" +
"}",
}
shellstyleMatches := []int{490, 713, 43, 2540, 1}

shellstyleMatches = []int{490, 713, 43, 2540, 1}
/* will add when we have numeric
complexArraysRules := []string{
"{\n" +
Expand Down Expand Up @@ -211,6 +176,41 @@ func TestRulerCl2(t *testing.T) {
complexArraysMatches := []int{227, 2, 149444, 64368, 127485}
*/

)

/* This test adopted, with thanks, from aws/event-ruler */

func getCL2Lines(t *testing.T) [][]byte {
t.Helper()

cl2Lock.Lock()
defer cl2Lock.Unlock()
if cl2Lines != nil {
return cl2Lines
}
file, err := os.Open("testdata/citylots2.json.gz")
if err != nil {
t.Fatalf("Can't open citylots2.json.gz: %v", err.Error())
}
defer func(file *os.File) {
_ = file.Close()
}(file)
zr, err := gzip.NewReader(file)
if err != nil {
t.Fatalf("Can't open zip reader: %v", err.Error())
}

scanner := bufio.NewScanner(zr)
buf := make([]byte, oneMeg)
scanner.Buffer(buf, oneMeg)
for scanner.Scan() {
cl2LineCount++
cl2Lines = append(cl2Lines, []byte(scanner.Text()))
}
return cl2Lines
}

func TestRulerCl2(t *testing.T) {
lines := getCL2Lines(t)
fmt.Printf("lines: %d\n", len(lines))

Expand Down
18 changes: 1 addition & 17 deletions field_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,21 +132,6 @@ func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fi
var nextFieldMatchers []*fieldMatcher
for _, val := range field.vals {
nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(val, printer))

// if the val is a number, let's add a transition on the canonicalized number
// TODO: Only do this if asked
/*
if val.vType == numberType {
c, err := canonicalize([]byte(val.val))
if err == nil {
number := typedVal{
vType: literalType,
val: c,
}
nextFieldMatchers = append(nextFieldMatchers, vm.addTransition(number))
}
}
*/
}
m.update(freshStart)
return nextFieldMatchers
Expand All @@ -162,6 +147,5 @@ func (m *fieldMatcher) transitionOn(field *Field, bufs *bufpair) []*fieldMatcher
if !ok {
return nil
}

return valMatcher.transitionOn(field.Val, bufs)
return valMatcher.transitionOn(field, bufs)
}
47 changes: 14 additions & 33 deletions flatten_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ type flattenJSON struct {
isSpace [256]bool
}

// Reset an flattenJSON struct so it can be re-used and won't need to be reconstructed for each event to be flattened
// Reset a flattenJSON struct so that it can be re-used and won't need to be reconstructed for each event
// to be flattened
func (fj *flattenJSON) reset() {
fj.eventIndex = 0
fj.fields = fj.fields[:0]
Expand All @@ -43,7 +44,7 @@ var (
)

// errEarlyStop is used to signal the case when we've detected that we've read all the fields that appear in any pattern
// and so we don't need to read any more
// and so that we don't need to read any more
var errEarlyStop = errors.New("earlyStop")

// fjState - this is a finite state machine parser, or rather a collection of smaller FSM parsers. Some of these
Expand Down Expand Up @@ -232,7 +233,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
val, err = fj.readLiteral(nullBytes)
isLeaf = true
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
val, alt, err = fj.readNumber()
val, err = fj.readNumber()
isLeaf = true
case '[':
if !pathNode.IsSegmentUsed(memberName) {
Expand Down Expand Up @@ -367,7 +368,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
val, err = fj.readLiteral(nullBytes)
isLeaf = true
case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
val, alt, err = fj.readNumber()
val, err = fj.readNumber()
isLeaf = true
case '{':
if fj.skipping == 0 {
Expand Down Expand Up @@ -432,7 +433,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
* these higher-level funcs are going to advance the pointer after each invocation
*/

func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
func (fj *flattenJSON) readNumber() ([]byte, error) {
// points at the first character in the number
numStart := fj.eventIndex
state := fjNumberStartState
Expand All @@ -456,16 +457,9 @@ func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
state = fjNumberAfterEState
case ',', ']', '}', ' ', '\t', '\n', '\r':
fj.eventIndex--
// TODO: Too expensive; make it possible for people to ask for this
// bytes := fj.event[numStart : fj.eventIndex+1]
// c, err := canonicalize(bytes)
var alt []byte
//if err == nil {
// alt = []byte(c)
//}
return fj.event[numStart : fj.eventIndex+1], alt, nil
return fj.event[numStart : fj.eventIndex+1], nil
default:
return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
}
case fjNumberFracState:
switch ch {
Expand All @@ -474,24 +468,18 @@ func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
case ',', ']', '}', ' ', '\t', '\n', '\r':
fj.eventIndex--
bytes := fj.event[numStart : fj.eventIndex+1]
// TODO: Too expensive; make it possible for people to ask for this
// c, err := canonicalize(bytes)
var alt []byte
//if err == nil {
// alt = []byte(c)
//}
return bytes, alt, nil
return bytes, nil
case 'e', 'E':
state = fjNumberAfterEState
default:
return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
}
case fjNumberAfterEState:
switch ch {
case '-', '1', '2', '3', '4', '5', '6', '7', '8', '9':
// no-op
default:
return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
}
state = fjNumberExpState

Expand All @@ -501,20 +489,13 @@ func (fj *flattenJSON) readNumber() ([]byte, []byte, error) {
// no-op
case ',', ']', '}', ' ', '\t', '\n', '\r':
fj.eventIndex--
// bytes := fj.event[numStart : fj.eventIndex+1]
// TODO: Too expensive; make it possible for people to ask for this
// c, err := canonicalize(bytes)
var alt []byte
// if err == nil {
// alt = []byte(c)
// }
return fj.event[numStart : fj.eventIndex+1], alt, nil
return fj.event[numStart : fj.eventIndex+1], nil
default:
return nil, nil, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
return nil, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
}
}
if fj.step() != nil {
return nil, nil, fj.error("event truncated in number")
return nil, fj.error("event truncated in number")
}
}
}
Expand Down
1 change: 1 addition & 0 deletions flattener.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,6 @@ type ArrayPos struct {
type Field struct {
Path []byte
Val []byte
IsNumber bool
ArrayTrail []ArrayPos
}
Loading
Loading