Skip to content

Commit

Permalink
addresses issue #188
Browse files Browse the repository at this point in the history
prevent state explosions with epsilon transitions

Signed-off-by: Tim Bray <[email protected]>
  • Loading branch information
timbray committed Jun 9, 2024
1 parent 5eee82d commit 8eb41db
Show file tree
Hide file tree
Showing 15 changed files with 321 additions and 292 deletions.
17 changes: 8 additions & 9 deletions anything_but.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,20 +73,19 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
nextField := newFieldMatcher()
successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
//DEBUG successStep.table.label = "(success)"
success := &faNext{steps: []*faState{successStep}}
success := &faNext{states: []*faState{successStep}}

ret, _ := oneMultiAnythingButStep(vals, 0, success), nextField
ret, _ := makeOneMultiAnythingButStep(vals, 0, success), nextField
return ret, nextField
}

// oneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
// makeOneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
// the longest among the vals. So for each value from 0 through N, we make a smallTable whose default is
// success but which transfers to the next step on whatever the current byte is in each of the vals that have not
// yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition
// to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but
// strings.
func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
// this will be the default transition in all the anything-but tables.
var u unpackedTable
for i := range u {
Expand Down Expand Up @@ -115,18 +114,18 @@ func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTa

// for each val that still has bytes to process, recurse to process the next one
for utf8Byte, val := range valsWithBytesRemaining {
nextTable := oneMultiAnythingButStep(val, index+1, success)
nextTable := makeOneMultiAnythingButStep(val, index+1, success)
nextStep := &faState{table: nextTable}
u[utf8Byte] = &faNext{steps: []*faState{nextStep}}
u[utf8Byte] = &faNext{states: []*faState{nextStep}}
}

// for each val that ends at 'index', put a failure-transition for this anything-but
// if you hit the valueTerminator, success for everything else
for utf8Byte := range valsEndingHere {
failState := &faState{table: newSmallTable()} // note no transitions
lastStep := &faNext{steps: []*faState{failState}}
lastStep := &faNext{states: []*faState{failState}}
lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep})
u[utf8Byte] = &faNext{steps: []*faState{{table: lastTable}}}
u[utf8Byte] = &faNext{states: []*faState{{table: lastTable}}}
}

table := newSmallTable()
Expand Down
14 changes: 8 additions & 6 deletions cl2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,20 +187,20 @@ func TestRulerCl2(t *testing.T) {

// initial run to stabilize memory
bm := newBenchmarker()
bm.addRules(exactRules, exactMatches)
bm.addRules(exactRules, exactMatches, false)

bm.run(t, lines)

bm = newBenchmarker()
bm.addRules(exactRules, exactMatches)
bm.addRules(exactRules, exactMatches, true)
fmt.Printf("EXACT events/sec: %.1f\n", bm.run(t, lines))

bm = newBenchmarker()
bm.addRules(prefixRules, prefixMatches)
bm.addRules(prefixRules, prefixMatches, true)
fmt.Printf("PREFIX events/sec: %.1f\n", bm.run(t, lines))

bm = newBenchmarker()
bm.addRules(anythingButRules, anythingButMatches)
bm.addRules(anythingButRules, anythingButMatches, true)
fmt.Printf("ANYTHING-BUT events/sec: %.1f\n", bm.run(t, lines))
}

Expand All @@ -214,13 +214,15 @@ func newBenchmarker() *benchmarker {
return &benchmarker{q: q, wanted: make(map[X]int)}
}

func (bm *benchmarker) addRules(rules []string, wanted []int) {
func (bm *benchmarker) addRules(rules []string, wanted []int, report bool) {
for i, rule := range rules {
rname := fmt.Sprintf("r%d", i)
_ = bm.q.AddPattern(rname, rule)
bm.wanted[rname] = wanted[i]
}
fmt.Println(matcherStats(bm.q.matcher.(*coreMatcher)))
if report {
fmt.Println(matcherStats(bm.q.matcher.(*coreMatcher)))
}
}

func (bm *benchmarker) run(t *testing.T, events [][]byte) float64 {
Expand Down
2 changes: 1 addition & 1 deletion core_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func (m *coreMatcher) deletePatterns(_ X) error {
// matchesForJSONEvent calls the flattener to pull the fields out of the event and
// hands over to MatchesForFields
// This is a leftover from previous times, is only used by tests, but it's used by a *lot*
// so removing it would require a lot of tedious work
// and it's a convenient API for testing.
func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
fields, err := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions field_matcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

// fieldMatcher represents a state in the matching automaton, which matches field names and dispatches to
// valueMatcher to complete matching of field values.
// the fields that hold state are segregated in updateable so they can be replaced atomically and make the coreMatcher
// the fields that hold state are segregated in updateable, so they can be replaced atomically and make the coreMatcher
// thread-safe.
type fieldMatcher struct {
updateable atomic.Value // always holds an *fmFields
Expand Down Expand Up @@ -112,7 +112,7 @@ func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fi
}
freshStart.transitions[field.path] = vm

// suppose I'm adding the first pattern to a matcher and it has "x": [1, 2]. In principle the branches on
// suppose I'm adding the first pattern to a matcher, and it has "x": [1, 2]. In principle the branches on
// "x": 1 and "x": 2 could go to the same next state. But we have to make a unique next state for each of them
// because some future other pattern might have "x": [2, 3] and thus we need a separate branch to potentially
// match two patterns on "x": 2 but not "x": 1. If you were optimizing the automaton for size you might detect
Expand Down
Loading

0 comments on commit 8eb41db

Please sign in to comment.