addresses issue #188

prevent state explosions with epsilon transitions

Signed-off-by: Tim Bray <[email protected]>
timbray · Jun 9, 2024 · 8eb41db · 8eb41db
1 parent 5eee82d
commit 8eb41db
Show file tree

Hide file tree

Showing 15 changed files with 321 additions and 292 deletions.
diff --git a/anything_but.go b/anything_but.go
@@ -73,20 +73,19 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
 func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
 	nextField := newFieldMatcher()
 	successStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
-	//DEBUG successStep.table.label = "(success)"
-	success := &faNext{steps: []*faState{successStep}}
+	success := &faNext{states: []*faState{successStep}}
 
-	ret, _ := oneMultiAnythingButStep(vals, 0, success), nextField
+	ret, _ := makeOneMultiAnythingButStep(vals, 0, success), nextField
 	return ret, nextField
 }
 
-// oneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
+// makeOneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
 // the longest among the vals. So for each value from 0 through N, we make a smallTable whose default is
 // success but transfers to the next step on whatever the current byte in each of the vals that have not
 // yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition
 // to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but
 // strings.
-func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
+func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTable {
 	// this will be the default transition in all the anything-but tables.
 	var u unpackedTable
 	for i := range u {
@@ -115,18 +114,18 @@ func oneMultiAnythingButStep(vals [][]byte, index int, success *faNext) *smallTa
 
 	// for each val that still has bytes to process, recurse to process the next one
 	for utf8Byte, val := range valsWithBytesRemaining {
-		nextTable := oneMultiAnythingButStep(val, index+1, success)
+		nextTable := makeOneMultiAnythingButStep(val, index+1, success)
 		nextStep := &faState{table: nextTable}
-		u[utf8Byte] = &faNext{steps: []*faState{nextStep}}
+		u[utf8Byte] = &faNext{states: []*faState{nextStep}}
 	}
 
 	// for each val that ends at 'index', put a failure-transition for this anything-but
 	// if you hit the valueTerminator, success for everything else
 	for utf8Byte := range valsEndingHere {
 		failState := &faState{table: newSmallTable()} // note no transitions
-		lastStep := &faNext{steps: []*faState{failState}}
+		lastStep := &faNext{states: []*faState{failState}}
 		lastTable := makeSmallTable(success, []byte{valueTerminator}, []*faNext{lastStep})
-		u[utf8Byte] = &faNext{steps: []*faState{{table: lastTable}}}
+		u[utf8Byte] = &faNext{states: []*faState{{table: lastTable}}}
 	}
 
 	table := newSmallTable()

diff --git a/cl2_test.go b/cl2_test.go
@@ -187,20 +187,20 @@ func TestRulerCl2(t *testing.T) {
 
 	// initial run to stabilize memory
 	bm := newBenchmarker()
-	bm.addRules(exactRules, exactMatches)
+	bm.addRules(exactRules, exactMatches, false)
 
 	bm.run(t, lines)
 
 	bm = newBenchmarker()
-	bm.addRules(exactRules, exactMatches)
+	bm.addRules(exactRules, exactMatches, true)
 	fmt.Printf("EXACT events/sec: %.1f\n", bm.run(t, lines))
 
 	bm = newBenchmarker()
-	bm.addRules(prefixRules, prefixMatches)
+	bm.addRules(prefixRules, prefixMatches, true)
 	fmt.Printf("PREFIX events/sec: %.1f\n", bm.run(t, lines))
 
 	bm = newBenchmarker()
-	bm.addRules(anythingButRules, anythingButMatches)
+	bm.addRules(anythingButRules, anythingButMatches, true)
 	fmt.Printf("ANYTHING-BUT events/sec: %.1f\n", bm.run(t, lines))
 }
 
@@ -214,13 +214,15 @@ func newBenchmarker() *benchmarker {
 	return &benchmarker{q: q, wanted: make(map[X]int)}
 }
 
-func (bm *benchmarker) addRules(rules []string, wanted []int) {
+func (bm *benchmarker) addRules(rules []string, wanted []int, report bool) {
 	for i, rule := range rules {
 		rname := fmt.Sprintf("r%d", i)
 		_ = bm.q.AddPattern(rname, rule)
 		bm.wanted[rname] = wanted[i]
 	}
-	fmt.Println(matcherStats(bm.q.matcher.(*coreMatcher)))
+	if report {
+		fmt.Println(matcherStats(bm.q.matcher.(*coreMatcher)))
+	}
 }
 
 func (bm *benchmarker) run(t *testing.T, events [][]byte) float64 {

diff --git a/core_matcher.go b/core_matcher.go
@@ -129,7 +129,7 @@ func (m *coreMatcher) deletePatterns(_ X) error {
 // matchesForJSONEvent calls the flattener to pull the fields out of the event and
 // hands over to MatchesForFields
 // This is a leftover from previous times, is only used by tests, but it's used by a *lot*
-// so removing it would require a lot of tedious work
+// of them, and it's a convenient API for testing.
 func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
 	fields, err := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
 	if err != nil {

diff --git a/field_matcher.go b/field_matcher.go
@@ -6,7 +6,7 @@ import (
 
 // fieldMatcher represents a state in the matching automaton, which matches field names and dispatches to
 // valueMatcher to complete matching of field values.
-// the fields that hold state are segregated in updateable so they can be replaced atomically and make the coreMatcher
+// the fields that hold state are segregated in updateable, so they can be replaced atomically and make the coreMatcher
 // thread-safe.
 type fieldMatcher struct {
 	updateable atomic.Value // always holds an *fmFields
@@ -112,7 +112,7 @@ func (m *fieldMatcher) addTransition(field *patternField, printer printer) []*fi
 	}
 	freshStart.transitions[field.path] = vm
 
-	// suppose I'm adding the first pattern to a matcher and it has "x": [1, 2]. In principle the branches on
+	// suppose I'm adding the first pattern to a matcher, and it has "x": [1, 2]. In principle the branches on
 	//  "x": 1 and "x": 2 could go to the same next state. But we have to make a unique next state for each of them
 	//  because some future other pattern might have "x": [2, 3] and thus we need a separate branch to potentially
 	//  match two patterns on "x": 2 but not "x": 1. If you were optimizing the automaton for size you might detect