Skip to content

Commit f0743c2

Browse files
committed
code: reduce all NFAs to DFAs
Related to #65 Signed-off-by: Tim Bray <[email protected]>
1 parent 25beaa4 commit f0743c2

8 files changed

+327
-203
lines changed

Diff for: README.md

+43-22
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
[![Tests](https://github.com/timbray/quamina/actions/workflows/go-unit-tests.yaml/badge.svg)](https://github.com/timbray/quamina/actions/workflows/go-unit-tests.yaml)
44
[![Latest Release](https://img.shields.io/github/release/timbray/quamina.svg?logo=github&style=flat-square)](https://github.com/timbray/quamina/releases/latest)
5-
[![codecov](https://codecov.io/gh/timbray/quamina/branch/main/graph/badge.svg?token=TC7MW723JO)](https://codecov.io/gh/timbray/quamina)
5+
[![codecov](https://codecov.io/gh/timbray/quamina/branch/main/graph/badge.svg?token=TC7MW723JO)](https://codecov.io/gh/timbray/quamina)
66
[![Go Report Card](https://goreportcard.com/badge/github.com/timbray/quamina)](https://goreportcard.com/report/github.com/timbray/quamina)
77
[![timbray/quamina](https://img.shields.io/github/go-mod/go-version/timbray/quamina)](https://github.com/timbray/quamina)
88
[![Go Reference](https://pkg.go.dev/badge/github.com/timbray/quamina.svg)](https://pkg.go.dev/github.com/timbray/quamina)
@@ -51,7 +51,6 @@ in RFC 8259:
5151
```
5252

5353
The following Patterns would match it:
54-
5554
```json
5655
{"Image": {"Width": [800]}}
5756
```
@@ -91,7 +90,6 @@ The following Patterns would match it:
9190
}
9291
}
9392
```
94-
9593
```json
9694
{
9795
"Image": {
@@ -275,25 +273,48 @@ Events through it as is practical.
275273

276274
I used to say that the performance of
277275
`MatchesForEvent` was `O(1)` in the number of
278-
Patterns. While that’s probably the right way to think
279-
about it, it’s not *quite* true,
280-
as it varies somewhat as a function of the number of
281-
unique fields that appear in all the Patterns that have
282-
been added to Quamina, but still remains sublinear
283-
in that number.
284-
285-
A word of explanation: Quamina compiles the
286-
Patterns into a somewhat-decorated automaton and uses
287-
that to find matches in Events; the matching process is
288-
`O(1)` in the number of Patterns.
289-
290-
However, for this to work, the incoming Event must be
291-
flattened into a list of pathname/value pairs and
292-
sorted. This process exceeds 50% of execution time,
293-
and is optimized by discarding any fields that
294-
do not appear in one or more of the Patterns added
295-
to Quamina. Thus, adding a new Pattern that only
296-
mentions fields which are already mentioned in previous
276+
Patterns. That’s probably a reasonable way to think
277+
about it, because it’s *almost* right.
278+
279+
To be precise, the performance is `O(N)` where `N` is
280+
the number of unique fields that appear in all the Patterns
281+
that have been added to Quamina.
282+
283+
For example, suppose you have a list of 50,000 words, and
284+
you add a Pattern for each, of the form:
285+
```json
286+
{"word": ["one of the words"]}
287+
```
288+
The performance in matching events should be about the same
289+
for one word or 50,000, with some marginal loss following on
290+
growth in the size of the necessary data structures.
291+
292+
However, adding another pattern that looks like the
293+
following would
294+
decrease the performance by a factor of
295+
roughly 2:
296+
```json
297+
{"number": [11, 22, 33]}
298+
```
299+
Then adding a few thousand more `"number"` patterns shouldn’t
300+
decrease the performance observably.
301+
302+
As always, it’s a little more complex than that, with a weak
303+
dependency on the size of the incoming Events; Quamina has
304+
to plow through them end-to-end to pull out the interesting
305+
fields.
306+
307+
A word of explanation: Quamina compiles the Patterns into a
308+
somewhat-decorated automaton and uses that to find matches in
309+
Events. For Quamina to work, the incoming Events must be flattened
310+
into a list of pathname/value pairs and sorted. This process
311+
exceeds 50% of execution time, and is optimized by discarding
312+
any fields that do not appear in one or more of the Patterns
313+
added to Quamina. Then, the cost of traversing the automaton
314+
is at most N, the number of fields left after discarding.
315+
316+
Thus, adding a new Pattern that only
317+
mentions fields which are already mentioned in previous
297318
Patterns is effectively free i.e. `O(1)` in terms of run-time
298319
performance.
299320

Diff for: list_maker.go

+58-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,64 @@
11
package quamina
22

33
// this needs to exist so that all the lists containing a single step to X, or the triple step to X,Y,Z are the
4-
// same list, so that pack/unpack work properly
4+
// same list, so that pack/unpack work properly. In a large majority of cases, there's only one step in the list, so
5+
// those are handled straightforwardly with a map. Otherwise, we laboriously look through all the lists for a match.
6+
// In Java I'd implement a hashCode() method and everything would be a hash, but I haven't learned yet what the Go
7+
// equivalent is.
8+
type dfaMemory struct {
9+
singletons map[*nfaStep]*dfaStep
10+
plurals []perList
11+
}
12+
type perList struct {
13+
list []*nfaStep
14+
dfa *dfaStep
15+
}
16+
17+
func newDfaMemory() *dfaMemory {
18+
return &dfaMemory{singletons: make(map[*nfaStep]*dfaStep)}
19+
}
20+
21+
func (m *dfaMemory) rememberDfaForList(dfa *dfaStep, steps ...*nfaStep) {
22+
if len(steps) == 1 {
23+
m.singletons[steps[0]] = dfa
24+
} else {
25+
m.plurals = append(m.plurals, perList{list: steps, dfa: dfa})
26+
}
27+
}
28+
29+
func (m *dfaMemory) dfaForNfas(steps ...*nfaStep) (*dfaStep, bool) {
30+
if len(steps) == 1 {
31+
d, ok := m.singletons[steps[0]]
32+
return d, ok
33+
}
34+
for _, p := range m.plurals {
35+
if nfaListsEqual(p.list, steps) {
36+
return p.dfa, true
37+
}
38+
}
39+
return nil, false
40+
}
41+
42+
func nfaListsEqual(l1, l2 []*nfaStep) bool {
43+
if len(l1) != len(l2) {
44+
return false
45+
}
46+
for _, e1 := range l1 {
47+
if !nfaListContains(l2, e1) {
48+
return false
49+
}
50+
}
51+
return true
52+
}
53+
54+
func nfaListContains(list []*nfaStep, step *nfaStep) bool {
55+
for _, e := range list {
56+
if e == step {
57+
return true
58+
}
59+
}
60+
return false
61+
}
562

663
type listMaker struct {
764
singletons map[*nfaStep]*nfaStepList

Diff for: list_maker_test.go

+71
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,77 @@ import (
44
"testing"
55
)
66

7+
func TestDfaMemory(t *testing.T) {
8+
d1 := &dfaStep{}
9+
d3 := &dfaStep{}
10+
d12 := &dfaStep{}
11+
d13 := &dfaStep{}
12+
d123 := &dfaStep{}
13+
ns1 := &nfaStep{}
14+
ns2 := &nfaStep{}
15+
ns3 := &nfaStep{}
16+
l1 := []*nfaStep{ns1}
17+
l3 := []*nfaStep{ns3}
18+
l12 := []*nfaStep{ns1, ns2}
19+
l13 := []*nfaStep{ns1, ns3}
20+
l123 := []*nfaStep{ns1, ns2, ns3}
21+
22+
mem := newDfaMemory()
23+
mem.rememberDfaForList(d1, l1...)
24+
mem.rememberDfaForList(d3, l3...)
25+
mem.rememberDfaForList(d12, l12...)
26+
mem.rememberDfaForList(d13, l13...)
27+
mem.rememberDfaForList(d123, l123...)
28+
29+
var ok bool
30+
var d *dfaStep
31+
d, ok = mem.dfaForNfas(l1...)
32+
if ok == false || d != d1 {
33+
t.Error("failed d1")
34+
}
35+
d, ok = mem.dfaForNfas(l3...)
36+
if ok == false || d != d3 {
37+
t.Error("failed d1")
38+
}
39+
var shouldMatches [][]*nfaStep
40+
shouldMatches = [][]*nfaStep{{ns1, ns2}, {ns2, ns1}}
41+
for i, should := range shouldMatches {
42+
d, ok := mem.dfaForNfas(should...)
43+
if ok == false || d != d12 {
44+
t.Errorf("no match on %d", i)
45+
}
46+
}
47+
shouldMatches = [][]*nfaStep{{ns1, ns3}, {ns3, ns1}}
48+
for i, should := range shouldMatches {
49+
d, ok := mem.dfaForNfas(should...)
50+
if ok == false || d != d13 {
51+
t.Errorf("no match on %d", i)
52+
}
53+
}
54+
shouldMatches = [][]*nfaStep{{ns1, ns2, ns3}, {ns1, ns3, ns2}, {ns3, ns1, ns2}, {ns3, ns2, ns1}}
55+
for i, should := range shouldMatches {
56+
d, ok := mem.dfaForNfas(should...)
57+
if ok == false || d != d123 {
58+
t.Errorf("no match on %d", i)
59+
}
60+
}
61+
62+
noDfaFor := [][]*nfaStep{
63+
{&nfaStep{}},
64+
{ns2},
65+
{ns3, ns2},
66+
{ns1, ns2, &nfaStep{}},
67+
{ns1, ns2, ns3, &nfaStep{}},
68+
}
69+
70+
for i, no := range noDfaFor {
71+
_, ok = mem.dfaForNfas(no...)
72+
if ok {
73+
t.Errorf("bogus match %d", i)
74+
}
75+
}
76+
}
77+
778
func TestListMaker(t *testing.T) {
879
steps := []*nfaStep{
980
{},

Diff for: shell_style_test.go

+33-51
Original file line numberDiff line numberDiff line change
@@ -30,54 +30,6 @@ func TestLongCase(t *testing.T) {
3030
}
3131
}
3232

33-
func newNfaWithStart(start *smallTable[*nfaStepList]) *valueMatcher {
34-
vm := newValueMatcher()
35-
state := &vmFields{startNfa: start}
36-
vm.update(state)
37-
return vm
38-
}
39-
40-
func TestNfaMerging(t *testing.T) {
41-
aMatches := []string{
42-
`"Afoo"`,
43-
`"ABA"`,
44-
}
45-
bMatches := []string{
46-
`"BAB"`,
47-
`"Bbar"`,
48-
}
49-
f1 := &fieldMatcher{}
50-
f2 := &fieldMatcher{}
51-
nfa1, _ := makeShellStyleAutomaton([]byte(`"A*"`), f1)
52-
nfa2, _ := makeShellStyleAutomaton([]byte(`"B*"`), f2)
53-
54-
v1 := newNfaWithStart(nfa1)
55-
v2 := newNfaWithStart(nfa2)
56-
57-
for _, aMatch := range aMatches {
58-
t1 := v1.transitionOn([]byte(aMatch))
59-
if len(t1) != 1 || t1[0] != f1 {
60-
t.Error("mismatch on " + aMatch)
61-
}
62-
}
63-
for _, bMatch := range bMatches {
64-
t1 := v2.transitionOn([]byte(bMatch))
65-
if len(t1) != 1 || t1[0] != f2 {
66-
t.Error("mismatch on " + bMatch)
67-
}
68-
}
69-
70-
combo := mergeNfas(nfa1, nfa2)
71-
v3 := newNfaWithStart(combo)
72-
ab := append(aMatches, bMatches...)
73-
for _, match := range ab {
74-
t3 := v3.transitionOn([]byte(match))
75-
if len(t3) != 1 {
76-
t.Error("Fail on " + match)
77-
}
78-
}
79-
}
80-
8133
func TestMakeShellStyleAutomaton(t *testing.T) {
8234
patterns := []string{
8335
`"*ST"`,
@@ -104,29 +56,59 @@ func TestMakeShellStyleAutomaton(t *testing.T) {
10456
{`"ayybyyzxx"`},
10557
}
10658

59+
// NOTE also testing nfa2Dfa
10760
for i, pattern := range patterns {
10861
myNext := newFieldMatcher()
10962
a, wanted := makeShellStyleAutomaton([]byte(pattern), myNext)
11063
if wanted != myNext {
11164
t.Error("bad next on: " + pattern)
11265
}
66+
d := nfa2Dfa(a)
67+
vm := newValueMatcher()
68+
vmf := vmFields{startDfa: d}
69+
vm.update(&vmf)
11370
for _, should := range shouldsForPatterns[i] {
11471
var transitions []*fieldMatcher
115-
gotTrans := oneNfaStep(a, 0, []byte(should), transitions)
72+
gotTrans := transitionDfa(d, []byte(should), transitions)
11673
if len(gotTrans) != 1 || gotTrans[0] != wanted {
11774
t.Errorf("Failure for %s on %s", pattern, should)
11875
}
11976
}
12077
for _, shouldNot := range shouldNotForPatterns[i] {
12178
var transitions []*fieldMatcher
122-
gotTrans := oneNfaStep(a, 0, []byte(shouldNot), transitions)
79+
gotTrans := transitionDfa(d, []byte(shouldNot), transitions)
12380
if gotTrans != nil {
124-
t.Errorf("bogus match for %s on %s", pattern, shouldNot)
81+
t.Errorf("bogus DFA match for %s on %s", pattern, shouldNot)
12582
}
12683
}
12784
}
12885
}
12986

87+
/* To be used in profiling AddPattern for patterns which need NFAs
88+
func xTestShellStyleBuildTime(t *testing.T) {
89+
words := readWWords(t)
90+
starWords := make([]string, 0, len(words))
91+
patterns := make([]string, 0, len(words))
92+
for _, word := range words {
93+
starAt := rand.Int31n(6)
94+
starWord := string(word[:starAt]) + "*" + string(word[starAt:])
95+
starWords = append(starWords, starWord)
96+
pattern := fmt.Sprintf(`{"x": [ {"shellstyle": "%s" } ] }`, starWord)
97+
patterns = append(patterns, pattern)
98+
}
99+
q, _ := New()
100+
for i := 0; i < 32; i++ {
101+
// fmt.Printf("i=%d w=%s: %s\n", i, starWords[i], matcherStats(q.matcher.(*coreMatcher)))
102+
// fmt.Println(patterns[i])
103+
err := q.AddPattern(starWords[i], patterns[i])
104+
if err != nil {
105+
t.Error("AddP: " + err.Error())
106+
}
107+
}
108+
fmt.Println(matcherStats(q.matcher.(*coreMatcher)))
109+
}
110+
*/
111+
130112
func TestMixedPatterns(t *testing.T) {
131113
// let's mix up some prefix, infix, suffix, and exact-match searches
132114
x := map[string]int{

0 commit comments

Comments
 (0)