-
Notifications
You must be signed in to change notification settings - Fork 24
[VAULT-21456] Add a drop in replacement for the Go regexp package that interns regular expressions #156
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
[VAULT-21456] Add a drop in replacement for the Go regexp package that interns regular expressions #156
Changes from all commits
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
439bf50
Drop in replacement for the Go regexp package that does temporal cach…
sgmiller c35e29d
Weak value version: Implements a poor man's weak valued map, which c…
sgmiller 84aceb1
fix locking semantics
sgmiller 892773d
fix locking in other func, and use the right base maps
sgmiller cf697c9
Rename to be distinct from the original functions and thus not as sub…
sgmiller bda7c56
Simplify locking, add benchmarks
sgmiller fb4f51d
remove unneeded deps
sgmiller 006c382
Refactor the code to use weak pointers for weak map implementation
kubawi b2aa9bd
If a weak pointer is nil, remove the entry from maps
kubawi d583b64
Add regexp to the module list in the GitHub Actions go.yml workflow f…
kubawi 199d464
Update the Go directive in go.mod to version 1.24, so that it's clear…
kubawi 071bc6f
Swap RWMutex for a Mutex
kubawi 7eaf9de
Tidying up based on PR feedback; rename some variables to fix shadowi…
kubawi 0230859
Fix the formatting of go.yml to resolve a git conflict with main branch
kubawi d99b0b1
Refactor mustCompile to wrap compile in a closure to reduce code dupl…
kubawi 72b29d1
Merge branch 'main' into sgm/regexp-caching
kubawi 9c844f2
Tidy up imports in regexp/regexp_test.go
kubawi b21b980
Refactor compile and mustCompile to only delete the pointers from the…
kubawi ea0066a
Rename cleanup to cleanupCollectedPointers and refactor it to only de…
kubawi 0b4566a
Add a retry mechanism with a timeout to TestInternedRegexps
kubawi 5d9a94a
Add a test that ensures that the cleanup function does not confuse PO…
kubawi File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| module github.com/hashicorp/go-secure-stdlib/regexp | ||
|
|
||
| go 1.24 | ||
|
|
||
| require github.com/stretchr/testify v1.10.0 | ||
|
|
||
| require ( | ||
| github.com/davecgh/go-spew v1.1.1 // indirect | ||
| github.com/pmezard/go-difflib v1.0.0 // indirect | ||
| gopkg.in/yaml.v3 v3.0.1 // indirect | ||
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= | ||
| github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
| github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||
| github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||
| github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= | ||
| github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= | ||
| gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | ||
| gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
| gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= | ||
| gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| package regexp | ||
|
|
||
| import ( | ||
| "regexp" | ||
| "runtime" | ||
| "sync" | ||
| "weak" | ||
| ) | ||
|
|
||
| // Interns the compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result | ||
| // is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage. | ||
| // | ||
| // Regular expressions produced from this package are backed by a form of weak-valued map, upon a regexp becoming | ||
| // unreachable, they will be eventually removed from the map and memory reclaimed. | ||
|
|
||
| var ( | ||
| weakMap = make(map[string]weak.Pointer[regexp.Regexp]) | ||
| posixWeakMap = make(map[string]weak.Pointer[regexp.Regexp]) | ||
| reverseMap = make(map[weak.Pointer[regexp.Regexp]]string) | ||
| l sync.Mutex | ||
| ) | ||
|
|
||
| // CompileInterned compiles and interns a regular expression and returns a | ||
| // pointer to it or an error. | ||
| func CompileInterned(pattern string) (*regexp.Regexp, error) { | ||
| return compile(pattern, regexp.Compile, weakMap) | ||
| } | ||
|
|
||
| // CompilePOSIXInterned compiles and interns a regular expression using POSIX | ||
| // syntax and returns a pointer to it or an error. | ||
| func CompilePOSIXInterned(pattern string) (*regexp.Regexp, error) { | ||
| return compile(pattern, regexp.CompilePOSIX, posixWeakMap) | ||
| } | ||
|
|
||
| // MustCompileInterned compiles and interns a regular expression and returns a pointer to it or panics. | ||
| func MustCompileInterned(pattern string) *regexp.Regexp { | ||
| return mustCompile(pattern, regexp.MustCompile, weakMap) | ||
| } | ||
|
|
||
| // MustCompilePOSIXInterned compiles and interns a regular expression using | ||
| // POSIX syntax and returns a pointer to it or panics. | ||
| func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { | ||
| return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap) | ||
| } | ||
|
|
||
| // compile handles compiling and interning regular expressions. If the regexp is | ||
| // already interned, a pointer to it is returned from the map. If the regexp is | ||
| // not interned or is nil (since it's a weak pointer), it is compiled and stored | ||
| // in the maps. The regexp is stored in the maps as a weak pointer, so that it | ||
| // can be garbage collected. | ||
| func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), internedPointers map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { | ||
| l.Lock() | ||
| defer l.Unlock() | ||
| if itemPtr, ok := internedPointers[pattern]; ok { | ||
| ptr := itemPtr.Value() | ||
| if ptr != nil { | ||
| return ptr, nil | ||
| } | ||
| delete(internedPointers, pattern) | ||
| delete(reverseMap, itemPtr) | ||
| } | ||
| r, err := compileFunc(pattern) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| weakPointer := weak.Make(r) | ||
| internedPointers[pattern] = weakPointer | ||
| reverseMap[weakPointer] = pattern | ||
|
|
||
| // Register a cleanup function for the regexp object | ||
| cleanup := func(ptr weak.Pointer[regexp.Regexp]) { | ||
| cleanupCollectedPointers(ptr, internedPointers) | ||
| } | ||
| runtime.AddCleanup(r, cleanup, weakPointer) | ||
|
|
||
| return r, nil | ||
| } | ||
|
|
||
| // mustCompile is a wrapper around compile that is used when we want to panic | ||
| // instead of returning an error. If the regexp is already interned, a pointer | ||
| // to it is returned from the map. If the regexp is not interned or is nil | ||
| // (since it's a weak pointer), it is compiled and stored in the maps. The | ||
| // regexp is stored in the maps as a weak pointer, so that it can be garbage | ||
| // collected. | ||
| func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, internedPointers map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { | ||
| compileFunc := func(string) (*regexp.Regexp, error) { | ||
| return mustCompileFunc(pattern), nil | ||
| } | ||
| res, _ := compile(pattern, compileFunc, internedPointers) | ||
| return res | ||
| } | ||
|
|
||
| // cleanupCollectedPointers is a cleanup function for *regexp.Regexp. It removes | ||
| // the entries from relevant maps when the regexp object they point to is | ||
| // garbage collected. | ||
| func cleanupCollectedPointers(ptr weak.Pointer[regexp.Regexp], internedPointers map[string]weak.Pointer[regexp.Regexp]) { | ||
| l.Lock() | ||
| defer l.Unlock() | ||
| if s, ok := reverseMap[ptr]; ok { | ||
| delete(internedPointers, s) | ||
| delete(reverseMap, ptr) | ||
| } | ||
| } | ||
|
kubawi marked this conversation as resolved.
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,181 @@ | ||
| package regexp | ||
|
|
||
| import ( | ||
| "regexp" | ||
| "runtime" | ||
| "strconv" | ||
| "sync" | ||
| "testing" | ||
|
kubawi marked this conversation as resolved.
|
||
| "time" | ||
|
|
||
| "github.com/stretchr/testify/require" | ||
| ) | ||
|
|
||
| // TestInternedRegexps tests that the regular expressions are compiled correctly, | ||
| // are interned, and that the cleanup of interned regexps works as expected. | ||
| // | ||
| // Since this test depends on the garbage collector, it is not really | ||
| // deterministic and might flake in the future. If that happens, the calls to | ||
| // the garbage collector and the rest of the test should be removed. | ||
| func TestInternedRegexps(t *testing.T) { | ||
| testCases := map[string]struct { | ||
| compileFunc func(string) (*regexp.Regexp, error) | ||
| mustCompileFunc func(string) *regexp.Regexp | ||
| mustCompile bool | ||
| }{ | ||
| "CompileInterned": { | ||
| compileFunc: CompileInterned, | ||
| mustCompileFunc: nil, | ||
| mustCompile: false, | ||
| }, | ||
| "MustCompileInterned": { | ||
| compileFunc: nil, | ||
| mustCompileFunc: MustCompileInterned, | ||
| mustCompile: true, | ||
| }, | ||
| "CompilePOSIXInterned": { | ||
| compileFunc: CompilePOSIXInterned, | ||
| mustCompileFunc: nil, | ||
| mustCompile: false, | ||
| }, | ||
| "MustCompilePOSIXInterned": { | ||
| compileFunc: nil, | ||
| mustCompileFunc: MustCompilePOSIXInterned, | ||
| mustCompile: true, | ||
| }, | ||
| } | ||
| for name, tc := range testCases { | ||
| t.Run(name, func(t *testing.T) { | ||
| // Compile two identical regular expressions, their pointers should be the same. | ||
| var r1, r2 *regexp.Regexp | ||
| var err error | ||
| pattern := ".*" | ||
| if tc.mustCompile { | ||
| r1 = tc.mustCompileFunc(pattern) | ||
| r2 = tc.mustCompileFunc(pattern) | ||
| } else { | ||
| r1, err = tc.compileFunc(pattern) | ||
| require.NoError(t, err) | ||
| r2, err = tc.compileFunc(pattern) | ||
| require.NoError(t, err) | ||
|
|
||
| // While we're here, check that errors work as expected. | ||
| _, err = tc.compileFunc("(") | ||
| require.Error(t, err) | ||
| } | ||
| require.True(t, r1 == r2) | ||
|
|
||
| // Remove references to the regexps, and run the garbage collector in a loop to see if the cleanup happens. | ||
| r1 = nil | ||
| r2 = nil | ||
| deadline := time.Now().Add(10 * time.Second) | ||
| for { | ||
| // Run the garbage collector twice to increase chances of the cleanup happening. | ||
| // This still doesn't make it deterministic, but in local testing it was enough | ||
| // to not flake a single time in over two million runs, so it should be good enough. | ||
| // A single call to runtime.GC() was flaking very frequently in local testing. | ||
| runtime.GC() | ||
| runtime.GC() | ||
|
|
||
| // Ensure that the cleanup happened and the maps used for interning regexp are empty. | ||
| l.Lock() | ||
| wmlen := len(weakMap) | ||
| rmlen := len(reverseMap) | ||
| l.Unlock() | ||
|
|
||
| if wmlen == 0 && rmlen == 0 { | ||
| // Cleanup happened, test can exit successfully. | ||
| break | ||
| } | ||
| if time.Now().After(deadline) { | ||
| t.Fatalf("cleanup of interned regexps did not happen in time") | ||
| } | ||
| time.Sleep(1 * time.Second) | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // TestCleanupCorrectKind tests that the cleanup function removes the correct | ||
| // kind (POSIX or non-POSIX) of regular expression from the correct backing maps. | ||
| func TestCleanupCorrectKind(t *testing.T) { | ||
| pattern := ".*" | ||
| // Compile a POSIX and a non-POSIX regular expression | ||
| posixRegexp, err := CompilePOSIXInterned(pattern) | ||
| require.NoError(t, err) | ||
| nonPosixRegexp, err := CompileInterned(pattern) | ||
| require.NoError(t, err) | ||
|
|
||
| // Ensure they are different pointers | ||
| require.NotEqual(t, posixRegexp, nonPosixRegexp) | ||
|
|
||
| // Manually run the cleanup function for the POSIX regular expression | ||
| cleanupCollectedPointers(posixWeakMap[pattern], posixWeakMap) | ||
|
|
||
| // Ensure that the POSIX regular expression was removed from the maps | ||
| l.Lock() | ||
| require.Len(t, posixWeakMap, 0) | ||
| require.Len(t, reverseMap, 1) | ||
| require.Len(t, weakMap, 1) | ||
| l.Unlock() | ||
|
|
||
| // Compile a new POSIX regular expression with the same pattern | ||
| posixRegexp, err = CompilePOSIXInterned(pattern) | ||
| require.NoError(t, err) | ||
|
|
||
| l.Lock() | ||
| require.Len(t, posixWeakMap, 1) | ||
| require.Len(t, reverseMap, 2) | ||
| require.Len(t, weakMap, 1) | ||
| l.Unlock() | ||
|
|
||
| // Manually run the cleanup function for the non-POSIX regular expression | ||
| cleanupCollectedPointers(weakMap[pattern], weakMap) | ||
| l.Lock() | ||
| require.Len(t, weakMap, 0) | ||
| require.Len(t, reverseMap, 1) | ||
| require.Len(t, posixWeakMap, 1) | ||
| l.Unlock() | ||
| } | ||
|
|
||
| func BenchmarkRegexps(b *testing.B) { | ||
| s := make([]*regexp.Regexp, b.N) | ||
| for i := 0; i < b.N; i++ { | ||
| s[i] = regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`) | ||
| } | ||
| } | ||
|
|
||
| func BenchmarkInternedRegexps(b *testing.B) { | ||
| s := make([]*regexp.Regexp, b.N) | ||
| for i := 0; i < b.N; i++ { | ||
| s[i] = MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`) | ||
| } | ||
| } | ||
|
|
||
| func BenchmarkConcurrentRegexps(b *testing.B) { | ||
| var wg sync.WaitGroup | ||
| for j := 0; j < runtime.NumCPU(); j++ { | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| for i := 0; i < b.N; i++ { | ||
| regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j)) | ||
| } | ||
| }() | ||
| } | ||
| wg.Wait() | ||
| } | ||
|
|
||
| func BenchmarkConcurrentInternedRegexps(b *testing.B) { | ||
| var wg sync.WaitGroup | ||
| for j := 0; j < runtime.NumCPU(); j++ { | ||
| wg.Add(1) | ||
| go func() { | ||
| defer wg.Done() | ||
| for i := 0; i < b.N; i++ { | ||
| MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j)) | ||
| } | ||
| }() | ||
| } | ||
| wg.Wait() | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.