diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index b673694..98f232a 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -26,6 +26,7 @@ jobs: - "plugincontainer" - "pluginutil" - "random" + - "regexp" - "reloadutil" - "strutil" - "temperror" diff --git a/regexp/go.mod b/regexp/go.mod new file mode 100644 index 0000000..54a5eb9 --- /dev/null +++ b/regexp/go.mod @@ -0,0 +1,11 @@ +module github.com/hashicorp/go-secure-stdlib/regexp + +go 1.24 + +require github.com/stretchr/testify v1.10.0 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/regexp/go.sum b/regexp/go.sum new file mode 100644 index 0000000..713a0b4 --- /dev/null +++ b/regexp/go.sum @@ -0,0 +1,10 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/regexp/regexp.go b/regexp/regexp.go new file mode 100644 index 0000000..f258694 --- /dev/null +++ b/regexp/regexp.go @@ -0,0 +1,103 @@ +package regexp + +import ( + "regexp" + "runtime" + "sync" + "weak" +) + +// Interns the compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result +// is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage. +// +// Regular expressions produced from this package are backed by a form of weak-valued map, upon a regexp becoming +// unreachable, they will be eventually removed from the map and memory reclaimed. + +var ( + weakMap = make(map[string]weak.Pointer[regexp.Regexp]) + posixWeakMap = make(map[string]weak.Pointer[regexp.Regexp]) + reverseMap = make(map[weak.Pointer[regexp.Regexp]]string) + l sync.Mutex +) + +// CompileInterned compiles and interns a regular expression and returns a +// pointer to it or an error. +func CompileInterned(pattern string) (*regexp.Regexp, error) { + return compile(pattern, regexp.Compile, weakMap) +} + +// CompilePOSIXInterned compiles and interns a regular expression using POSIX +// syntax and returns a pointer to it or an error. +func CompilePOSIXInterned(pattern string) (*regexp.Regexp, error) { + return compile(pattern, regexp.CompilePOSIX, posixWeakMap) +} + +// MustCompileInterned compiles and interns a regular expression and returns a pointer to it or panics. +func MustCompileInterned(pattern string) *regexp.Regexp { + return mustCompile(pattern, regexp.MustCompile, weakMap) +} + +// MustCompilePOSIXInterned compiles and interns a regular expression using +// POSIX syntax and returns a pointer to it or panics. +func MustCompilePOSIXInterned(pattern string) *regexp.Regexp { + return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap) +} + +// compile handles compiling and interning regular expressions. If the regexp is +// already interned, a pointer to it is returned from the map. If the regexp is +// not interned or is nil (since it's a weak pointer), it is compiled and stored +// in the maps. The regexp is stored in the maps as a weak pointer, so that it +// can be garbage collected. +func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), internedPointers map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) { + l.Lock() + defer l.Unlock() + if itemPtr, ok := internedPointers[pattern]; ok { + ptr := itemPtr.Value() + if ptr != nil { + return ptr, nil + } + delete(internedPointers, pattern) + delete(reverseMap, itemPtr) + } + r, err := compileFunc(pattern) + if err != nil { + return nil, err + } + weakPointer := weak.Make(r) + internedPointers[pattern] = weakPointer + reverseMap[weakPointer] = pattern + + // Register a cleanup function for the regexp object + cleanup := func(ptr weak.Pointer[regexp.Regexp]) { + cleanupCollectedPointers(ptr, internedPointers) + } + runtime.AddCleanup(r, cleanup, weakPointer) + + return r, nil +} + +// mustCompile is a wrapper around compile that is used when we want to panic +// instead of returning an error. If the regexp is already interned, a pointer +// to it is returned from the map. If the regexp is not interned or is nil +// (since it's a weak pointer), it is compiled and stored in the maps. The +// regexp is stored in the maps as a weak pointer, so that it can be garbage +// collected. +func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, internedPointers map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp { + compileFunc := func(string) (*regexp.Regexp, error) { + return mustCompileFunc(pattern), nil + } + res, _ := compile(pattern, compileFunc, internedPointers) + return res +} + +// cleanupCollectedPointers is a cleanup function for *regexp.Regexp. It removes +// the entries from relevant maps when the regexp object they point to is +// garbage collected. +func cleanupCollectedPointers(ptr weak.Pointer[regexp.Regexp], internedPointers map[string]weak.Pointer[regexp.Regexp]) { + l.Lock() + defer l.Unlock() + if s, ok := reverseMap[ptr]; ok { + delete(internedPointers, s) + delete(reverseMap, ptr) + } +} diff --git a/regexp/regexp_test.go b/regexp/regexp_test.go new file mode 100644 index 0000000..1f29572 --- /dev/null +++ b/regexp/regexp_test.go @@ -0,0 +1,181 @@ +package regexp + +import ( + "regexp" + "runtime" + "strconv" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// TestInternedRegexps tests that the regular expressions are compiled correctly, +// are interned, and that the cleanup of interned regexps works as expected. +// +// Since this test depends on the garbage collector, it is not really +// deterministic and might flake in the future. If that happens, the calls to +// the garbage collector and the rest of the test should be removed. +func TestInternedRegexps(t *testing.T) { + testCases := map[string]struct { + compileFunc func(string) (*regexp.Regexp, error) + mustCompileFunc func(string) *regexp.Regexp + mustCompile bool + }{ + "CompileInterned": { + compileFunc: CompileInterned, + mustCompileFunc: nil, + mustCompile: false, + }, + "MustCompileInterned": { + compileFunc: nil, + mustCompileFunc: MustCompileInterned, + mustCompile: true, + }, + "CompilePOSIXInterned": { + compileFunc: CompilePOSIXInterned, + mustCompileFunc: nil, + mustCompile: false, + }, + "MustCompilePOSIXInterned": { + compileFunc: nil, + mustCompileFunc: MustCompilePOSIXInterned, + mustCompile: true, + }, + } + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + // Compile two identical regular expressions, their pointers should be the same. + var r1, r2 *regexp.Regexp + var err error + pattern := ".*" + if tc.mustCompile { + r1 = tc.mustCompileFunc(pattern) + r2 = tc.mustCompileFunc(pattern) + } else { + r1, err = tc.compileFunc(pattern) + require.NoError(t, err) + r2, err = tc.compileFunc(pattern) + require.NoError(t, err) + + // While we're here, check that errors work as expected. + _, err = tc.compileFunc("(") + require.Error(t, err) + } + require.True(t, r1 == r2) + + // Remove references to the regexps, and run the garbage collector in a loop to see if the cleanup happens. + r1 = nil + r2 = nil + deadline := time.Now().Add(10 * time.Second) + for { + // Run the garbage collector twice to increase chances of the cleanup happening. + // This still doesn't make it deterministic, but in local testing it was enough + // to not flake a single time in over two million runs, so it should be good enough. + // A single call to runtime.GC() was flaking very frequently in local testing. + runtime.GC() + runtime.GC() + + // Ensure that the cleanup happened and the maps used for interning regexp are empty. + l.Lock() + wmlen := len(weakMap) + rmlen := len(reverseMap) + l.Unlock() + + if wmlen == 0 && rmlen == 0 { + // Cleanup happened, test can exit successfully. + break + } + if time.Now().After(deadline) { + t.Fatalf("cleanup of interned regexps did not happen in time") + } + time.Sleep(1 * time.Second) + } + }) + } +} + +// TestCleanupCorrectKind tests that the cleanup function removes the correct +// kind (POSIX or non-POSIX) of regular expression from the correct backing maps. +func TestCleanupCorrectKind(t *testing.T) { + pattern := ".*" + // Compile a POSIX and a non-POSIX regular expression + posixRegexp, err := CompilePOSIXInterned(pattern) + require.NoError(t, err) + nonPosixRegexp, err := CompileInterned(pattern) + require.NoError(t, err) + + // Ensure they are different pointers + require.NotEqual(t, posixRegexp, nonPosixRegexp) + + // Manually run the cleanup function for the POSIX regular expression + cleanupCollectedPointers(posixWeakMap[pattern], posixWeakMap) + + // Ensure that the POSIX regular expression was removed from the maps + l.Lock() + require.Len(t, posixWeakMap, 0) + require.Len(t, reverseMap, 1) + require.Len(t, weakMap, 1) + l.Unlock() + + // Compile a new POSIX regular expression with the same pattern + posixRegexp, err = CompilePOSIXInterned(pattern) + require.NoError(t, err) + + l.Lock() + require.Len(t, posixWeakMap, 1) + require.Len(t, reverseMap, 2) + require.Len(t, weakMap, 1) + l.Unlock() + + // Manually run the cleanup function for the non-POSIX regular expression + cleanupCollectedPointers(weakMap[pattern], weakMap) + l.Lock() + require.Len(t, weakMap, 0) + require.Len(t, reverseMap, 1) + require.Len(t, posixWeakMap, 1) + l.Unlock() +} + +func BenchmarkRegexps(b *testing.B) { + s := make([]*regexp.Regexp, b.N) + for i := 0; i < b.N; i++ { + s[i] = regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`) + } +} + +func BenchmarkInternedRegexps(b *testing.B) { + s := make([]*regexp.Regexp, b.N) + for i := 0; i < b.N; i++ { + s[i] = MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`) + } +} + +func BenchmarkConcurrentRegexps(b *testing.B) { + var wg sync.WaitGroup + for j := 0; j < runtime.NumCPU(); j++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < b.N; i++ { + regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j)) + } + }() + } + wg.Wait() +} + +func BenchmarkConcurrentInternedRegexps(b *testing.B) { + var wg sync.WaitGroup + for j := 0; j < runtime.NumCPU(); j++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < b.N; i++ { + MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j)) + } + }() + } + wg.Wait() +}