Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
439bf50
Drop in replacement for the Go regexp package that does temporal cach…
sgmiller Aug 30, 2024
c35e29d
Weak value version: Implements a poor man's weak valued map, which c…
sgmiller Sep 3, 2024
84aceb1
fix locking semantics
sgmiller Sep 3, 2024
892773d
fix locking in other func, and use the right base maps
sgmiller Sep 3, 2024
cf697c9
Rename to be distinct from the original functions and thus not as sub…
sgmiller Sep 3, 2024
bda7c56
Simplify locking, add benchmarks
sgmiller Sep 3, 2024
fb4f51d
remove unneeded deps
sgmiller Feb 18, 2025
006c382
Refactor the code to use weak pointers for weak map implementation
kubawi Feb 19, 2025
b2aa9bd
If a weak pointer is nil, remove the entry from maps
kubawi Feb 19, 2025
d583b64
Add regexp to the module list in the GitHub Actions go.yml workflow f…
kubawi Feb 20, 2025
199d464
Update the Go directive in go.mod to version 1.24, so that it's clear…
kubawi Feb 20, 2025
071bc6f
Swap RWMutex for a Mutex
kubawi Feb 20, 2025
7eaf9de
Tidying up based on PR feedback; rename some variables to fix shadowi…
kubawi Feb 20, 2025
0230859
Fix the formatting of go.yml to resolve a git conflict with main branch
kubawi Feb 20, 2025
d99b0b1
Refactor mustCompile to wrap compile in a closure to reduce code dupl…
kubawi Feb 20, 2025
72b29d1
Merge branch 'main' into sgm/regexp-caching
kubawi Feb 20, 2025
9c844f2
Tidy up imports in regexp/regexp_test.go
kubawi Feb 20, 2025
b21b980
Refactor compile and mustCompile to only delete the pointers from the…
kubawi Feb 21, 2025
ea0066a
Rename cleanup to cleanupCollectedPointers and refactor it to only de…
kubawi Feb 21, 2025
0b4566a
Add a retry mechanism with a timeout to TestInternedRegexps
kubawi Feb 24, 2025
5d9a94a
Add a test that ensures that the cleanup function does not confuse PO…
kubawi Feb 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
- "plugincontainer"
- "pluginutil"
- "random"
- "regexp"
- "reloadutil"
- "strutil"
- "temperror"
Expand Down
11 changes: 11 additions & 0 deletions regexp/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
module github.com/hashicorp/go-secure-stdlib/regexp

go 1.24

require github.com/stretchr/testify v1.10.0

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
10 changes: 10 additions & 0 deletions regexp/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
103 changes: 103 additions & 0 deletions regexp/regexp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package regexp

import (
"regexp"
"runtime"
"sync"
"weak"
)

// Interns the compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result
// is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage.
//
// Regular expressions produced from this package are backed by a form of weak-valued map, upon a regexp becoming
// unreachable, they will be eventually removed from the map and memory reclaimed.

var (
weakMap = make(map[string]weak.Pointer[regexp.Regexp])
posixWeakMap = make(map[string]weak.Pointer[regexp.Regexp])
reverseMap = make(map[weak.Pointer[regexp.Regexp]]string)
l sync.Mutex
)

// CompileInterned compiles and interns a regular expression and returns a
// pointer to it or an error.
func CompileInterned(pattern string) (*regexp.Regexp, error) {
return compile(pattern, regexp.Compile, weakMap)
}

// CompilePOSIXInterned compiles and interns a regular expression using POSIX
// syntax and returns a pointer to it or an error.
func CompilePOSIXInterned(pattern string) (*regexp.Regexp, error) {
return compile(pattern, regexp.CompilePOSIX, posixWeakMap)
}

// MustCompileInterned compiles and interns a regular expression and returns a pointer to it or panics.
func MustCompileInterned(pattern string) *regexp.Regexp {
return mustCompile(pattern, regexp.MustCompile, weakMap)
}

// MustCompilePOSIXInterned compiles and interns a regular expression using
// POSIX syntax and returns a pointer to it or panics.
func MustCompilePOSIXInterned(pattern string) *regexp.Regexp {
return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap)
}

// compile handles compiling and interning regular expressions. If the regexp is
// already interned, a pointer to it is returned from the map. If the regexp is
// not interned or is nil (since it's a weak pointer), it is compiled and stored
// in the maps. The regexp is stored in the maps as a weak pointer, so that it
// can be garbage collected.
func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), internedPointers map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) {
l.Lock()
Comment thread
kubawi marked this conversation as resolved.
defer l.Unlock()
if itemPtr, ok := internedPointers[pattern]; ok {
ptr := itemPtr.Value()
if ptr != nil {
return ptr, nil
}
delete(internedPointers, pattern)
delete(reverseMap, itemPtr)
}
r, err := compileFunc(pattern)
if err != nil {
return nil, err
}
weakPointer := weak.Make(r)
internedPointers[pattern] = weakPointer
reverseMap[weakPointer] = pattern

// Register a cleanup function for the regexp object
cleanup := func(ptr weak.Pointer[regexp.Regexp]) {
cleanupCollectedPointers(ptr, internedPointers)
}
runtime.AddCleanup(r, cleanup, weakPointer)

return r, nil
}

// mustCompile is a wrapper around compile that is used when we want to panic
// instead of returning an error. If the regexp is already interned, a pointer
// to it is returned from the map. If the regexp is not interned or is nil
// (since it's a weak pointer), it is compiled and stored in the maps. The
// regexp is stored in the maps as a weak pointer, so that it can be garbage
// collected.
func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, internedPointers map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp {
compileFunc := func(string) (*regexp.Regexp, error) {
return mustCompileFunc(pattern), nil
}
res, _ := compile(pattern, compileFunc, internedPointers)
return res
}

// cleanupCollectedPointers is a cleanup function for *regexp.Regexp. It removes
// the entries from relevant maps when the regexp object they point to is
// garbage collected.
func cleanupCollectedPointers(ptr weak.Pointer[regexp.Regexp], internedPointers map[string]weak.Pointer[regexp.Regexp]) {
l.Lock()
defer l.Unlock()
if s, ok := reverseMap[ptr]; ok {
delete(internedPointers, s)
delete(reverseMap, ptr)
}
}
181 changes: 181 additions & 0 deletions regexp/regexp_test.go
Comment thread
kubawi marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package regexp

import (
"regexp"
"runtime"
"strconv"
"sync"
"testing"
Comment thread
kubawi marked this conversation as resolved.
"time"

"github.com/stretchr/testify/require"
)

// TestInternedRegexps tests that the regular expressions are compiled correctly,
// are interned, and that the cleanup of interned regexps works as expected.
//
// Since this test depends on the garbage collector, it is not really
// deterministic and might flake in the future. If that happens, the calls to
// the garbage collector and the rest of the test should be removed.
func TestInternedRegexps(t *testing.T) {
testCases := map[string]struct {
compileFunc func(string) (*regexp.Regexp, error)
mustCompileFunc func(string) *regexp.Regexp
mustCompile bool
}{
"CompileInterned": {
compileFunc: CompileInterned,
mustCompileFunc: nil,
mustCompile: false,
},
"MustCompileInterned": {
compileFunc: nil,
mustCompileFunc: MustCompileInterned,
mustCompile: true,
},
"CompilePOSIXInterned": {
compileFunc: CompilePOSIXInterned,
mustCompileFunc: nil,
mustCompile: false,
},
"MustCompilePOSIXInterned": {
compileFunc: nil,
mustCompileFunc: MustCompilePOSIXInterned,
mustCompile: true,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
// Compile two identical regular expressions, their pointers should be the same.
var r1, r2 *regexp.Regexp
var err error
pattern := ".*"
if tc.mustCompile {
r1 = tc.mustCompileFunc(pattern)
r2 = tc.mustCompileFunc(pattern)
} else {
r1, err = tc.compileFunc(pattern)
require.NoError(t, err)
r2, err = tc.compileFunc(pattern)
require.NoError(t, err)

// While we're here, check that errors work as expected.
_, err = tc.compileFunc("(")
require.Error(t, err)
}
require.True(t, r1 == r2)

// Remove references to the regexps, and run the garbage collector in a loop to see if the cleanup happens.
r1 = nil
r2 = nil
deadline := time.Now().Add(10 * time.Second)
for {
// Run the garbage collector twice to increase chances of the cleanup happening.
// This still doesn't make it deterministic, but in local testing it was enough
// to not flake a single time in over two million runs, so it should be good enough.
// A single call to runtime.GC() was flaking very frequently in local testing.
runtime.GC()
runtime.GC()

// Ensure that the cleanup happened and the maps used for interning regexp are empty.
l.Lock()
wmlen := len(weakMap)
rmlen := len(reverseMap)
l.Unlock()

if wmlen == 0 && rmlen == 0 {
// Cleanup happened, test can exit successfully.
break
}
if time.Now().After(deadline) {
t.Fatalf("cleanup of interned regexps did not happen in time")
}
time.Sleep(1 * time.Second)
}
})
}
}

// TestCleanupCorrectKind tests that the cleanup function removes the correct
// kind (POSIX or non-POSIX) of regular expression from the correct backing maps.
func TestCleanupCorrectKind(t *testing.T) {
pattern := ".*"
// Compile a POSIX and a non-POSIX regular expression
posixRegexp, err := CompilePOSIXInterned(pattern)
require.NoError(t, err)
nonPosixRegexp, err := CompileInterned(pattern)
require.NoError(t, err)

// Ensure they are different pointers
require.NotEqual(t, posixRegexp, nonPosixRegexp)

// Manually run the cleanup function for the POSIX regular expression
cleanupCollectedPointers(posixWeakMap[pattern], posixWeakMap)

// Ensure that the POSIX regular expression was removed from the maps
l.Lock()
require.Len(t, posixWeakMap, 0)
require.Len(t, reverseMap, 1)
require.Len(t, weakMap, 1)
l.Unlock()

// Compile a new POSIX regular expression with the same pattern
posixRegexp, err = CompilePOSIXInterned(pattern)
require.NoError(t, err)

l.Lock()
require.Len(t, posixWeakMap, 1)
require.Len(t, reverseMap, 2)
require.Len(t, weakMap, 1)
l.Unlock()

// Manually run the cleanup function for the non-POSIX regular expression
cleanupCollectedPointers(weakMap[pattern], weakMap)
l.Lock()
require.Len(t, weakMap, 0)
require.Len(t, reverseMap, 1)
require.Len(t, posixWeakMap, 1)
l.Unlock()
}

func BenchmarkRegexps(b *testing.B) {
s := make([]*regexp.Regexp, b.N)
for i := 0; i < b.N; i++ {
s[i] = regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`)
}
}

func BenchmarkInternedRegexps(b *testing.B) {
s := make([]*regexp.Regexp, b.N)
for i := 0; i < b.N; i++ {
s[i] = MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`)
}
}

func BenchmarkConcurrentRegexps(b *testing.B) {
var wg sync.WaitGroup
for j := 0; j < runtime.NumCPU(); j++ {
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < b.N; i++ {
regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j))
}
}()
}
wg.Wait()
}

func BenchmarkConcurrentInternedRegexps(b *testing.B) {
var wg sync.WaitGroup
for j := 0; j < runtime.NumCPU(); j++ {
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < b.N; i++ {
MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j))
}
}()
}
wg.Wait()
}