Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
439bf50
Drop in replacement for the Go regexp package that does temporal cach…
sgmiller Aug 30, 2024
c35e29d
Weak value version: Implements a poor man's weak valued map, which c…
sgmiller Sep 3, 2024
84aceb1
fix locking semantics
sgmiller Sep 3, 2024
892773d
fix locking in other func, and use the right base maps
sgmiller Sep 3, 2024
cf697c9
Rename to be distinct from the original functions and thus not as sub…
sgmiller Sep 3, 2024
bda7c56
Simplify locking, add benchmarks
sgmiller Sep 3, 2024
fb4f51d
remove unneeded deps
sgmiller Feb 18, 2025
006c382
Refactor the code to use weak pointers for weak map implementation
kubawi Feb 19, 2025
b2aa9bd
If a weak pointer is nil, remove the entry from maps
kubawi Feb 19, 2025
d583b64
Add regexp to the module list in the GitHub Actions go.yml workflow f…
kubawi Feb 20, 2025
199d464
Update the Go directive in go.mod to version 1.24, so that it's clear…
kubawi Feb 20, 2025
071bc6f
Swap RWMutex for a Mutex
kubawi Feb 20, 2025
7eaf9de
Tidying up based on PR feedback; rename some variables to fix shadowi…
kubawi Feb 20, 2025
0230859
Fix the formatting of go.yml to resolve a git conflict with main branch
kubawi Feb 20, 2025
d99b0b1
Refactor mustCompile to wrap compile in a closure to reduce code dupl…
kubawi Feb 20, 2025
72b29d1
Merge branch 'main' into sgm/regexp-caching
kubawi Feb 20, 2025
9c844f2
Tidy up imports in regexp/regexp_test.go
kubawi Feb 20, 2025
b21b980
Refactor compile and mustCompile to only delete the pointers from the…
kubawi Feb 21, 2025
ea0066a
Rename cleanup to cleanupCollectedPointers and refactor it to only de…
kubawi Feb 21, 2025
0b4566a
Add a retry mechanism with a timeout to TestInternedRegexps
kubawi Feb 24, 2025
5d9a94a
Add a test that ensures that the cleanup function does not confuse PO…
kubawi Feb 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ jobs:
- "plugincontainer"
- "pluginutil"
- "random"
- "regexp"
- "reloadutil"
- "strutil"
- "temperror"
Expand Down
11 changes: 11 additions & 0 deletions regexp/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
module github.com/hashicorp/go-secure-stdlib/regexp

go 1.24

require github.com/stretchr/testify v1.10.0

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
10 changes: 10 additions & 0 deletions regexp/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
98 changes: 98 additions & 0 deletions regexp/regexp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package regexp

import (
"regexp"
"runtime"
"sync"
"weak"
)

// Interns the compilation of Regular Expressions. If two regexs with the same pattern are compiled, the result
// is the same *regexp.Regexp. This avoids the compilation cost but more importantly the memory usage.
//
// Regular expressions produced from this package are backed by a form of weak-valued map, upon a regexp becoming
// unreachable, they will be eventually removed from the map and memory reclaimed.

var (
weakMap = make(map[string]weak.Pointer[regexp.Regexp])
posixWeakMap = make(map[string]weak.Pointer[regexp.Regexp])
reverseMap = make(map[weak.Pointer[regexp.Regexp]]string)
l sync.Mutex
)

// CompileInterned compiles and interns a regular expression and returns a
// pointer to it or an error.
func CompileInterned(pattern string) (*regexp.Regexp, error) {
return compile(pattern, regexp.Compile, weakMap)
}

// CompilePOSIXInterned compiles and interns a regular expression using POSIX
// syntax and returns a pointer to it or an error.
func CompilePOSIXInterned(pattern string) (*regexp.Regexp, error) {
return compile(pattern, regexp.CompilePOSIX, posixWeakMap)
}

// MustCompileInterned compiles and interns a regular expression and returns a pointer to it or panics.
func MustCompileInterned(pattern string) *regexp.Regexp {
return mustCompile(pattern, regexp.MustCompile, weakMap)
}

// MustCompilePOSIXInterned compiles and interns a regular expression using
// POSIX syntax and returns a pointer to it or panics.
func MustCompilePOSIXInterned(pattern string) *regexp.Regexp {
return mustCompile(pattern, regexp.MustCompilePOSIX, posixWeakMap)
}

// compile handles compiling and interning regular expressions. If the regexp is
// already interned, a pointer to it is returned from the map. If the regexp is
// not interned or is nil (since it's a weak pointer), it is compiled and stored
// in the maps. The regexp is stored in the maps as a weak pointer, so that it
// can be garbage collected.
func compile(pattern string, compileFunc func(string) (*regexp.Regexp, error), m map[string]weak.Pointer[regexp.Regexp]) (*regexp.Regexp, error) {
l.Lock()
Comment thread
kubawi marked this conversation as resolved.
defer l.Unlock()
if itemPtr, ok := m[pattern]; ok {
ptr := itemPtr.Value()
if ptr != nil {
return ptr, nil
}
delete(m, pattern)
delete(posixWeakMap, pattern)
Comment thread
kubawi marked this conversation as resolved.
Outdated
delete(reverseMap, itemPtr)
}
r, err := compileFunc(pattern)
if err != nil {
return nil, err
}
weakPointer := weak.Make(r)
m[pattern] = weakPointer
reverseMap[weakPointer] = pattern
runtime.AddCleanup(r, cleanup, weakPointer)
return r, nil
}

// mustCompile handles compiling and interning regular expressions just like
// compile, but it will panic instead of returning an error. If the regexp is
// already interned, a pointer to it is returned from the map. If the regexp is
// not interned or is nil (since it's a weak pointer), it is compiled and stored
// in the maps. The regexp is stored in the maps as a weak pointer, so that it
// can be garbage collected.
func mustCompile(pattern string, mustCompileFunc func(string) *regexp.Regexp, weakMap map[string]weak.Pointer[regexp.Regexp]) *regexp.Regexp {
compileFunc := func(string) (*regexp.Regexp, error) {
return mustCompileFunc(pattern), nil
}
res, _ := compile(pattern, compileFunc, weakMap)
return res
}

// cleanup is a cleanup function for *regexp.Regexp. It removes the entries from the
// weak maps when the regexp object they point to is garbage collected.
func cleanup(ptr weak.Pointer[regexp.Regexp]) {
l.Lock()
defer l.Unlock()
if s, ok := reverseMap[ptr]; ok {
delete(weakMap, s)
delete(posixWeakMap, s)
delete(reverseMap, ptr)
}
}
125 changes: 125 additions & 0 deletions regexp/regexp_test.go
Comment thread
kubawi marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package regexp

import (
"github.com/stretchr/testify/require"
"regexp"
"runtime"
"strconv"
"sync"
"testing"
Comment thread
kubawi marked this conversation as resolved.
)

// TestInternedRegexps tests that the regular expressions are compiled correctly,
// are interned, and that the cleanup of interned regexps works as expected.
//
// Since this test depends on the garbage collector, it is not really
// deterministic and might flake in the future. If that happens, the calls to
// the garbage collector and the rest of the test should be removed.
func TestInternedRegexps(t *testing.T) {
testCases := map[string]struct {
compileFunc func(string) (*regexp.Regexp, error)
mustCompileFunc func(string) *regexp.Regexp
mustCompile bool
}{
"CompileInterned": {
compileFunc: CompileInterned,
mustCompileFunc: nil,
mustCompile: false,
},
"MustCompileInterned": {
compileFunc: nil,
mustCompileFunc: MustCompileInterned,
mustCompile: true,
},
"CompilePOSIXInterned": {
compileFunc: CompilePOSIXInterned,
mustCompileFunc: nil,
mustCompile: false,
},
"MustCompilePOSIXInterned": {
compileFunc: nil,
mustCompileFunc: MustCompilePOSIXInterned,
mustCompile: true,
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
// Compile two identical regular expressions, their pointers should be the same
var r1, r2 *regexp.Regexp
var err error
if tc.mustCompile {
r1 = tc.mustCompileFunc(".*")
r2 = tc.mustCompileFunc(".*")
} else {
r1, err = tc.compileFunc(".*")
require.NoError(t, err)
r2, err = tc.compileFunc(".*")
require.NoError(t, err)
require.True(t, r1 == r2)

// While we're here, check that errors work as expected
_, err = tc.compileFunc("(")
require.Error(t, err)
}
require.True(t, r1 == r2)
// Remove references to the regexps and run the garbage collector
r1 = nil
r2 = nil

// Run the garbage collector twice to increase chances of the cleanup happening.
// This still doesn't make it deterministic, but in local testing it was enough
// to not flake a single time in over two million runs, so it should be good enough.
// A single call to runtime.GC() was flaky very frequently in local testing.
runtime.GC()
runtime.GC()

// Ensure that the cleanup happened and the maps used for interning regexp are empty
l.Lock()
require.Len(t, weakMap, 0)
require.Len(t, reverseMap, 0)
l.Unlock()
Comment thread
kubawi marked this conversation as resolved.
Outdated
})
}
}

func BenchmarkRegexps(b *testing.B) {
s := make([]*regexp.Regexp, b.N)
for i := 0; i < b.N; i++ {
s[i] = regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`)
}
}

func BenchmarkInternedRegexps(b *testing.B) {
s := make([]*regexp.Regexp, b.N)
for i := 0; i < b.N; i++ {
s[i] = MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)`)
}
}

func BenchmarkConcurrentRegexps(b *testing.B) {
var wg sync.WaitGroup
for j := 0; j < runtime.NumCPU(); j++ {
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < b.N; i++ {
regexp.MustCompile(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j))
}
}()
}
wg.Wait()
}

func BenchmarkConcurrentInternedRegexps(b *testing.B) {
var wg sync.WaitGroup
for j := 0; j < runtime.NumCPU(); j++ {
wg.Add(1)
go func() {
defer wg.Done()
for i := 0; i < b.N; i++ {
MustCompileInterned(`https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)` + strconv.Itoa(i) + "-" + strconv.Itoa(j))
}
}()
}
wg.Wait()
}