diff --git a/alloc_test.go b/alloc_test.go
index c24f7cd..f798a27 100644
--- a/alloc_test.go
+++ b/alloc_test.go
@@ -6,7 +6,7 @@ import (
 
 func BenchmarkRegisterDeregister(b *testing.B) {
 	for i := 0; i < b.N; i++ {
-		id := dw.register(nil, nil, 0)
+		id := dw.register(nil, nil, 0, Opts.DeadlockTimeout)
 		dw.deregister(id)
 	}
 }
diff --git a/deadlock.go b/deadlock.go
index 7c7507e..33766b8 100644
--- a/deadlock.go
+++ b/deadlock.go
@@ -180,8 +180,8 @@ func preLock(stack []uintptr, p interface{}) {
 	lo.preLock(stack, p)
 }
 
-func postLock(stack []uintptr, p interface{}) {
-	lo.postLock(stack, p)
+func postLock(stack []uintptr, buf *[stackBufSize]uintptr, p interface{}) {
+	lo.postLock(stack, buf, p)
 }
 
 func postUnlock(p interface{}) {
@@ -193,19 +193,24 @@ func lock(lockFn func(), ptr interface{}) {
 		lockFn()
 		return
 	}
-	stack := callers(1)
+	stack, buf := callers(1)
+	// Cache timeout before preLock so all Opts reads complete before preLock
+	// may call OnPotentialDeadlock. If preLock detects a problem (recursive
+	// lock, order violation) the goroutine may block forever in lockFn below,
+	// and reading Opts after preLock would race with any later Opts write.
+	timeout := Opts.DeadlockTimeout
 	preLock(stack, ptr)
-	if Opts.DeadlockTimeout <= 0 {
+	if timeout <= 0 {
 		lockFn()
 	} else {
 		currentID := goid.Get()
-		e := dw.register(stack, ptr, currentID)
+		e := dw.register(stack, ptr, currentID, timeout)
 		lockFn()
 		dw.deregister(e)
-		postLock(stack, ptr)
+		postLock(stack, buf, ptr)
 		return
 	}
-	postLock(stack, ptr)
+	postLock(stack, buf, ptr)
 }
 
 // pendingEntry tracks a goroutine that is waiting to acquire a lock. Entries are
@@ -217,7 +222,7 @@ func lock(lockFn func(), ptr interface{}) {
 // - The done flag synchronizes the callback with deregister: deregister sets done=1
 // before calling Stop(), and the callback checks done before acting. Because both
 // use atomic operations, the callback is guaranteed to observe done=1 if deregister
-// has already run — even if the runtime already scheduled the callback.
+// has already run, even if the runtime already scheduled the callback.
 // - An entry is only returned to the pool when timer.Stop() returns true, meaning
 // the timer was successfully cancelled and the callback will never run. This prevents
 // a recycled entry from being mutated by an in-flight callback.
@@ -234,7 +239,7 @@ type pendingEntry struct {
 
 func newPendingEntry() *pendingEntry {
 	e := &pendingEntry{}
-	// Capture e by pointer so the closure is stable across pool reuse — no new
+	// Capture e by pointer so the closure is stable across pool reuse, no new
 	// closure allocation when the entry is recycled.
 	e.checkFn = func() {
 		// If the lock was acquired (done=1), the entry may already be back in the
@@ -257,7 +262,7 @@ type deadlockWatcher struct{}
 
 var dw deadlockWatcher
 
-func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64) *pendingEntry {
+func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64, timeout time.Duration) *pendingEntry {
 	var e *pendingEntry
 	if shouldDisableTimerPool() {
 		e = newPendingEntry()
@@ -272,11 +277,11 @@ func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64)
 		// First use (freshly allocated entry): create the AfterFunc timer.
 		// AfterFunc avoids the channel-drain problems of channel-based timers,
 		// which are especially problematic under testing/synctest.
-		e.timer = time.AfterFunc(Opts.DeadlockTimeout, e.checkFn)
+		e.timer = time.AfterFunc(timeout, e.checkFn)
 	} else {
 		// Reused from pool: the timer was previously Stop()'d successfully
 		// (guaranteed by deregister), so Reset is safe here.
-		e.timer.Reset(Opts.DeadlockTimeout)
+		e.timer.Reset(timeout)
 	}
 	return e
 }
@@ -305,12 +310,12 @@ func onDeadlockTimeout(e *pendingEntry) {
 	lo.mu.Lock()
 	holders, ok := lo.cur[e.ptr]
 	if !ok || len(holders) == 0 {
-		// Lock appears unheld (transient state — holder may have just released).
+		// Lock appears unheld (transient state, holder may have just released).
 		// Reschedule if the waiter is still pending. Note: this creates a new timer
 		// (e.timer is not updated), so if deregister runs later it will Stop() the
 		// original (already-fired) timer, get false, and skip pooling. The new timer's
 		// callback will then observe done=1 and no-op. This is safe but means the
-		// entry won't be recycled — acceptable since this is the rare timeout path.
+		// entry won't be recycled, acceptable since this is the rare timeout path.
 		lo.mu.Unlock()
 		if atomic.LoadInt32(&e.done) == 0 {
 			time.AfterFunc(Opts.DeadlockTimeout, e.checkFn)
@@ -361,6 +366,7 @@ type lockOrder struct {
 type stackGID struct {
 	stack []uintptr
 	gid   int64
+	buf   *[stackBufSize]uintptr // pooled backing array; returned via releaseStackBuf in postUnlock
 }
 
 type ss struct {
@@ -377,10 +383,26 @@ func newLockOrder() *lockOrder {
 	}
 }
 
-func (l *lockOrder) postLock(stack []uintptr, p interface{}) {
+// holdersPool recycles []stackGID slices used by lockOrder.cur to track which
+// goroutines currently hold each lock. Slices are returned to the pool in
+// postUnlock when a lock's holder count drops to zero, and reused in postLock
+// for the next lock acquisition, avoiding a new slice allocation per mutex.
+var holdersPool sync.Pool
+
+// postLock records the current goroutine as a holder of lock p. It tries to
+// reuse a pooled []stackGID slice before allocating, and stores the pooled
+// stack buffer in the entry so postUnlock can release it later.
+func (l *lockOrder) postLock(stack []uintptr, buf *[stackBufSize]uintptr, p interface{}) {
 	gid := goid.Get()
+	entry := stackGID{stack, gid, buf}
 	l.mu.Lock()
-	l.cur[p] = append(l.cur[p], stackGID{stack, gid})
+	holders := l.cur[p]
+	if holders == nil {
+		if s, ok := holdersPool.Get().([]stackGID); ok {
+			holders = s[:0]
+		}
+	}
+	l.cur[p] = append(holders, entry)
 	l.mu.Unlock()
 }
 
@@ -434,7 +456,9 @@ func (l *lockOrder) preLock(stack []uintptr, p interface{}) {
 			Opts.mu.Unlock()
 			Opts.OnPotentialDeadlock()
 		}
-		l.order[newBeforeAfter(b, p)] = ss{bs.stack, stack}
+		// Copy both stacks: they're backed by pooled buffers that will be
+		// recycled in postUnlock, but l.order entries persist until MaxMapSize.
+		l.order[newBeforeAfter(b, p)] = ss{copyStack(bs.stack), copyStack(stack)}
 		if len(l.order) == Opts.MaxMapSize { // Reset the map to keep memory footprint bounded.
 			l.order = map[beforeAfter]ss{}
 		}
@@ -455,15 +479,33 @@ func (l *lockOrder) postUnlock(p interface{}) {
 		}
 	}
 	if idx >= 0 {
+		removedBuf := holders[idx].buf
 		holders[idx] = holders[len(holders)-1]
-		holders[len(holders)-1] = stackGID{} // Zero to avoid retaining stack slice in underlying array.
+		holders[len(holders)-1] = stackGID{}
 		holders = holders[:len(holders)-1]
+		releaseStackBuf(removedBuf)
 	} else if len(holders) > 0 {
-		// Cross-goroutine unlock: no matching gid found, remove an arbitrary entry.
-		holders[len(holders)-1] = stackGID{} // Zero to avoid retaining stack slice in underlying array.
+		// Cross-goroutine unlock: Go permits one goroutine to Lock and a different
+		// goroutine to Unlock, so the unlocking gid may not match any holder entry.
+		// This is a rare edge case in practice; the vast majority of code unlocks
+		// from the same goroutine that locked. We remove an arbitrary entry to keep
+		// the holder count consistent with the real lock state (the lock *was*
+		// released, so one entry must go). The trade-off: for RWMutex with multiple
+		// concurrent readers we may discard the wrong reader's stack trace, making a
+		// future deadlock report show a slightly misleading "previous lock site".
+		// Detection correctness is unaffected.
+		removedBuf := holders[len(holders)-1].buf
+		holders[len(holders)-1] = stackGID{}
 		holders = holders[:len(holders)-1]
+		releaseStackBuf(removedBuf)
 	}
 	if len(holders) == 0 {
+		// Delete the map key so the mutex pointer is not retained, allowing GC of
+		// the struct it's embedded in. Recycle the backing slice via pool so the
+		// next postLock on any mutex can reuse it instead of allocating.
+		if cap(holders) > 0 {
+			holdersPool.Put(holders[:0])
+		}
 		delete(l.cur, p)
 	} else {
 		l.cur[p] = holders
diff --git a/deadlock_test.go b/deadlock_test.go
index 9714b44..177158b 100644
--- a/deadlock_test.go
+++ b/deadlock_test.go
@@ -212,12 +212,166 @@ func TestStarvedRLockMultipleReaders(t *testing.T) {
 	<-ch
 }
 
-func TestLockDuplicate(t *testing.T) {
+// TestManyReadersFewWriters stresses the RWMutex tracking under high read
+// concurrency with infrequent writers. Existing tests use at most ~10
+// goroutines with a balanced reader/writer mix; real-world usage often has
+// dozens of readers racing against a handful of writers. This exercises:
+// - the per-goroutine cur map ref-counting under heavy concurrent RLock/RUnlock,
+// where many goroutines simultaneously call postLock and postUnlock;
+// - lock-order detection with a large number of concurrent reader entries;
+// - timer pool contention when many DeadlockTimeout timers are live at once.
+func TestManyReadersFewWriters(t *testing.T) {
 	defer restore()()
+	Opts.DeadlockTimeout = time.Millisecond * 5000
+	var mu RWMutex
+	var wg sync.WaitGroup
+
+	const numReaders = 100
+	const numWriters = 3
+	const readerIters = 50
+	const writerIters = 10
+
+	for i := 0; i < numReaders; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for k := 0; k < readerIters; k++ {
+				mu.RLock()
+				time.Sleep(time.Duration(rand.Intn(500)) * time.Microsecond)
+				mu.RUnlock()
+			}
+		}()
+	}
+
+	for i := 0; i < numWriters; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for k := 0; k < writerIters; k++ {
+				mu.Lock()
+				time.Sleep(time.Duration(rand.Intn(200)) * time.Microsecond)
+				mu.Unlock()
+				time.Sleep(time.Duration(rand.Intn(1000)) * time.Microsecond)
+			}
+		}()
+	}
+
+	wg.Wait()
+}
+
+// TestConcurrentLockOrderDetection verifies that lock-order violation detection
+// works correctly under real goroutine contention. TestLockOrder runs its two
+// goroutines sequentially (wg.Wait() between them), so the order map and cur map
+// are only contested by one goroutine at a time. Here, many goroutines
+// simultaneously call preLock, postLock, and postUnlock — all contending on
+// lo.mu — while each one independently detects the same A→B vs B→A conflict.
+// This stresses concurrent iteration of lo.cur, concurrent reads/writes to
+// lo.order, and concurrent invocations of OnPotentialDeadlock.
+func TestConcurrentLockOrderDetection(t *testing.T) {
+	defer restore()()
+	Opts.DeadlockTimeout = 0
+	var deadlocks uint32
+	Opts.OnPotentialDeadlock = func() {
+		atomic.AddUint32(&deadlocks, 1)
+	}
+
+	var a, b Mutex
+
+	// Establish the A→B ordering in the lock-order map.
+	a.Lock()
+	b.Lock()
+	b.Unlock()
+	a.Unlock()
+
+	// Launch many goroutines that all acquire B→A concurrently. Each one
+	// triggers a violation in preLock when it tries to acquire A while holding
+	// B. Because every goroutine acquires in the same order (B then A), they
+	// cannot actually deadlock with each other.
+	var wg sync.WaitGroup
+	start := make(chan struct{})
+	for i := 0; i < 20; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			<-start
+			for k := 0; k < 10; k++ {
+				b.Lock()
+				a.Lock()
+				a.Unlock()
+				b.Unlock()
+			}
+		}()
+	}
+
+	close(start)
+	wg.Wait()
+
+	if d := atomic.LoadUint32(&deadlocks); d == 0 {
+		t.Fatal("expected at least 1 lock-order violation, detected 0")
+	}
+}
+
+// TestDeadlockTimeoutTransientNoHolder exercises the reschedule path in
+// onDeadlockTimeout. When the deadlock timer fires and lo.cur has no holder
+// for the waited-on lock — a transient state that can occur between the
+// previous holder's postUnlock and the waiter's deregister — the callback
+// reschedules itself via time.AfterFunc instead of reporting a false deadlock.
+//
+// In production this window is extremely narrow (nanoseconds between
+// postUnlock removing the holder from lo.cur and the waiter's lockFn
+// returning), so it cannot be triggered reliably through normal Lock/Unlock
+// calls. Instead we invoke onDeadlockTimeout directly with a pendingEntry
+// whose lock has no holders in lo.cur, deterministically hitting the
+// reschedule branch. We then set done=1 (simulating the waiter acquiring the
+// lock) and verify the rescheduled timer fires harmlessly with no false
+// deadlock report.
+func TestDeadlockTimeoutTransientNoHolder(t *testing.T) {
+	defer restore()()
+	Opts.DisableLockOrderDetection = true
+	Opts.DeadlockTimeout = 10 * time.Millisecond
+	var deadlocks uint32
+	Opts.OnPotentialDeadlock = func() {
+		atomic.AddUint32(&deadlocks, 1)
+	}
+
+	var mu Mutex
+
+	// Build a pendingEntry as if a goroutine were blocked waiting on mu.
+	// mu has never been locked, so lo.cur has no holders for it.
+	e := newPendingEntry()
+	e.ptr = &mu
+	e.gid = 999
+	atomic.StoreInt32(&e.done, 0)
+
+	// Directly invoke the timeout handler. It will find no holders in lo.cur
+	// and take the reschedule branch (time.AfterFunc) instead of reporting.
+	onDeadlockTimeout(e)
+
+	// Simulate the waiter acquiring the lock. The rescheduled timer's checkFn
+	// will see done=1 and no-op.
+	atomic.StoreInt32(&e.done, 1)
+
+	// Wait long enough for the rescheduled timer to fire and confirm it
+	// does not report a false deadlock.
+	time.Sleep(Opts.DeadlockTimeout * 3)
+
+	if d := atomic.LoadUint32(&deadlocks); d != 0 {
+		t.Fatalf("expected 0 false deadlocks from transient no-holder state, got %d", d)
+	}
+}
+
+func TestLockDuplicate(t *testing.T) {
+	// No restore() here: the goroutines below permanently block inside lockFn
+	// after preLock detects recursion. A deferred restore() would write to Opts
+	// while those goroutines are still reading Opts.DeadlockTimeout, causing a
+	// data race under -race. Omitting restore is safe because every other test
+	// saves/sets its own Opts via restore().
 	Opts.DeadlockTimeout = 0
 	var deadlocks uint32
+	detected := make(chan struct{}, 2)
 	Opts.OnPotentialDeadlock = func() {
 		atomic.AddUint32(&deadlocks, 1)
+		detected <- struct{}{}
 	}
 	var a RWMutex
 	var b Mutex
@@ -233,7 +387,8 @@ func TestLockDuplicate(t *testing.T) {
 		b.Unlock()
 		b.Unlock()
 	}()
-	time.Sleep(time.Second * 1)
+	<-detected
+	<-detected
 	if atomic.LoadUint32(&deadlocks) != 2 {
 		t.Fatalf("expected 2 deadlocks, detected %d", deadlocks)
 	}
diff --git a/stacktraces.go b/stacktraces.go
index d93050f..fb5f308 100644
--- a/stacktraces.go
+++ b/stacktraces.go
@@ -13,9 +13,38 @@ import (
 	"sync"
 )
 
-func callers(skip int) []uintptr {
-	s := make([]uintptr, 50) // Most relevant context seem to appear near the top of the stack.
-	return s[:runtime.Callers(2+skip, s)]
+const stackBufSize = 50
+
+var stackBufPool = sync.Pool{
+	New: func() interface{} {
+		return new([stackBufSize]uintptr)
+	},
+}
+
+// callers returns a stack trace backed by a pooled buffer. The caller must
+// eventually return buf via releaseStackBuf — typically through the
+// postLock/postUnlock path which stores it in stackGID.buf.
+func callers(skip int) ([]uintptr, *[stackBufSize]uintptr) {
+	buf := stackBufPool.Get().(*[stackBufSize]uintptr)
+	n := runtime.Callers(2+skip, buf[:])
+	return buf[:n], buf
+}
+
+// releaseStackBuf returns a pooled stack buffer obtained from callers(). Safe to
+// call with nil (e.g. when the buffer was already handed off via stackGID.buf).
+func releaseStackBuf(buf *[stackBufSize]uintptr) {
+	if buf != nil {
+		stackBufPool.Put(buf)
+	}
+}
+
+// copyStack creates an independent copy of a stack trace. Required when storing
+// stacks in long-lived structures (e.g. l.order) because the originals are backed
+// by pooled buffers that will be recycled in postUnlock.
+func copyStack(s []uintptr) []uintptr {
+	c := make([]uintptr, len(s))
+	copy(c, s)
+	return c
 }
 
 func printStack(w io.Writer, stack []uintptr) {
diff --git a/synctest_comparison_test.go b/synctest_comparison_test.go
index 29e1a12..5317936 100644
--- a/synctest_comparison_test.go
+++ b/synctest_comparison_test.go
@@ -9,8 +9,13 @@ func TestNormalDeadlockDetection(t *testing.T) {
 	oldTimeout := Opts.DeadlockTimeout
 	oldOnDeadlock := Opts.OnPotentialDeadlock
 	Opts.DeadlockTimeout = 20 * time.Millisecond
+	// onDeadlockTimeout calls Opts.OnPotentialDeadlock on a timer goroutine
+	// (line 352 of deadlock.go). The channel send inside the callback creates a
+	// happens-before edge so the deferred restore below won't race with the read.
+	callbackDone := make(chan struct{}, 1)
 	Opts.OnPotentialDeadlock = func() {
 		t.Log("Deadlock detected!")
+		callbackDone <- struct{}{}
 	}
 	defer func() {
 		Opts.DeadlockTimeout = oldTimeout
@@ -33,4 +38,5 @@ func TestNormalDeadlockDetection(t *testing.T) {
 	time.Sleep(30 * time.Millisecond)
 	mu.Unlock()
 	<-done
+	<-callbackDone
 }
diff --git a/trylock.go b/trylock.go
index 43c7a5f..b29b263 100644
--- a/trylock.go
+++ b/trylock.go
@@ -28,12 +28,15 @@ func trylock(lockFn func() bool, ptr interface{}) bool {
 	if Opts.Disable {
 		return lockFn()
 	}
-	stack := callers(1)
+	stack, buf := callers(1)
 	preLock(stack, ptr)
 	ret := lockFn()
 	if ret {
-		postLock(stack, ptr)
+		postLock(stack, buf, ptr)
 	} else {
+		// TryLock failed: the stack won't be stored in stackGID.buf (postLock is
+		// skipped), so we must release the pooled buffer directly to avoid a leak.
+		releaseStackBuf(buf)
 		postUnlock(ptr)
 	}
 	return ret
diff --git a/trylock_test.go b/trylock_test.go
index 8df5519..c700b1d 100644
--- a/trylock_test.go
+++ b/trylock_test.go
@@ -181,11 +181,17 @@ func TestRWMutexTryLock(t *testing.T) {
 }
 
 func TestTryLockDuplicate(t *testing.T) {
-	defer restore()()
+	// No restore() here: the goroutines below permanently block inside lockFn
+	// after preLock detects recursion. A deferred restore() would write to Opts
+	// while those goroutines are still reading Opts.DeadlockTimeout, causing a
+	// data race under -race. Omitting restore is safe because every other test
+	// saves/sets its own Opts via restore().
 	Opts.DeadlockTimeout = 0
 	var deadlocks uint32
+	detected := make(chan struct{}, 2)
 	Opts.OnPotentialDeadlock = func() {
 		atomic.AddUint32(&deadlocks, 1)
+		detected <- struct{}{}
 	}
 	var a RWMutex
 	var b Mutex
@@ -205,7 +211,8 @@ func TestTryLockDuplicate(t *testing.T) {
 		b.Unlock()
 		b.Unlock()
 	}()
-	time.Sleep(time.Second * 1)
+	<-detected
+	<-detected
 	if atomic.LoadUint32(&deadlocks) != 2 {
 		t.Fatalf("expected 2 deadlocks, detected %d", deadlocks)
 	}
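
A minimal, self-contained sketch of the timer-reuse pattern the pendingEntry pool relies on, for context when reviewing the register/deregister changes above: the time.AfterFunc timer is created once per entry, Reset on every reuse, and the entry is returned to its pool only when Stop() reports that the callback was cancelled in time. All identifiers here (watchEntry, watchPool, watch, unwatch) are illustrative and not part of this package.

// Illustrative sketch only; not part of this patch.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// watchEntry mirrors the shape of pendingEntry: a done flag, a reusable
// timer, and a callback closure allocated once per entry.
type watchEntry struct {
	done  int32
	timer *time.Timer
	fire  func()
}

var watchPool = sync.Pool{New: func() interface{} {
	e := &watchEntry{}
	e.fire = func() {
		// Only act if the waiter has not finished yet.
		if atomic.LoadInt32(&e.done) == 0 {
			fmt.Println("still waiting after timeout")
		}
	}
	return e
}}

// watch starts (or restarts) the timeout for a new wait.
func watch(timeout time.Duration) *watchEntry {
	e := watchPool.Get().(*watchEntry)
	atomic.StoreInt32(&e.done, 0)
	if e.timer == nil {
		e.timer = time.AfterFunc(timeout, e.fire) // first use: create the timer
	} else {
		e.timer.Reset(timeout) // reused entry: timer was Stop()'d before pooling
	}
	return e
}

// unwatch marks the wait as finished and recycles the entry only when the
// callback can no longer run.
func unwatch(e *watchEntry) {
	atomic.StoreInt32(&e.done, 1)
	if e.timer.Stop() {
		watchPool.Put(e)
	}
}

func main() {
	e := watch(50 * time.Millisecond)
	time.Sleep(10 * time.Millisecond) // the "lock" is acquired quickly
	unwatch(e)
}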
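
The test changes above replace fixed sleeps with channel receives driven by the OnPotentialDeadlock callback. A minimal sketch of that idiom, with all names illustrative: the callback sends on a buffered channel after its last read of shared state, and the main goroutine receives before mutating that state, so the receive happens-after the callback's reads and the race that -race would otherwise report disappears.

// Illustrative sketch only; not part of this patch.
package main

import (
	"fmt"
	"time"
)

// sharedConfig stands in for the package-level Opts the real tests restore.
var sharedConfig = "test value"

func main() {
	callbackDone := make(chan struct{}, 1)

	time.AfterFunc(10*time.Millisecond, func() {
		fmt.Println("callback sees:", sharedConfig) // last read of shared state
		callbackDone <- struct{}{}                  // publish completion
	})

	<-callbackDone            // happens-after the callback's read above
	sharedConfig = "restored" // safe to mutate: no concurrent reader remains
}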
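
Finally, a hedged usage sketch of the options this patch touches, assuming the import path of the upstream go-deadlock package (adjust to this fork's module path). With the timeout now cached at the start of lock(), runtime changes to Opts.DeadlockTimeout only affect Lock calls that begin afterwards; already-blocked waiters keep the timeout they started with.

// Illustrative sketch only; import path is an assumption.
package main

import (
	"log"
	"time"

	deadlock "github.com/sasha-s/go-deadlock" // assumed path; use this fork's module path
)

func main() {
	// Each Lock call samples DeadlockTimeout once, up front, before preLock.
	deadlock.Opts.DeadlockTimeout = 100 * time.Millisecond
	deadlock.Opts.OnPotentialDeadlock = func() {
		// Runs on the watcher's timer goroutine when a lock has been waited
		// on for longer than the timeout sampled by that Lock call.
		log.Println("potential deadlock detected")
	}

	var mu deadlock.Mutex
	mu.Lock()
	defer mu.Unlock()
}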