Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion alloc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (

// BenchmarkRegisterDeregister measures the cost of one register/deregister
// cycle on the package-level deadlock watcher — the per-Lock bookkeeping
// overhead paid whenever a DeadlockTimeout is configured.
func BenchmarkRegisterDeregister(b *testing.B) {
	for i := 0; i < b.N; i++ {
		// Register a pending entry (nil stack/ptr, goroutine id 0) and
		// immediately deregister it, exercising the timer/entry pool round-trip.
		id := dw.register(nil, nil, 0, Opts.DeadlockTimeout)
		dw.deregister(id)
	}
}
Expand Down
82 changes: 62 additions & 20 deletions deadlock.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ func preLock(stack []uintptr, p interface{}) {
lo.preLock(stack, p)
}

func postLock(stack []uintptr, p interface{}) {
lo.postLock(stack, p)
func postLock(stack []uintptr, buf *[stackBufSize]uintptr, p interface{}) {
lo.postLock(stack, buf, p)
}

func postUnlock(p interface{}) {
Expand All @@ -193,19 +193,24 @@ func lock(lockFn func(), ptr interface{}) {
lockFn()
return
}
stack := callers(1)
stack, buf := callers(1)
// Cache timeout before preLock so all Opts reads complete before preLock
// may call OnPotentialDeadlock. If preLock detects a problem (recursive
// lock, order violation) the goroutine may block forever in lockFn below,
// and reading Opts after preLock would race with any later Opts write.
timeout := Opts.DeadlockTimeout
preLock(stack, ptr)
if Opts.DeadlockTimeout <= 0 {
if timeout <= 0 {
lockFn()
} else {
currentID := goid.Get()
e := dw.register(stack, ptr, currentID)
e := dw.register(stack, ptr, currentID, timeout)
lockFn()
dw.deregister(e)
postLock(stack, ptr)
postLock(stack, buf, ptr)
return
}
postLock(stack, ptr)
postLock(stack, buf, ptr)
}

// pendingEntry tracks a goroutine that is waiting to acquire a lock. Entries are
Expand All @@ -217,7 +222,7 @@ func lock(lockFn func(), ptr interface{}) {
// - The done flag synchronizes the callback with deregister: deregister sets done=1
// before calling Stop(), and the callback checks done before acting. Because both
// use atomic operations, the callback is guaranteed to observe done=1 if deregister
// has already run even if the runtime already scheduled the callback.
// has already run, even if the runtime already scheduled the callback.
// - An entry is only returned to the pool when timer.Stop() returns true, meaning
// the timer was successfully cancelled and the callback will never run. This prevents
// a recycled entry from being mutated by an in-flight callback.
Expand All @@ -234,7 +239,7 @@ type pendingEntry struct {

func newPendingEntry() *pendingEntry {
e := &pendingEntry{}
// Capture e by pointer so the closure is stable across pool reuse no new
// Capture e by pointer so the closure is stable across pool reuse, no new
// closure allocation when the entry is recycled.
e.checkFn = func() {
// If the lock was acquired (done=1), the entry may already be back in the
Expand All @@ -257,7 +262,7 @@ type deadlockWatcher struct{}

var dw deadlockWatcher

func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64) *pendingEntry {
func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64, timeout time.Duration) *pendingEntry {
var e *pendingEntry
if shouldDisableTimerPool() {
e = newPendingEntry()
Expand All @@ -272,11 +277,11 @@ func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64)
// First use (freshly allocated entry): create the AfterFunc timer.
// AfterFunc avoids the channel-drain problems of channel-based timers,
// which are especially problematic under testing/synctest.
e.timer = time.AfterFunc(Opts.DeadlockTimeout, e.checkFn)
e.timer = time.AfterFunc(timeout, e.checkFn)
} else {
// Reused from pool: the timer was previously Stop()'d successfully
// (guaranteed by deregister), so Reset is safe here.
e.timer.Reset(Opts.DeadlockTimeout)
e.timer.Reset(timeout)
}
return e
}
Expand Down Expand Up @@ -305,12 +310,12 @@ func onDeadlockTimeout(e *pendingEntry) {
lo.mu.Lock()
holders, ok := lo.cur[e.ptr]
if !ok || len(holders) == 0 {
// Lock appears unheld (transient state holder may have just released).
// Lock appears unheld (transient state, holder may have just released).
// Reschedule if the waiter is still pending. Note: this creates a new timer
// (e.timer is not updated), so if deregister runs later it will Stop() the
// original (already-fired) timer, get false, and skip pooling. The new timer's
// callback will then observe done=1 and no-op. This is safe but means the
// entry won't be recycled acceptable since this is the rare timeout path.
// entry won't be recycled, acceptable since this is the rare timeout path.
lo.mu.Unlock()
if atomic.LoadInt32(&e.done) == 0 {
time.AfterFunc(Opts.DeadlockTimeout, e.checkFn)
Expand Down Expand Up @@ -361,6 +366,7 @@ type lockOrder struct {
// stackGID identifies one current holder of a lock: the call stack captured
// at the acquisition site and the id of the acquiring goroutine.
type stackGID struct {
stack []uintptr // acquisition call stack; presumably backed by buf — TODO confirm at call sites
gid int64 // id of the goroutine that acquired the lock
buf *[stackBufSize]uintptr // pooled backing array; returned via releaseStackBuf in postUnlock
}

type ss struct {
Expand All @@ -377,10 +383,26 @@ func newLockOrder() *lockOrder {
}
}

func (l *lockOrder) postLock(stack []uintptr, p interface{}) {
// holdersPool recycles []stackGID slices used by lockOrder.cur to track which
// goroutines currently hold each lock. Slices are returned to the pool in
// postUnlock when a lock's holder count drops to zero, and reused in postLock
// for the next lock acquisition, avoiding a new slice allocation per mutex.
var holdersPool sync.Pool

// postLock records the current goroutine as a holder of lock p. It tries to
// reuse a pooled []stackGID slice before allocating, and stores the pooled
// stack buffer in the entry so postUnlock can release it later.
func (l *lockOrder) postLock(stack []uintptr, buf *[stackBufSize]uintptr, p interface{}) {
gid := goid.Get()
entry := stackGID{stack, gid, buf}
l.mu.Lock()
l.cur[p] = append(l.cur[p], stackGID{stack, gid})
holders := l.cur[p]
if holders == nil {
if s, ok := holdersPool.Get().([]stackGID); ok {
holders = s[:0]
}
}
l.cur[p] = append(holders, entry)
l.mu.Unlock()
}

Expand Down Expand Up @@ -434,7 +456,9 @@ func (l *lockOrder) preLock(stack []uintptr, p interface{}) {
Opts.mu.Unlock()
Opts.OnPotentialDeadlock()
}
l.order[newBeforeAfter(b, p)] = ss{bs.stack, stack}
// Copy both stacks: they're backed by pooled buffers that will be
// recycled in postUnlock, but l.order entries persist until MaxMapSize.
l.order[newBeforeAfter(b, p)] = ss{copyStack(bs.stack), copyStack(stack)}
if len(l.order) == Opts.MaxMapSize { // Reset the map to keep memory footprint bounded.
l.order = map[beforeAfter]ss{}
}
Expand All @@ -455,15 +479,33 @@ func (l *lockOrder) postUnlock(p interface{}) {
}
}
if idx >= 0 {
removedBuf := holders[idx].buf
holders[idx] = holders[len(holders)-1]
holders[len(holders)-1] = stackGID{} // Zero to avoid retaining stack slice in underlying array.
holders[len(holders)-1] = stackGID{}
holders = holders[:len(holders)-1]
releaseStackBuf(removedBuf)
} else if len(holders) > 0 {
// Cross-goroutine unlock: no matching gid found, remove an arbitrary entry.
holders[len(holders)-1] = stackGID{} // Zero to avoid retaining stack slice in underlying array.
// Cross-goroutine unlock: Go permits one goroutine to Lock and a different
// goroutine to Unlock, so the unlocking gid may not match any holder entry.
// This is a rare edge case in practice, the vast majority of code unlocks
// from the same goroutine that locked. We remove an arbitrary entry to keep
// the holder count consistent with the real lock state (the lock *was*
// released, so one entry must go). The trade-off: for RWMutex with multiple
// concurrent readers we may discard the wrong reader's stack trace, making a
// future deadlock report show a slightly misleading "previous lock site".
// Detection correctness is unaffected.
removedBuf := holders[len(holders)-1].buf
holders[len(holders)-1] = stackGID{}
holders = holders[:len(holders)-1]
releaseStackBuf(removedBuf)
}
if len(holders) == 0 {
// Delete the map key so the mutex pointer is not retained, allowing GC of
// the struct it's embedded in. Recycle the backing slice via pool so the
// next postLock on any mutex can reuse it instead of allocating.
if cap(holders) > 0 {
holdersPool.Put(holders[:0])
}
delete(l.cur, p)
} else {
l.cur[p] = holders
Expand Down
159 changes: 157 additions & 2 deletions deadlock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,12 +212,166 @@ func TestStarvedRLockMultipleReaders(t *testing.T) {
<-ch
}

func TestLockDuplicate(t *testing.T) {
// TestManyReadersFewWriters stresses the RWMutex tracking under high read
// concurrency with infrequent writers. Existing tests use at most ~10
// goroutines with a balanced reader/writer mix; real-world usage often has
// dozens of readers racing against a handful of writers. This exercises:
// - the per-goroutine cur map ref-counting under heavy concurrent RLock/RUnlock,
// where many goroutines simultaneously call postLock and postUnlock;
// - lock-order detection with a large number of concurrent reader entries;
// - timer pool contention when many DeadlockTimeout timers are live at once.
// TestManyReadersFewWriters stresses RWMutex tracking with a heavily
// read-skewed workload: many concurrent readers racing a handful of writers.
// It exercises holder bookkeeping under simultaneous postLock/postUnlock
// calls, lock-order detection with many live reader entries, and timer-pool
// contention while many DeadlockTimeout timers are outstanding at once.
func TestManyReadersFewWriters(t *testing.T) {
	defer restore()()
	Opts.DeadlockTimeout = time.Millisecond * 5000

	const (
		numReaders  = 100
		numWriters  = 3
		readerIters = 50
		writerIters = 10
	)

	var (
		mu RWMutex
		wg sync.WaitGroup
	)

	// Reader body: repeatedly take the read lock and hold it briefly.
	reader := func() {
		defer wg.Done()
		for n := 0; n < readerIters; n++ {
			mu.RLock()
			time.Sleep(time.Duration(rand.Intn(500)) * time.Microsecond)
			mu.RUnlock()
		}
	}
	// Writer body: occasional exclusive acquisitions with idle gaps between.
	writer := func() {
		defer wg.Done()
		for n := 0; n < writerIters; n++ {
			mu.Lock()
			time.Sleep(time.Duration(rand.Intn(200)) * time.Microsecond)
			mu.Unlock()
			time.Sleep(time.Duration(rand.Intn(1000)) * time.Microsecond)
		}
	}

	wg.Add(numReaders + numWriters)
	for r := 0; r < numReaders; r++ {
		go reader()
	}
	for w := 0; w < numWriters; w++ {
		go writer()
	}
	wg.Wait()
}

// TestConcurrentLockOrderDetection verifies that lock-order violation detection
// works correctly under real goroutine contention. TestLockOrder runs its two
// goroutines sequentially (wg.Wait() between them), so the order map and cur map
// are only contested by one goroutine at a time. Here, many goroutines
// simultaneously call preLock, postLock, and postUnlock — all contending on
// lo.mu — while each one independently detects the same A→B vs B→A conflict.
// This stresses concurrent iteration of lo.cur, concurrent reads/writes to
// lo.order, and concurrent invocations of OnPotentialDeadlock.
// TestConcurrentLockOrderDetection checks that lock-order violation detection
// holds up under genuine goroutine contention. First an A→B ordering is
// recorded sequentially; then many goroutines concurrently acquire B→A, each
// independently tripping the same conflict while all of them contend on the
// tracker's internal mutex. This stresses concurrent reads of the current-
// holders map, concurrent order-map updates, and concurrent invocations of
// OnPotentialDeadlock. The workers all lock in the same order (B then A), so
// no real deadlock is possible.
func TestConcurrentLockOrderDetection(t *testing.T) {
	defer restore()()
	Opts.DeadlockTimeout = 0
	var violations uint32
	Opts.OnPotentialDeadlock = func() {
		atomic.AddUint32(&violations, 1)
	}

	var a, b Mutex

	// Seed the order map with the A→B acquisition order.
	a.Lock()
	b.Lock()
	b.Unlock()
	a.Unlock()

	const (
		workers = 20
		rounds  = 10
	)

	gate := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(workers)
	for g := 0; g < workers; g++ {
		go func() {
			defer wg.Done()
			<-gate
			// Each B→A round conflicts with the recorded A→B order and
			// should be reported as a potential deadlock.
			for r := 0; r < rounds; r++ {
				b.Lock()
				a.Lock()
				a.Unlock()
				b.Unlock()
			}
		}()
	}

	close(gate)
	wg.Wait()

	if atomic.LoadUint32(&violations) == 0 {
		t.Fatal("expected at least 1 lock-order violation, detected 0")
	}
}

// TestDeadlockTimeoutTransientNoHolder exercises the reschedule path in
// onDeadlockTimeout. When the deadlock timer fires and lo.cur has no holder
// for the waited-on lock — a transient state that can occur between the
// previous holder's postUnlock and the waiter's deregister — the callback
// reschedules itself via time.AfterFunc instead of reporting a false deadlock.
//
// In production this window is extremely narrow (nanoseconds between
// postUnlock removing the holder from lo.cur and the waiter's lockFn
// returning), so it cannot be triggered reliably through normal Lock/Unlock
// calls. Instead we invoke onDeadlockTimeout directly with a pendingEntry
// whose lock has no holders in lo.cur, deterministically hitting the
// reschedule branch. We then set done=1 (simulating the waiter acquiring the
// lock) and verify the rescheduled timer fires harmlessly with no false
// deadlock report.
// TestDeadlockTimeoutTransientNoHolder exercises the reschedule branch of
// onDeadlockTimeout: when the timer fires while the waited-on lock has no
// recorded holder (a transient state between the previous holder's
// postUnlock and the waiter's deregister), the callback must reschedule via
// time.AfterFunc rather than report a false deadlock.
//
// The real-world window is nanoseconds wide and cannot be hit reliably
// through ordinary Lock/Unlock, so the handler is invoked directly with a
// pendingEntry whose lock has no holders, which deterministically takes the
// reschedule path. Setting done=1 then simulates the waiter acquiring the
// lock, and the test verifies the rescheduled timer fires as a no-op.
func TestDeadlockTimeoutTransientNoHolder(t *testing.T) {
	defer restore()()
	Opts.DisableLockOrderDetection = true
	Opts.DeadlockTimeout = 10 * time.Millisecond
	var reported uint32
	Opts.OnPotentialDeadlock = func() {
		atomic.AddUint32(&reported, 1)
	}

	var mu Mutex

	// Fabricate a pending entry as if goroutine 999 were blocked on mu;
	// mu was never locked, so the holders map has no entry for it.
	entry := newPendingEntry()
	entry.ptr = &mu
	entry.gid = 999
	atomic.StoreInt32(&entry.done, 0)

	// Fire the timeout handler directly: with no holders recorded it must
	// reschedule itself instead of reporting.
	onDeadlockTimeout(entry)

	// Pretend the waiter has now acquired the lock; the rescheduled timer's
	// callback should observe done=1 and do nothing.
	atomic.StoreInt32(&entry.done, 1)

	// Give the rescheduled timer ample time to fire, then confirm silence.
	time.Sleep(Opts.DeadlockTimeout * 3)

	if got := atomic.LoadUint32(&reported); got != 0 {
		t.Fatalf("expected 0 false deadlocks from transient no-holder state, got %d", got)
	}
}

func TestLockDuplicate(t *testing.T) {
// No restore() here: the goroutines below permanently block inside lockFn
// after preLock detects recursion. A deferred restore() would write to Opts
// while those goroutines are still reading Opts.DeadlockTimeout, causing a
// data race under -race. Omitting restore is safe because every other test
// saves/sets its own Opts via restore().
Opts.DeadlockTimeout = 0
var deadlocks uint32
detected := make(chan struct{}, 2)
Opts.OnPotentialDeadlock = func() {
atomic.AddUint32(&deadlocks, 1)
detected <- struct{}{}
}
var a RWMutex
var b Mutex
Expand All @@ -233,7 +387,8 @@ func TestLockDuplicate(t *testing.T) {
b.Unlock()
b.Unlock()
}()
time.Sleep(time.Second * 1)
<-detected
<-detected
if atomic.LoadUint32(&deadlocks) != 2 {
t.Fatalf("expected 2 deadlocks, detected %d", deadlocks)
}
Expand Down
Loading
Loading