diff --git a/alloc_test.go b/alloc_test.go
index c24f7cd..f798a27 100644
--- a/alloc_test.go
+++ b/alloc_test.go
@@ -6,7 +6,7 @@ import (
 
 func BenchmarkRegisterDeregister(b *testing.B) {
 	for i := 0; i < b.N; i++ {
-		id := dw.register(nil, nil, 0)
+		id := dw.register(nil, nil, 0, Opts.DeadlockTimeout)
 		dw.deregister(id)
 	}
 }
diff --git a/deadlock.go b/deadlock.go
index 7c7507e..33766b8 100644
--- a/deadlock.go
+++ b/deadlock.go
@@ -180,8 +180,8 @@ func preLock(stack []uintptr, p interface{}) {
 	lo.preLock(stack, p)
 }
 
-func postLock(stack []uintptr, p interface{}) {
-	lo.postLock(stack, p)
+func postLock(stack []uintptr, buf *[stackBufSize]uintptr, p interface{}) {
+	lo.postLock(stack, buf, p)
 }
 
 func postUnlock(p interface{}) {
@@ -193,19 +193,24 @@ func lock(lockFn func(), ptr interface{}) {
 		lockFn()
 		return
 	}
-	stack := callers(1)
+	stack, buf := callers(1)
+	// Cache timeout before preLock so all Opts reads complete before preLock
+	// may call OnPotentialDeadlock. If preLock detects a problem (recursive
+	// lock, order violation) the goroutine may block forever in lockFn below,
+	// and reading Opts after preLock would race with any later Opts write.
+	timeout := Opts.DeadlockTimeout
 	preLock(stack, ptr)
-	if Opts.DeadlockTimeout <= 0 {
+	if timeout <= 0 {
 		lockFn()
 	} else {
 		currentID := goid.Get()
-		e := dw.register(stack, ptr, currentID)
+		e := dw.register(stack, ptr, currentID, timeout)
 		lockFn()
 		dw.deregister(e)
-		postLock(stack, ptr)
+		postLock(stack, buf, ptr)
 		return
 	}
-	postLock(stack, ptr)
+	postLock(stack, buf, ptr)
 }
 
 // pendingEntry tracks a goroutine that is waiting to acquire a lock. Entries are
@@ -217,7 +222,7 @@ func lock(lockFn func(), ptr interface{}) {
 // - The done flag synchronizes the callback with deregister: deregister sets done=1
 // before calling Stop(), and the callback checks done before acting. Because both
 // use atomic operations, the callback is guaranteed to observe done=1 if deregister
-// has already run — even if the runtime already scheduled the callback.
+// has already run, even if the runtime already scheduled the callback.
 // - An entry is only returned to the pool when timer.Stop() returns true, meaning
 // the timer was successfully cancelled and the callback will never run. This prevents
 // a recycled entry from being mutated by an in-flight callback.
@@ -234,7 +239,7 @@ type pendingEntry struct {
 
 func newPendingEntry() *pendingEntry {
 	e := &pendingEntry{}
-	// Capture e by pointer so the closure is stable across pool reuse — no new
+	// Capture e by pointer so the closure is stable across pool reuse, no new
 	// closure allocation when the entry is recycled.
 	e.checkFn = func() {
 		// If the lock was acquired (done=1), the entry may already be back in the
@@ -257,7 +262,7 @@ type deadlockWatcher struct{}
 
 var dw deadlockWatcher
 
-func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64) *pendingEntry {
+func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64, timeout time.Duration) *pendingEntry {
 	var e *pendingEntry
 	if shouldDisableTimerPool() {
 		e = newPendingEntry()
@@ -272,11 +277,11 @@ func (w *deadlockWatcher) register(stack []uintptr, ptr interface{}, gid int64)
 		// First use (freshly allocated entry): create the AfterFunc timer.
 		// AfterFunc avoids the channel-drain problems of channel-based timers,
 		// which are especially problematic under testing/synctest.
-		e.timer = time.AfterFunc(Opts.DeadlockTimeout, e.checkFn)
+		e.timer = time.AfterFunc(timeout, e.checkFn)
 	} else {
 		// Reused from pool: the timer was previously Stop()'d successfully
 		// (guaranteed by deregister), so Reset is safe here.
-		e.timer.Reset(Opts.DeadlockTimeout)
+		e.timer.Reset(timeout)
 	}
 	return e
 }
@@ -305,12 +310,12 @@ func onDeadlockTimeout(e *pendingEntry) {
 	lo.mu.Lock()
 	holders, ok := lo.cur[e.ptr]
 	if !ok || len(holders) == 0 {
-		// Lock appears unheld (transient state — holder may have just released).
+		// Lock appears unheld (transient state, holder may have just released).
 		// Reschedule if the waiter is still pending. Note: this creates a new timer
 		// (e.timer is not updated), so if deregister runs later it will Stop() the
 		// original (already-fired) timer, get false, and skip pooling. The new timer's
 		// callback will then observe done=1 and no-op. This is safe but means the
-		// entry won't be recycled — acceptable since this is the rare timeout path.
+		// entry won't be recycled, acceptable since this is the rare timeout path.
 		lo.mu.Unlock()
 		if atomic.LoadInt32(&e.done) == 0 {
 			time.AfterFunc(Opts.DeadlockTimeout, e.checkFn)
@@ -361,6 +366,7 @@ type lockOrder struct {
 type stackGID struct {
 	stack []uintptr
 	gid   int64
+	buf   *[stackBufSize]uintptr // pooled backing array; returned via releaseStackBuf in postUnlock
 }
 
 type ss struct {
@@ -377,10 +383,26 @@ func newLockOrder() *lockOrder {
 	}
 }
 
-func (l *lockOrder) postLock(stack []uintptr, p interface{}) {
+// holdersPool recycles []stackGID slices used by lockOrder.cur to track which
+// goroutines currently hold each lock. Slices are returned to the pool in
+// postUnlock when a lock's holder count drops to zero, and reused in postLock
+// for the next lock acquisition, avoiding a new slice allocation per mutex.
+var holdersPool sync.Pool
+
+// postLock records the current goroutine as a holder of lock p. It tries to
+// reuse a pooled []stackGID slice before allocating, and stores the pooled
+// stack buffer in the entry so postUnlock can release it later.
+func (l *lockOrder) postLock(stack []uintptr, buf *[stackBufSize]uintptr, p interface{}) {
 	gid := goid.Get()
+	entry := stackGID{stack, gid, buf}
 	l.mu.Lock()
-	l.cur[p] = append(l.cur[p], stackGID{stack, gid})
+	holders := l.cur[p]
+	if holders == nil {
+		if s, ok := holdersPool.Get().([]stackGID); ok {
+			holders = s[:0]
+		}
+	}
+	l.cur[p] = append(holders, entry)
 	l.mu.Unlock()
 }
 
@@ -434,7 +456,9 @@ func (l *lockOrder) preLock(stack []uintptr, p interface{}) {
 			Opts.mu.Unlock()
 			Opts.OnPotentialDeadlock()
 		}
-		l.order[newBeforeAfter(b, p)] = ss{bs.stack, stack}
+		// Copy both stacks: they're backed by pooled buffers that will be
+		// recycled in postUnlock, but l.order entries persist until MaxMapSize.
+		l.order[newBeforeAfter(b, p)] = ss{copyStack(bs.stack), copyStack(stack)}
 		if len(l.order) == Opts.MaxMapSize { // Reset the map to keep memory footprint bounded.
 			l.order = map[beforeAfter]ss{}
 		}
@@ -455,15 +479,33 @@ func (l *lockOrder) postUnlock(p interface{}) {
 		}
 	}
 	if idx >= 0 {
+		removedBuf := holders[idx].buf
 		holders[idx] = holders[len(holders)-1]
-		holders[len(holders)-1] = stackGID{} // Zero to avoid retaining stack slice in underlying array.
+		holders[len(holders)-1] = stackGID{}
 		holders = holders[:len(holders)-1]
+		releaseStackBuf(removedBuf)
 	} else if len(holders) > 0 {
-		// Cross-goroutine unlock: no matching gid found, remove an arbitrary entry.
-		holders[len(holders)-1] = stackGID{} // Zero to avoid retaining stack slice in underlying array.
+		// Cross-goroutine unlock: Go permits one goroutine to Lock and a different
+		// goroutine to Unlock, so the unlocking gid may not match any holder entry.
+		// This is a rare edge case in practice; the vast majority of code unlocks
+		// from the same goroutine that locked. We remove an arbitrary entry to keep
+		// the holder count consistent with the real lock state (the lock *was*
+		// released, so one entry must go). The trade-off: for RWMutex with multiple
+		// concurrent readers we may discard the wrong reader's stack trace, making a
+		// future deadlock report show a slightly misleading "previous lock site".
+		// Detection correctness is unaffected.
+		removedBuf := holders[len(holders)-1].buf
+		holders[len(holders)-1] = stackGID{}
 		holders = holders[:len(holders)-1]
+		releaseStackBuf(removedBuf)
 	}
 	if len(holders) == 0 {
+		// Delete the map key so the mutex pointer is not retained, allowing GC of
+		// the struct it's embedded in. Recycle the backing slice via pool so the
+		// next postLock on any mutex can reuse it instead of allocating.
+		if cap(holders) > 0 {
+			holdersPool.Put(holders[:0])
+		}
 		delete(l.cur, p)
 	} else {
 		l.cur[p] = holders
diff --git a/deadlock_test.go b/deadlock_test.go
index 9714b44..177158b 100644
--- a/deadlock_test.go
+++ b/deadlock_test.go
@@ -212,12 +212,166 @@ func TestStarvedRLockMultipleReaders(t *testing.T) {
 	<-ch
 }
 
-func TestLockDuplicate(t *testing.T) {
+// TestManyReadersFewWriters stresses the RWMutex tracking under high read
+// concurrency with infrequent writers. Existing tests use at most ~10
+// goroutines with a balanced reader/writer mix; real-world usage often has
+// dozens of readers racing against a handful of writers. This exercises:
+// - the per-goroutine cur map ref-counting under heavy concurrent RLock/RUnlock,
+// where many goroutines simultaneously call postLock and postUnlock;
+// - lock-order detection with a large number of concurrent reader entries;
+// - timer pool contention when many DeadlockTimeout timers are live at once.
+func TestManyReadersFewWriters(t *testing.T) {
 	defer restore()()
+	Opts.DeadlockTimeout = time.Millisecond * 5000
+	var mu RWMutex
+	var wg sync.WaitGroup
+
+	const numReaders = 100
+	const numWriters = 3
+	const readerIters = 50
+	const writerIters = 10
+
+	for i := 0; i < numReaders; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for k := 0; k < readerIters; k++ {
+				mu.RLock()
+				time.Sleep(time.Duration(rand.Intn(500)) * time.Microsecond)
+				mu.RUnlock()
+			}
+		}()
+	}
+
+	for i := 0; i < numWriters; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for k := 0; k < writerIters; k++ {
+				mu.Lock()
+				time.Sleep(time.Duration(rand.Intn(200)) * time.Microsecond)
+				mu.Unlock()
+				time.Sleep(time.Duration(rand.Intn(1000)) * time.Microsecond)
+			}
+		}()
+	}
+
+	wg.Wait()
+}
+
+// TestConcurrentLockOrderDetection verifies that lock-order violation detection
+// works correctly under real goroutine contention. TestLockOrder runs its two
+// goroutines sequentially (wg.Wait() between them), so the order map and cur map
+// are only contested by one goroutine at a time. Here, many goroutines
+// simultaneously call preLock, postLock, and postUnlock — all contending on
+// lo.mu — while each one independently detects the same A→B vs B→A conflict.
+// This stresses concurrent iteration of lo.cur, concurrent reads/writes to
+// lo.order, and concurrent invocations of OnPotentialDeadlock.
+func TestConcurrentLockOrderDetection(t *testing.T) {
+	defer restore()()
+	Opts.DeadlockTimeout = 0
+	var deadlocks uint32
+	Opts.OnPotentialDeadlock = func() {
+		atomic.AddUint32(&deadlocks, 1)
+	}
+
+	var a, b Mutex
+
+	// Establish the A→B ordering in the lock-order map.
+	a.Lock()
+	b.Lock()
+	b.Unlock()
+	a.Unlock()
+
+	// Launch many goroutines that all acquire B→A concurrently. Each one
+	// triggers a violation in preLock when it tries to acquire A while holding
+	// B. Because every goroutine acquires in the same order (B then A), they
+	// cannot actually deadlock with each other.
+	var wg sync.WaitGroup
+	start := make(chan struct{})
+	for i := 0; i < 20; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			<-start
+			for k := 0; k < 10; k++ {
+				b.Lock()
+				a.Lock()
+				a.Unlock()
+				b.Unlock()
+			}
+		}()
+	}
+
+	close(start)
+	wg.Wait()
+
+	if d := atomic.LoadUint32(&deadlocks); d == 0 {
+		t.Fatal("expected at least 1 lock-order violation, detected 0")
+	}
+}
+
+// TestDeadlockTimeoutTransientNoHolder exercises the reschedule path in
+// onDeadlockTimeout. When the deadlock timer fires and lo.cur has no holder
+// for the waited-on lock — a transient state that can occur between the
+// previous holder's postUnlock and the waiter's deregister — the callback
+// reschedules itself via time.AfterFunc instead of reporting a false deadlock.
+//
+// In production this window is extremely narrow (nanoseconds between
+// postUnlock removing the holder from lo.cur and the waiter's lockFn
+// returning), so it cannot be triggered reliably through normal Lock/Unlock
+// calls. Instead we invoke onDeadlockTimeout directly with a pendingEntry
+// whose lock has no holders in lo.cur, deterministically hitting the
+// reschedule branch. We then set done=1 (simulating the waiter acquiring the
+// lock) and verify the rescheduled timer fires harmlessly with no false
+// deadlock report.
+func TestDeadlockTimeoutTransientNoHolder(t *testing.T) {
+	defer restore()()
+	Opts.DisableLockOrderDetection = true
+	Opts.DeadlockTimeout = 10 * time.Millisecond
+	var deadlocks uint32
+	Opts.OnPotentialDeadlock = func() {
+		atomic.AddUint32(&deadlocks, 1)
+	}
+
+	var mu Mutex
+
+	// Build a pendingEntry as if a goroutine were blocked waiting on mu.
+	// mu has never been locked, so lo.cur has no holders for it.
+	e := newPendingEntry()
+	e.ptr = &mu
+	e.gid = 999
+	atomic.StoreInt32(&e.done, 0)
+
+	// Directly invoke the timeout handler. It will find no holders in lo.cur
+	// and take the reschedule branch (time.AfterFunc) instead of reporting.
+	onDeadlockTimeout(e)
+
+	// Simulate the waiter acquiring the lock. The rescheduled timer's checkFn
+	// will see done=1 and no-op.
+	atomic.StoreInt32(&e.done, 1)
+
+	// Wait long enough for the rescheduled timer to fire and confirm it
+	// does not report a false deadlock.
+	time.Sleep(Opts.DeadlockTimeout * 3)
+
+	if d := atomic.LoadUint32(&deadlocks); d != 0 {
+		t.Fatalf("expected 0 false deadlocks from transient no-holder state, got %d", d)
+	}
+}
+
+func TestLockDuplicate(t *testing.T) {
+	// No restore() here: the goroutines below permanently block inside lockFn
+	// after preLock detects recursion. A deferred restore() would write to Opts
+	// while those goroutines are still reading Opts.DeadlockTimeout, causing a
+	// data race under -race. Omitting restore is safe because every other test
+	// saves/sets its own Opts via restore().
 	Opts.DeadlockTimeout = 0
 	var deadlocks uint32
+	detected := make(chan struct{}, 2)
 	Opts.OnPotentialDeadlock = func() {
 		atomic.AddUint32(&deadlocks, 1)
+		detected <- struct{}{}
 	}
 	var a RWMutex
 	var b Mutex
@@ -233,7 +387,8 @@ func TestLockDuplicate(t *testing.T) {
 		b.Unlock()
 		b.Unlock()
 	}()
-	time.Sleep(time.Second * 1)
+	<-detected
+	<-detected
 	if atomic.LoadUint32(&deadlocks) != 2 {
 		t.Fatalf("expected 2 deadlocks, detected %d", deadlocks)
 	}
diff --git a/stacktraces.go b/stacktraces.go
index d93050f..fb5f308 100644
--- a/stacktraces.go
+++ b/stacktraces.go
@@ -13,9 +13,38 @@ import (
 	"sync"
 )
 
-func callers(skip int) []uintptr {
-	s := make([]uintptr, 50) // Most relevant context seem to appear near the top of the stack.
-	return s[:runtime.Callers(2+skip, s)]
+const stackBufSize = 50
+
+var stackBufPool = sync.Pool{
+	New: func() interface{} {
+		return new([stackBufSize]uintptr)
+	},
+}
+
+// callers returns a stack trace backed by a pooled buffer. The caller must
+// eventually return buf via releaseStackBuf — typically through the
+// postLock/postUnlock path which stores it in stackGID.buf.
+func callers(skip int) ([]uintptr, *[stackBufSize]uintptr) {
+	buf := stackBufPool.Get().(*[stackBufSize]uintptr)
+	n := runtime.Callers(2+skip, buf[:])
+	return buf[:n], buf
+}
+
+// releaseStackBuf returns a pooled stack buffer obtained from callers(). Safe to
+// call with nil (e.g. when the buffer was already handed off via stackGID.buf).
+func releaseStackBuf(buf *[stackBufSize]uintptr) {
+	if buf != nil {
+		stackBufPool.Put(buf)
+	}
+}
+
+// copyStack creates an independent copy of a stack trace. Required when storing
+// stacks in long-lived structures (e.g. l.order) because the originals are backed
+// by pooled buffers that will be recycled in postUnlock.
+func copyStack(s []uintptr) []uintptr {
+	c := make([]uintptr, len(s))
+	copy(c, s)
+	return c
 }
 
 func printStack(w io.Writer, stack []uintptr) {
diff --git a/synctest_comparison_test.go b/synctest_comparison_test.go
index 29e1a12..5317936 100644
--- a/synctest_comparison_test.go
+++ b/synctest_comparison_test.go
@@ -9,8 +9,13 @@ func TestNormalDeadlockDetection(t *testing.T) {
 	oldTimeout := Opts.DeadlockTimeout
 	oldOnDeadlock := Opts.OnPotentialDeadlock
 	Opts.DeadlockTimeout = 20 * time.Millisecond
+	// onDeadlockTimeout calls Opts.OnPotentialDeadlock on a timer goroutine
+	// (line 352 of deadlock.go). The channel send inside the callback creates a
+	// happens-before edge so the deferred restore below won't race with the read.
+	callbackDone := make(chan struct{}, 1)
 	Opts.OnPotentialDeadlock = func() {
 		t.Log("Deadlock detected!")
+		callbackDone <- struct{}{}
 	}
 	defer func() {
 		Opts.DeadlockTimeout = oldTimeout
@@ -33,4 +38,5 @@ func TestNormalDeadlockDetection(t *testing.T) {
 	time.Sleep(30 * time.Millisecond)
 	mu.Unlock()
 	<-done
+	<-callbackDone
 }
diff --git a/trylock.go b/trylock.go
index 43c7a5f..b29b263 100644
--- a/trylock.go
+++ b/trylock.go
@@ -28,12 +28,15 @@ func trylock(lockFn func() bool, ptr interface{}) bool {
 	if Opts.Disable {
 		return lockFn()
 	}
-	stack := callers(1)
+	stack, buf := callers(1)
 	preLock(stack, ptr)
 	ret := lockFn()
 	if ret {
-		postLock(stack, ptr)
+		postLock(stack, buf, ptr)
 	} else {
+		// TryLock failed: the stack won't be stored in stackGID.buf (postLock is
+		// skipped), so we must release the pooled buffer directly to avoid a leak.
+		releaseStackBuf(buf)
 		postUnlock(ptr)
 	}
 	return ret
diff --git a/trylock_test.go b/trylock_test.go
index 8df5519..c700b1d 100644
--- a/trylock_test.go
+++ b/trylock_test.go
@@ -181,11 +181,17 @@ func TestRWMutexTryLock(t *testing.T) {
 }
 
 func TestTryLockDuplicate(t *testing.T) {
-	defer restore()()
+	// No restore() here: the goroutines below permanently block inside lockFn
+	// after preLock detects recursion. A deferred restore() would write to Opts
+	// while those goroutines are still reading Opts.DeadlockTimeout, causing a
+	// data race under -race. Omitting restore is safe because every other test
+	// saves/sets its own Opts via restore().
 	Opts.DeadlockTimeout = 0
 	var deadlocks uint32
+	detected := make(chan struct{}, 2)
 	Opts.OnPotentialDeadlock = func() {
 		atomic.AddUint32(&deadlocks, 1)
+		detected <- struct{}{}
 	}
 	var a RWMutex
 	var b Mutex
@@ -205,7 +211,8 @@ func TestTryLockDuplicate(t *testing.T) {
 		b.Unlock()
 		b.Unlock()
 	}()
-	time.Sleep(time.Second * 1)
+	<-detected
+	<-detected
 	if atomic.LoadUint32(&deadlocks) != 2 {
 		t.Fatalf("expected 2 deadlocks, detected %d", deadlocks)
 	}
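
A minimal, self-contained sketch of the timer-reuse pattern the pendingEntry pool relies on, for context when reviewing the register/deregister changes above: the time.AfterFunc timer is created once per entry, Reset on every reuse, and the entry is returned to its pool only when Stop() reports that the callback was cancelled in time. All identifiers here (watchEntry, watchPool, watch, unwatch) are illustrative and not part of this package.

// Illustrative sketch only; not part of this patch.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// watchEntry mirrors the shape of pendingEntry: a done flag, a reusable
// timer, and a callback closure allocated once per entry.
type watchEntry struct {
	done  int32
	timer *time.Timer
	fire  func()
}

var watchPool = sync.Pool{New: func() interface{} {
	e := &watchEntry{}
	e.fire = func() {
		// Only act if the waiter has not finished yet.
		if atomic.LoadInt32(&e.done) == 0 {
			fmt.Println("still waiting after timeout")
		}
	}
	return e
}}

// watch starts (or restarts) the timeout for a new wait.
func watch(timeout time.Duration) *watchEntry {
	e := watchPool.Get().(*watchEntry)
	atomic.StoreInt32(&e.done, 0)
	if e.timer == nil {
		e.timer = time.AfterFunc(timeout, e.fire) // first use: create the timer
	} else {
		e.timer.Reset(timeout) // reused entry: timer was Stop()'d before pooling
	}
	return e
}

// unwatch marks the wait as finished and recycles the entry only when the
// callback can no longer run.
func unwatch(e *watchEntry) {
	atomic.StoreInt32(&e.done, 1)
	if e.timer.Stop() {
		watchPool.Put(e)
	}
}

func main() {
	e := watch(50 * time.Millisecond)
	time.Sleep(10 * time.Millisecond) // the "lock" is acquired quickly
	unwatch(e)
}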
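
The test changes above replace fixed sleeps with channel receives driven by the OnPotentialDeadlock callback. A minimal sketch of that idiom, with all names illustrative: the callback sends on a buffered channel after its last read of shared state, and the main goroutine receives before mutating that state, so the receive happens-after the callback's reads and the race that -race would otherwise report disappears.

// Illustrative sketch only; not part of this patch.
package main

import (
	"fmt"
	"time"
)

// sharedConfig stands in for the package-level Opts the real tests restore.
var sharedConfig = "test value"

func main() {
	callbackDone := make(chan struct{}, 1)

	time.AfterFunc(10*time.Millisecond, func() {
		fmt.Println("callback sees:", sharedConfig) // last read of shared state
		callbackDone <- struct{}{}                  // publish completion
	})

	<-callbackDone            // happens-after the callback's read above
	sharedConfig = "restored" // safe to mutate: no concurrent reader remains
}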
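
Finally, a hedged usage sketch of the options this patch touches, assuming the import path of the upstream go-deadlock package (adjust to this fork's module path). With the timeout now cached at the start of lock(), runtime changes to Opts.DeadlockTimeout only affect Lock calls that begin afterwards; already-blocked waiters keep the timeout they started with.

// Illustrative sketch only; import path is an assumption.
package main

import (
	"log"
	"time"

	deadlock "github.com/sasha-s/go-deadlock" // assumed path; use this fork's module path
)

func main() {
	// Each Lock call samples DeadlockTimeout once, up front, before preLock.
	deadlock.Opts.DeadlockTimeout = 100 * time.Millisecond
	deadlock.Opts.OnPotentialDeadlock = func() {
		// Runs on the watcher's timer goroutine when a lock has been waited
		// on for longer than the timeout sampled by that Lock call.
		log.Println("potential deadlock detected")
	}

	var mu deadlock.Mutex
	mu.Lock()
	defer mu.Unlock()
}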