Skip to content

Commit ec76c9a

Browse files
menghanleaswars
authored andcommitted
grpclb: fix deadlock in grpclb connection cache (#3017)
Before the fix, if the timer to remove a SubConn fires at the same time NewSubConn cancels the timer, it caused a mutex leak and deadlock.
1 parent fc2c3d6 commit ec76c9a

File tree

2 files changed

+44
-1
lines changed

2 files changed

+44
-1
lines changed

Diff for: balancer/grpclb/grpclb_util.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -173,13 +173,13 @@ func (ccc *lbCacheClientConn) RemoveSubConn(sc balancer.SubConn) {
173173

174174
timer := time.AfterFunc(ccc.timeout, func() {
175175
ccc.mu.Lock()
176+
defer ccc.mu.Unlock()
176177
if entry.abortDeleting {
177178
return
178179
}
179180
ccc.cc.RemoveSubConn(sc)
180181
delete(ccc.subConnToAddr, sc)
181182
delete(ccc.subConnCache, addr)
182-
ccc.mu.Unlock()
183183
})
184184
entry.cancel = func() {
185185
if !timer.Stop() {

Diff for: balancer/grpclb/grpclb_util_test.go

+43
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,46 @@ func TestLBCacheClientConnReuse(t *testing.T) {
217217
t.Fatal(err)
218218
}
219219
}
220+
221+
// Test that if the timer to remove a SubConn fires at the same time NewSubConn
222+
// cancels the timer, it doesn't cause deadlock.
223+
func TestLBCache_RemoveTimer_New_Race(t *testing.T) {
224+
mcc := newMockClientConn()
225+
if err := checkMockCC(mcc, 0); err != nil {
226+
t.Fatal(err)
227+
}
228+
229+
ccc := newLBCacheClientConn(mcc)
230+
ccc.timeout = time.Nanosecond
231+
if err := checkCacheCC(ccc, 0, 0); err != nil {
232+
t.Fatal(err)
233+
}
234+
235+
sc, _ := ccc.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
236+
// One subconn in MockCC.
237+
if err := checkMockCC(mcc, 1); err != nil {
238+
t.Fatal(err)
239+
}
240+
// No subconn being deleted, and one in CacheCC.
241+
if err := checkCacheCC(ccc, 0, 1); err != nil {
242+
t.Fatal(err)
243+
}
244+
245+
done := make(chan struct{})
246+
247+
go func() {
248+
for i := 0; i < 1000; i++ {
249+
// Remove starts a timer with 1 ns timeout, the NewSubConn will race
250+
// with with the timer.
251+
ccc.RemoveSubConn(sc)
252+
sc, _ = ccc.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{})
253+
}
254+
close(done)
255+
}()
256+
257+
select {
258+
case <-time.After(time.Second):
259+
t.Fatalf("Test didn't finish within 1 second. Deadlock")
260+
case <-done:
261+
}
262+
}

0 commit comments

Comments
 (0)