From a6d9e553a7baca278fe656d1c45e042ab39538bf Mon Sep 17 00:00:00 2001 From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com> Date: Thu, 22 May 2025 22:05:07 +0530 Subject: [PATCH] Fix deadlock in semi-sync monitor (#18276) Signed-off-by: Manan Gupta --- .../tabletmanager/semisyncmonitor/monitor.go | 26 +++++++--- .../semisyncmonitor/monitor_test.go | 47 +++++++++++++++++++ 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor.go b/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor.go index 0bc04e1fd39..c394b329484 100644 --- a/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor.go +++ b/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor.go @@ -66,6 +66,10 @@ type Monitor struct { // the semisync_heartbeat table. clearTicks *timer.Timer + // timerMu protects operations on the timers to prevent deadlocks + // This must be acquired before mu if both are needed. + timerMu sync.Mutex + // mu protects the fields below. mu sync.Mutex appPool *dbconnpool.ConnectionPool @@ -123,8 +127,13 @@ func CreateTestSemiSyncMonitor(db *fakesqldb.DB, exporter *servenv.Exporter) *Mo // Open starts the monitor. func (m *Monitor) Open() { + // First acquire the timer mutex to prevent deadlock - we acquire in a consistent order + m.timerMu.Lock() + defer m.timerMu.Unlock() + m.mu.Lock() defer m.mu.Unlock() + // The check for config being nil is only requried for tests. if m.isOpen || m.config == nil || m.config.DB == nil { // If we are already open, then there is nothing to do @@ -145,16 +154,19 @@ func (m *Monitor) Open() { // Close stops the monitor. func (m *Monitor) Close() { - m.mu.Lock() - defer m.mu.Unlock() - if !m.isOpen { - // If we are already closed, then there is nothing to do - return - } - m.isOpen = false + // First acquire the timer mutex to prevent deadlock - we acquire in a consistent order + m.timerMu.Lock() + defer m.timerMu.Unlock() log.Info("SemiSync Monitor: closing") + // We close the ticks before we acquire the mu mutex to prevent deadlock. + // The timer Close should not be called while holding a mutex that the function + // the timer runs also acquires. m.clearTicks.Stop() m.ticks.Stop() + // Acquire the mu mutex to update the isOpen field. + m.mu.Lock() + defer m.mu.Unlock() + m.isOpen = false m.appPool.Close() } diff --git a/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor_test.go b/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor_test.go index 83bf2c2259c..9586514914b 100644 --- a/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor_test.go +++ b/go/vt/vttablet/tabletmanager/semisyncmonitor/monitor_test.go @@ -619,6 +619,53 @@ func TestWaitUntilSemiSyncUnblocked(t *testing.T) { require.True(t, m.isClosed()) } +// TestDeadlockOnClose tests the deadlock that can occur when calling Close(). +// Look at https://github.com/vitessio/vitess/issues/18275 for more details. +func TestDeadlockOnClose(t *testing.T) { + db := fakesqldb.New(t) + defer db.Close() + params := db.ConnParams() + cp := *params + dbc := dbconfigs.NewTestDBConfigs(cp, cp, "") + config := &tabletenv.TabletConfig{ + DB: dbc, + SemiSyncMonitor: tabletenv.SemiSyncMonitorConfig{ + // Extremely low interval to trigger the deadlock quickly. + // This makes the monitor try and write that it is still blocked quite aggressively. + Interval: 10 * time.Millisecond, + }, + } + m := NewMonitor(config, exporter) + + // Set up for semisync to be blocked + db.SetNeverFail(true) + db.AddQuery(semiSyncWaitSessionsRead, sqltypes.MakeTestResult(sqltypes.MakeTestFields("variable_value", "varchar"), "1")) + + // Open the monitor + m.Open() + defer m.Close() + + // We will now try to close and open the monitor multiple times to see if we can trigger a deadlock. + finishCh := make(chan int) + go func() { + count := 100 + for i := 0; i < count; i++ { + m.Close() + m.Open() + time.Sleep(20 * time.Millisecond) + } + close(finishCh) + }() + + select { + case <-finishCh: + // The test finished without deadlocking. + case <-time.After(5 * time.Second): + // The test timed out, which means we deadlocked. + t.Fatalf("Deadlock occurred while closing the monitor") + } +} + // TestSemiSyncMonitor tests the semi-sync monitor as a black box. // It only calls the exported methods to see they work as intended. func TestSemiSyncMonitor(t *testing.T) {