Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1fcd08b
WIP
timvaillancourt Feb 19, 2026
24e9fa0
more WIP
timvaillancourt Feb 19, 2026
30b8f74
fix fd exhausted test
timvaillancourt Feb 19, 2026
26ea98c
tweaks
timvaillancourt Feb 19, 2026
2f273b8
update signature
timvaillancourt Feb 19, 2026
445a396
several tweaks
timvaillancourt Mar 4, 2026
03acfea
Merge remote-tracking branch 'origin/main' into fix-serveNonPrimary-n…
timvaillancourt Mar 4, 2026
12f6784
rename
timvaillancourt Mar 4, 2026
77693b1
remove unneeded tests
timvaillancourt Mar 4, 2026
63f677d
simplify, improve comments
timvaillancourt Mar 4, 2026
f87b090
reorder, improve comments
timvaillancourt Mar 4, 2026
257b0dc
more comment tweaks
timvaillancourt Mar 4, 2026
baf207d
just return bool
timvaillancourt Mar 4, 2026
b054271
missing test update
timvaillancourt Mar 5, 2026
2b2a661
Merge remote-tracking branch 'origin/main' into fix-serveNonPrimary-n…
timvaillancourt Mar 11, 2026
ef4aec1
fix gofumpt formatting in mysqld_test.go
timvaillancourt Mar 11, 2026
4216072
revert context-aware Close() changes
timvaillancourt Mar 11, 2026
a826a86
Shutdown: stop replication before shutting down mysqld
timvaillancourt Mar 11, 2026
97056eb
add test for STOP REPLICA in Shutdown
timvaillancourt Mar 11, 2026
db16064
address copilot review feedback
timvaillancourt Mar 11, 2026
ad5900b
address copilot review feedback (round 2)
timvaillancourt Mar 11, 2026
28d06cd
use sync.Once for fakesqldb close in TestMysqldIsLocalMySQLDown
timvaillancourt Mar 11, 2026
536c5fb
add changelog entry for STOP REPLICA before MySQL shutdown
timvaillancourt Mar 11, 2026
ff8eb2d
fix comment: returning nil allows ChangeTabletType to succeed
timvaillancourt Mar 11, 2026
52a789b
fix SetReplicationSource to update topo when mysqld is down
timvaillancourt Mar 12, 2026
d3d1742
address copilot review: remove unused mysqlDaemon, avoid duplicate probe
timvaillancourt Mar 12, 2026
9a4aac2
address Copilot review feedback
timvaillancourt Mar 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions changelog/24.0/24.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
- [QueryThrottler Event-Driven Configuration Updates](#vttablet-querythrottler-config-watch)
- [New `in_order_completion_pending_count` field in OnlineDDL outputs](#vttablet-onlineddl-in-order-completion-count)
- [Tablet Shutdown Tracking and Connection Validation](#vttablet-tablet-shutdown-validation)
- [`STOP REPLICA` before MySQL shutdown](#vttablet-stop-replica-before-shutdown)
- **[VTOrc](#minor-changes-vtorc)**
- [New `--cell` Flag](#vtorc-cell-flag)
- [Improved VTOrc Discovery Logging](#vtorc-improved-discovery-logging)
Expand Down Expand Up @@ -215,6 +216,12 @@ Vitess now tracks when tablets cleanly shut down and validates tablet records be

**Note**: This is a best-effort mechanism. Tablets that are killed or crash may not have the opportunity to set this field, in which case components will continue to attempt connections as they did in v23 and earlier.

#### <a id="vttablet-stop-replica-before-shutdown"/>`STOP REPLICA` before MySQL shutdown</a>

`Mysqld.Shutdown()` now issues a best-effort `STOP REPLICA` (with a 3-second timeout) before shutting down MySQL. This addresses a brief race in MySQL's [`close_connections()`](https://github.com/mysql/mysql-server/blob/mysql-8.4.0/sql/mysqld.cc#L2368-L2391) where `close_listener()` removes the unix socket before `end_slave()` stops replication threads. Without this, there is a small window where the socket is gone but replication is still running.

See [#19624](https://github.com/vitessio/vitess/pull/19624) and [#19625](https://github.com/vitessio/vitess/issues/19625) for details.

### <a id="minor-changes-vtorc"/>VTOrc</a>

#### <a id="vtorc-cell-flag"/>New `--cell` Flag</a>
Expand Down
78 changes: 78 additions & 0 deletions go/test/endtoend/mysqlctl/mysqlctl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,21 @@ import (
"fmt"
"os"
"os/exec"
"path"
"strconv"
"strings"
"syscall"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"vitess.io/vitess/go/constants/sidecar"
"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/vt/dbconfigs"
"vitess.io/vitess/go/vt/mysqlctl"
)

var (
Expand Down Expand Up @@ -155,3 +164,72 @@ func TestAutoDetect(t *testing.T) {
err = clusterInstance.VtctldClientProcess.InitializeShard(keyspaceName, shardName, cell, primaryTablet.TabletUID)
require.NoError(t, err)
}

// TestIsLocalMySQLDown exercises Mysqld.IsLocalMySQLDown against the replica
// tablet's real MySQL instance: while it is running, after it has been killed
// with SIGKILL, and while this test process has no free file descriptors.
func TestIsLocalMySQLDown(t *testing.T) {
	// The replica tablet keeps its socket and pid files under VTDATAROOT.
	replicaDir := path.Join(os.Getenv("VTDATAROOT"), fmt.Sprintf("vt_%010d", replicaTablet.TabletUID))
	sockPath := path.Join(replicaDir, "mysql.sock")
	pidPath := path.Join(replicaDir, "mysql.pid")

	rootParams := mysql.ConnParams{
		Uname:      "root",
		UnixSocket: sockPath,
	}
	daemon := mysqlctl.NewMysqld(dbconfigs.NewTestDBConfigs(rootParams, rootParams, ""))
	defer daemon.Close()

	t.Run("mysql is alive", func(t *testing.T) {
		assert.False(t, daemon.IsLocalMySQLDown(t.Context()))
	})

	t.Run("mysql killed with SIGKILL", func(t *testing.T) {
		// Bring MySQL back once this subtest finishes so later tests still
		// have a running instance.
		t.Cleanup(func() {
			require.NoError(t, replicaTablet.MysqlctlProcess.StartProvideInit(false))
		})

		rawPID, err := os.ReadFile(pidPath)
		require.NoError(t, err)

		mysqlPID, err := strconv.Atoi(strings.TrimSpace(string(rawPID)))
		require.NoError(t, err)

		require.NoError(t, syscall.Kill(mysqlPID, syscall.SIGKILL))

		// Poll IsLocalMySQLDown instead of watching for the socket file to
		// vanish: SIGKILL bypasses cleanup, so the socket file may be left
		// behind on disk.
		reportedDown := func() bool {
			return daemon.IsLocalMySQLDown(t.Context())
		}
		require.Eventually(t, reportedDown, 30*time.Second, 100*time.Millisecond, "MySQL was not reported down after SIGKILL")
	})

	t.Run("fd exhaustion", func(t *testing.T) {
		// Shrink the soft fd limit so that exhausting descriptors only takes
		// a handful of Dup calls rather than thousands.
		var savedLimit syscall.Rlimit
		require.NoError(t, syscall.Getrlimit(syscall.RLIMIT_NOFILE, &savedLimit))

		lowered := syscall.Rlimit{Cur: 32, Max: savedLimit.Max}
		require.NoError(t, syscall.Setrlimit(syscall.RLIMIT_NOFILE, &lowered))
		t.Cleanup(func() {
			require.NoError(t, syscall.Setrlimit(syscall.RLIMIT_NOFILE, &savedLimit))
		})

		// Duplicate stdin until Dup fails, holding on to every descriptor.
		// This cleanup is registered after the limit-restoring one, so the
		// duplicated descriptors are closed before the limit is restored.
		var heldFDs []int
		t.Cleanup(func() {
			for _, held := range heldFDs {
				syscall.Close(held)
			}
		})
		for dupFD, dupErr := syscall.Dup(0); dupErr == nil; dupFD, dupErr = syscall.Dup(0) {
			heldFDs = append(heldFDs, dupFD)
		}

		assert.False(t, daemon.IsLocalMySQLDown(t.Context()), "should not report MySQL as down when fds are exhausted")
	})
}
11 changes: 11 additions & 0 deletions go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,17 @@ func TestDownPrimary(t *testing.T) {
utils.CheckMetricExists(t, vtOrcProcess, "vtorc_planned_reparent_counts")
utils.CheckMetricExists(t, vtOrcProcess, "vtorc_reparent_shard_operation_timings_bucket")
})

// simulate case where the primary's mysqld and vttablet pods/services get restarted automatically
// because they (or the healthchecks) fail.
err = curPrimary.MysqlctlProcess.StartProvideInit(false)
require.NoError(t, err)
err = curPrimary.VttabletProcess.Setup()
require.NoError(t, err)
// verify the old primary rejoins as a replica with replication working
err = curPrimary.VttabletProcess.WaitForTabletTypes([]string{"replica"})
require.NoError(t, err)
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary, crossCellReplica}, 15*time.Second)
}

// bring down primary, with keyspace-level ERS disabled via SetVtorcEmergencyReparent --disable.
Expand Down
16 changes: 16 additions & 0 deletions go/vt/mysqlctl/fakemysqldaemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,12 @@ type FakeMysqlDaemon struct {

// Version is the version that will be returned by GetVersionString.
Version string

// MysqlLocal is used by IsMySQLLocal.
MysqlLocal bool

// MysqlDown is used by IsLocalMySQLDown.
MysqlDown bool
}

// NewFakeMysqlDaemon returns a FakeMysqlDaemon where mysqld appears
Expand All @@ -230,6 +236,16 @@ func (fmd *FakeMysqlDaemon) DB() *fakesqldb.DB {
return fmd.db
}

// IsMySQLLocal is part of the MysqlDaemon interface.
// The fake performs no detection of its own: it returns the value of the
// MysqlLocal field.
func (fmd *FakeMysqlDaemon) IsMySQLLocal() bool {
	return fmd.MysqlLocal
}

// IsLocalMySQLDown is part of the MysqlDaemon interface.
// The fake ignores the context and performs no probe: it returns the value of
// the MysqlDown field.
func (fmd *FakeMysqlDaemon) IsLocalMySQLDown(_ context.Context) bool {
	return fmd.MysqlDown
}

// Start is part of the MysqlDaemon interface.
func (fmd *FakeMysqlDaemon) Start(ctx context.Context, cnf *Mycnf, mysqldArgs ...string) error {
if fmd.Running {
Expand Down
2 changes: 2 additions & 0 deletions go/vt/mysqlctl/mysql_daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ import (
// MysqlDaemon is the interface we use for abstracting Mysqld.
type MysqlDaemon interface {
// methods related to mysql running or not
IsMySQLLocal() bool
IsLocalMySQLDown(ctx context.Context) bool
Start(ctx context.Context, cnf *Mycnf, mysqldArgs ...string) error
Shutdown(ctx context.Context, cnf *Mycnf, waitForMysqld bool, mysqlShutdownTimeout time.Duration) error
RunMysqlUpgrade(ctx context.Context) error
Expand Down
74 changes: 74 additions & 0 deletions go/vt/mysqlctl/mysqld.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,68 @@ func (mysqld *Mysqld) RunMysqlUpgrade(ctx context.Context) error {
return err
}

// IsMySQLLocal reports whether the DBA connection parameters point at a unix
// socket. It returns false when the DBA parameters cannot be resolved or when
// no unix socket is configured.
func (mysqld *Mysqld) IsMySQLLocal() bool {
	dbaParams, err := mysqld.dbcfgs.DbaConnector().MysqlParams()
	if err != nil {
		return false
	}
	return dbaParams.UnixSocket != ""
}

// IsLocalMySQLDown reports whether the local MySQL instance appears to be
// down, judged by attempting a DBA connection. The result is only meaningful
// when IsMySQLLocal returns true.
//
// It is deliberately conservative and returns true only when all of the
// following hold:
//   - the DBA connection attempt failed with CRConnectionError (errno 2002,
//     the unix-socket connection error);
//   - this process is not itself out of file descriptors;
//   - the configured unix socket path is either missing or is a real socket.
//
// Every other outcome, including any inconclusive one, yields false.
func (mysqld *Mysqld) IsLocalMySQLDown(ctx context.Context) bool {
	// A DBA connection that opens successfully proves MySQL is up.
	dbaConn, connErr := mysqld.GetDbaConnection(ctx)
	if connErr == nil {
		dbaConn.Close()
		return false
	}

	// MySQL had to be running to answer with "too many connections".
	if sqlerror.IsTooManyConnectionsErr(connErr) {
		return false
	}

	// Only CRConnectionError (errno 2002, unix socket) counts as evidence
	// that MySQL is down. TCP connection errors (errno 2003) may be caused by
	// the network rather than by MySQL.
	var mysqlErr *sqlerror.SQLError
	isSocketConnErr := errors.As(connErr, &mysqlErr) && mysqlErr.Num == sqlerror.CRConnectionError
	if !isSocketConnErr {
		return false
	}

	// Running out of file descriptors is a client-side condition that can
	// also surface as CRConnectionError (errno 2002); it says nothing about
	// MySQL's state.
	if isFileDescriptorExhaustedProbe() {
		return false
	}

	// Last check: inspect the configured unix socket path on disk.
	dbaParams, paramsErr := mysqld.dbcfgs.DbaConnector().MysqlParams()
	if paramsErr != nil || dbaParams.UnixSocket == "" {
		return false
	}

	sockInfo, statErr := os.Stat(dbaParams.UnixSocket)
	switch {
	case statErr == nil:
		// The path exists: conclude MySQL is down only if it is a socket.
		return sockInfo.Mode()&os.ModeSocket != 0
	case os.IsNotExist(statErr):
		// The socket file is gone: conclude MySQL is down.
		return true
	default:
		// Stat failed for some other reason; the result is inconclusive.
		return false
	}
}

// isFileDescriptorExhaustedProbe reports whether duplicating file descriptor 0
// fails with EMFILE or ENFILE. A live probe is used because the MySQL
// connector wraps the original syscall error. A descriptor obtained by the
// probe is closed again before returning.
func isFileDescriptorExhaustedProbe() bool {
	probeFD, dupErr := syscall.Dup(0)
	if dupErr == nil {
		// The duplicate was only needed to prove a descriptor was available.
		syscall.Close(probeFD)
		return false
	}

	switch {
	case errors.Is(dupErr, syscall.EMFILE), errors.Is(dupErr, syscall.ENFILE):
		return true
	default:
		// Any other Dup failure is not treated as descriptor exhaustion.
		return false
	}
}

// Start will start the mysql daemon, either by running the
// 'mysqld_start' hook, or by running mysqld_safe in the background.
// If a mysqlctld address is provided in a flag, Start will run
Expand Down Expand Up @@ -645,6 +707,18 @@ func (mysqld *Mysqld) Shutdown(ctx context.Context, cnf *Mycnf, waitForMysqld bo
return nil
}

// Stop replication before shutting down to avoid a brief race in
// MySQL's close_connections() (mysqld.cc) where close_listener()
// removes the unix socket before end_slave() stops replication
// threads. Best-effort with a tight timeout — if MySQL is already
// unreachable we proceed with shutdown regardless.
stopCtx, stopCancel := context.WithTimeout(ctx, 3*time.Second)
defer stopCancel()
if conn, err := getPoolReconnect(stopCtx, mysqld.dbaPool); err == nil {
mysqld.executeSuperQueryListConn(stopCtx, conn, []string{conn.Conn.StopReplicationCommand()})
conn.Recycle()
}

// try the preflight mysqld shutdown hook, if any
h := hook.NewSimpleHook("preflight_mysqld_shutdown")
hr := h.ExecuteContext(ctx)
Expand Down
77 changes: 77 additions & 0 deletions go/vt/mysqlctl/mysqld_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ import (
"os"
"strconv"
"strings"
"sync"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/mysql/fakesqldb"
"vitess.io/vitess/go/sqltypes"
"vitess.io/vitess/go/vt/dbconfigs"
Expand Down Expand Up @@ -355,3 +357,78 @@ func TestBuildLdPathsTZ(t *testing.T) {
assert.NoError(t, err)
assert.Contains(t, env, "TZ=Europe/Berlin")
}

// TestMysqldIsMySQLLocal checks that IsMySQLLocal is true for the connection
// parameters handed out by fakesqldb and false for host/port parameters.
func TestMysqldIsMySQLLocal(t *testing.T) {
	t.Run("unix socket", func(t *testing.T) {
		fakeDB := fakesqldb.New(t)
		defer fakeDB.Close()

		// Use the fake server's own connection parameters for the daemon.
		socketParams := *fakeDB.ConnParams()
		daemon := NewMysqld(dbconfigs.NewTestDBConfigs(socketParams, socketParams, "fakesqldb"))
		defer daemon.Close()

		assert.True(t, daemon.IsMySQLLocal())
	})

	t.Run("tcp", func(t *testing.T) {
		// Host and port only: no unix socket is configured.
		tcpParams := mysql.ConnParams{Host: "127.0.0.1", Port: 1}
		daemon := NewMysqld(dbconfigs.NewTestDBConfigs(tcpParams, tcpParams, ""))
		defer daemon.Close()

		assert.False(t, daemon.IsMySQLLocal())
	})
}

// TestMysqldIsLocalMySQLDown checks IsLocalMySQLDown against a fakesqldb
// server, first while it is serving and then after it has been closed.
func TestMysqldIsLocalMySQLDown(t *testing.T) {
	fakeDB := fakesqldb.New(t)

	// The "mysql is down" subtest closes the fake server itself, so Close is
	// wrapped in a sync.Once and the registered cleanup cannot run it again.
	var once sync.Once
	shutdownFakeDB := func() {
		once.Do(fakeDB.Close)
	}
	t.Cleanup(shutdownFakeDB)

	connParams := *fakeDB.ConnParams()
	daemon := NewMysqld(dbconfigs.NewTestDBConfigs(connParams, connParams, "fakesqldb"))
	defer daemon.Close()

	t.Run("mysql is reachable", func(t *testing.T) {
		down := daemon.IsLocalMySQLDown(context.Background())
		assert.False(t, down)
	})

	t.Run("mysql is down", func(t *testing.T) {
		// Closing the fake MySQL server simulates MySQL being down.
		shutdownFakeDB()

		down := daemon.IsLocalMySQLDown(context.Background())
		assert.True(t, down)
	})
}

// TestShutdownStopsReplication verifies that Shutdown sends STOP REPLICA to
// MySQL, by checking the fake server's query log after the call.
func TestShutdownStopsReplication(t *testing.T) {
	fakeDB := fakesqldb.New(t)
	defer fakeDB.Close()

	// Register the queries the fake server is expected to receive.
	emptyResult := &sqltypes.Result{}
	fakeDB.AddQuery("SELECT 1", emptyResult)
	fakeDB.AddQuery("STOP REPLICA", &sqltypes.Result{})

	connParams := *fakeDB.ConnParams()
	daemon := NewMysqld(dbconfigs.NewTestDBConfigs(connParams, connParams, "fakesqldb"))
	defer daemon.Close()

	// Shutdown bails out early without socket and pid files, so create empty
	// placeholders for both, each in its own temporary directory.
	socketPath := t.TempDir() + "/mysql.sock"
	pidPath := t.TempDir() + "/mysql.pid"
	cnf := &Mycnf{
		SocketFile: socketPath,
		PidFile:    pidPath,
	}
	require.NoError(t, os.WriteFile(cnf.SocketFile, nil, 0o600))
	require.NoError(t, os.WriteFile(cnf.PidFile, nil, 0o600))

	// The return value is deliberately not checked: Shutdown will fail at the
	// mysqladmin step, but STOP REPLICA is issued before that point.
	daemon.Shutdown(context.Background(), cnf, false, 1*time.Second)

	assert.Contains(t, fakeDB.QueryLog(), "stop replica")
}
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/rpc_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ func (tm *TabletManager) Backup(ctx context.Context, logger logutil.Logger, req
l.Errorf("Failed to convert bool to semisync action, error: %v", err)
return
}
if err := tm.setReplicationSourceLocked(bgCtx, shardPrimary.Alias, 0, "", false, semiSyncAction, 0); err != nil {
if err := tm.setReplicationSourceLocked(bgCtx, shardPrimary.Alias, 0, "", false, semiSyncAction, 0, false); err != nil {
l.Errorf("Failed to set replication source, error: %v", err)
}
}()
Expand Down
Loading
Loading