diff --git a/changelog/24.0/24.0.0/summary.md b/changelog/24.0/24.0.0/summary.md
index 4995b64117b..5b840e64209 100644
--- a/changelog/24.0/24.0.0/summary.md
+++ b/changelog/24.0/24.0.0/summary.md
@@ -6,6 +6,8 @@
- **[Minor Changes](#minor-changes)**
- **[VTGate](#minor-changes-vtgate)**
- [New default for `--legacy-replication-lag-algorithm` flag](#vtgate-new-default-legacy-replication-lag-algorithm)
+ - **[VTTablet](#minor-changes-vttablet)**
+ - [New Experimental flag `--init-tablet-type-lookup`](#vttablet-init-tablet-type-lookup)
## Minor Changes
@@ -18,3 +20,13 @@ The VTGate flag `--legacy-replication-lag-algorithm` now defaults to `false`, di
Instead, a simpler algorithm purely based on low lag, high lag and minimum number of tablets is used, which has proven to be more stable in many production environments. A detailed explanation of the two approaches [is explained in this code comment](https://github.com/vitessio/vitess/blob/main/go/vt/discovery/replicationlag.go#L125-L149).
In v25 this flag will become deprecated and in the following release it will be removed. In the meantime, the legacy behaviour can be used by setting `--legacy-replication-lag-algorithm=true`. This deprecation is tracked in https://github.com/vitessio/vitess/issues/18914.
+
+### <a id="minor-changes-vttablet"></a>VTTablet
+
+#### <a id="vttablet-init-tablet-type-lookup"></a>New Experimental flag `--init-tablet-type-lookup`
+
+The new experimental flag `--init-tablet-type-lookup` for VTTablet allows tablets to automatically restore their previous tablet type on restart by looking up the existing topology record, rather than always using the static `--init-tablet-type` value.
+
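+When enabled, the tablet uses its alias to look up the tablet type from the existing topology record on restart. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts without manual reconfiguration. If the flag is disabled or no topology record exists, the standard `--init-tablet-type` value is used instead. A recorded `PRIMARY` type is not restored directly: the tablet starts as `REPLICA` and the regular primary-ship check against the shard record decides whether it becomes `PRIMARY`. The transient `BACKUP` and `RESTORE` types are never restored; `--init-tablet-type` is used for them. If the topology lookup fails for any reason other than a missing record, the tablet fails to start.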
+
+**Note**: Vitess Operator–managed deployments generally do not keep tablet records in the topo between restarts, so this feature will not take effect in those environments.
diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt
index 28ce2b31260..cf66e8c84f2 100644
--- a/go/flags/endtoend/vtcombo.txt
+++ b/go/flags/endtoend/vtcombo.txt
@@ -174,7 +174,8 @@ Flags:
--init-db-name-override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_
--init-keyspace string (init parameter) keyspace to use for this tablet
--init-shard string (init parameter) shard to use for this tablet
- --init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.
+ --init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: REPLICA, RDONLY, and SPARE. The default is REPLICA.
+ --init-tablet-type-lookup (Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.
--init-tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet
--init-timeout duration (init parameter) timeout to use for the init phase. (default 1m0s)
--jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done
diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt
index 0b2e911768c..2a0cd50e65a 100644
--- a/go/flags/endtoend/vttablet.txt
+++ b/go/flags/endtoend/vttablet.txt
@@ -201,7 +201,8 @@ Flags:
--init-db-name-override string (init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_
--init-keyspace string (init parameter) keyspace to use for this tablet
--init-shard string (init parameter) shard to use for this tablet
- --init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.
+ --init-tablet-type string (init parameter) tablet type to use for this tablet. Valid values are: REPLICA, RDONLY, and SPARE. The default is REPLICA.
+ --init-tablet-type-lookup (Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.
--init-tags StringMap (init parameter) comma separated list of key:value pairs used to tag the tablet
--init-timeout duration (init parameter) timeout to use for the init phase. (default 1m0s)
--jaeger-agent-host string host and port to send spans to. if empty, no tracing will be done
diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go
index a3c84770ec3..11ce7731183 100644
--- a/go/vt/vttablet/tabletmanager/tm_init.go
+++ b/go/vt/vttablet/tabletmanager/tm_init.go
@@ -89,13 +89,14 @@ const (
var (
// The following flags initialize the tablet record.
- tabletHostname string
- initKeyspace string
- initShard string
- initTabletType string
- initDbNameOverride string
- skipBuildInfoTags = "/.*/"
- initTags flagutil.StringMapValue
+ tabletHostname string
+ initKeyspace string
+ initShard string
+ initTabletType string
+ initTabletTypeLookup bool
+ initDbNameOverride string
+ skipBuildInfoTags = "/.*/"
+ initTags flagutil.StringMapValue
initTimeout = 1 * time.Minute
mysqlShutdownTimeout = mysqlctl.DefaultShutdownTimeout
@@ -105,7 +106,8 @@ func registerInitFlags(fs *pflag.FlagSet) {
utils.SetFlagStringVar(fs, &tabletHostname, "tablet-hostname", tabletHostname, "if not empty, this hostname will be assumed instead of trying to resolve it")
utils.SetFlagStringVar(fs, &initKeyspace, "init-keyspace", initKeyspace, "(init parameter) keyspace to use for this tablet")
utils.SetFlagStringVar(fs, &initShard, "init-shard", initShard, "(init parameter) shard to use for this tablet")
- utils.SetFlagStringVar(fs, &initTabletType, "init-tablet-type", initTabletType, "(init parameter) tablet type to use for this tablet. Valid values are: PRIMARY, REPLICA, SPARE, and RDONLY. The default is REPLICA.")
+ utils.SetFlagStringVar(fs, &initTabletType, "init-tablet-type", initTabletType, "(init parameter) tablet type to use for this tablet. Valid values are: REPLICA, RDONLY, and SPARE. The default is REPLICA.")
+ fs.BoolVar(&initTabletTypeLookup, "init-tablet-type-lookup", initTabletTypeLookup, "(Experimental, init parameter) if enabled, uses tablet alias to look up the tablet type from the existing topology record on restart and use that instead of init-tablet-type. This allows tablets to maintain their changed roles (e.g., RDONLY/DRAINED) across restarts. If disabled or if no topology record exists, init-tablet-type will be used.")
utils.SetFlagStringVar(fs, &initDbNameOverride, "init-db-name-override", initDbNameOverride, "(init parameter) override the name of the db used by vttablet. Without this flag, the db name defaults to vt_")
utils.SetFlagStringVar(fs, &skipBuildInfoTags, "vttablet-skip-buildinfo-tags", skipBuildInfoTags, "comma-separated list of buildinfo tags to skip from merging with --init-tags. each tag is either an exact match or a regular expression of the form '/regexp/'.")
utils.SetFlagVar(fs, &initTags, "init-tags", "(init parameter) comma separated list of key:value pairs used to tag the tablet")
@@ -372,6 +374,42 @@ func (tm *TabletManager) Start(tablet *topodatapb.Tablet, config *tabletenv.Tabl
tm.DBConfigs.DBName = topoproto.TabletDbName(tablet)
tm.tabletAlias = tablet.Alias
tm.tmc = tmclient.NewTabletManagerClient()
+
+ // Check if there's an existing tablet record in topology and use it if flag is enabled
+ if initTabletTypeLookup {
+ ctx, cancel := context.WithTimeout(tm.BatchCtx, initTimeout)
+ defer cancel()
+ existingTablet, err := tm.TopoServer.GetTablet(ctx, tablet.Alias)
+ if err != nil && !topo.IsErrType(err, topo.NoNode) {
+ // Error other than "node doesn't exist" - return it
+ return vterrors.Wrap(err, "--init-tablet-type-lookup is enabled but failed to get existing tablet record from topology, unable to determine tablet type during startup")
+ }
+
+ // If we found an existing tablet record, determine which type to use
+ switch {
+ case err != nil:
+ // No existing tablet record found, use init-tablet-type
+ log.Infof("No existing tablet record found, using init-tablet-type: %v", tablet.Type)
+ case existingTablet.Type == topodatapb.TabletType_PRIMARY:
+ // Don't set to PRIMARY yet - let checkPrimaryShip() validate and decide
+ // checkPrimaryShip() has the logic to verify shard records and determine if this tablet should really be PRIMARY
+ log.Infof("Found existing tablet record with PRIMARY type, setting to REPLICA and allowing checkPrimaryShip() to validate")
+ tablet.Type = topodatapb.TabletType_REPLICA
+ case existingTablet.Type == topodatapb.TabletType_BACKUP || existingTablet.Type == topodatapb.TabletType_RESTORE:
+ // Skip transient operational types (BACKUP, RESTORE)
+ // These are temporary states that should not be preserved across restarts
+ log.Infof("Found existing tablet record with transient type %v, using init-tablet-type %v instead",
+ existingTablet.Type, tablet.Type)
+ default:
+ // Safe to restore the type for non-PRIMARY, non-transient types
+ log.Infof("Found existing tablet record with --init-tablet-type-lookup enabled, using tablet type %v from topology instead of init-tablet-type %v",
+ existingTablet.Type, tablet.Type)
+ tablet.Type = existingTablet.Type
+ }
+ } else {
+ log.Infof("Using init-tablet-type %v", tablet.Type)
+ }
+
tm.tmState = newTMState(tm, tablet)
tm.actionSema = semaphore.NewWeighted(1)
tm._waitForGrantsComplete = make(chan struct{})
diff --git a/go/vt/vttablet/tabletmanager/tm_init_test.go b/go/vt/vttablet/tabletmanager/tm_init_test.go
index 3d8b9fd132f..3758ab16fbb 100644
--- a/go/vt/vttablet/tabletmanager/tm_init_test.go
+++ b/go/vt/vttablet/tabletmanager/tm_init_test.go
@@ -961,3 +961,280 @@ func grantAllPrivilegesToUser(t *testing.T, connParams mysql.ConnParams, testUse
require.NoError(t, err)
conn.Close()
}
+
+// TestInitTabletTypeLookup_PreservesTabletTypes verifies that, with the lookup
+// flag enabled, a restart keeps the type found in the existing topo record
+// (RDONLY or DRAINED here) rather than falling back to REPLICA.
+func TestInitTabletTypeLookup_PreservesTabletTypes(t *testing.T) {
+	defer func(prev time.Duration) { rebuildKeyspaceRetryInterval = prev }(rebuildKeyspaceRetryInterval)
+	rebuildKeyspaceRetryInterval = 10 * time.Millisecond
+	defer func(prev bool) { initTabletTypeLookup = prev }(initTabletTypeLookup)
+
+	// Each case is a type written to the topo record between the two starts.
+	cases := []struct {
+		name          string
+		preservedType topodatapb.TabletType
+	}{
+		{name: "RDONLY", preservedType: topodatapb.TabletType_RDONLY},
+		{name: "DRAINED", preservedType: topodatapb.TabletType_DRAINED},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			cell := "cell1"
+			ctx := t.Context()
+			topoServer := memorytopo.NewServer(ctx, cell)
+			tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}
+
+			// First start: lookup disabled; the topo record is asserted to
+			// hold REPLICA afterwards.
+			initTabletTypeLookup = false
+			manager := newTestTM(t, topoServer, int(tabletAlias.Uid), "ks", "0", nil)
+			tablet := manager.Tablet()
+			ensureSrvKeyspace(t, ctx, topoServer, cell, "ks")
+			info, err := topoServer.GetTablet(ctx, tabletAlias)
+			require.NoError(t, err)
+			assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+			manager.Stop()
+
+			// Rewrite the stored type while the tablet is down, standing in
+			// for an operator-driven type change.
+			_, err = topoServer.UpdateTabletFields(ctx, tabletAlias, func(record *topodatapb.Tablet) error {
+				record.Type = tc.preservedType
+				return nil
+			})
+			require.NoError(t, err)
+
+			// Second start: lookup enabled, with the tablet object captured
+			// from the first start.
+			initTabletTypeLookup = true
+			require.NoError(t, manager.Start(tablet, nil))
+			defer manager.Stop()
+
+			// The topo record must still carry the type written above, not
+			// the REPLICA default.
+			info, err = topoServer.GetTablet(ctx, tabletAlias)
+			require.NoError(t, err)
+			assert.Equal(t, tc.preservedType, info.Type)
+		})
+	}
+}
+
+// TestInitTabletTypeLookup_PreservesPrimaryWithTermTime verifies that a tablet recorded as
+// PRIMARY, and named as primary by its shard, is PRIMARY again after a lookup-enabled restart.
+func TestInitTabletTypeLookup_PreservesPrimaryWithTermTime(t *testing.T) {
+	defer func(prev time.Duration) { rebuildKeyspaceRetryInterval = prev }(rebuildKeyspaceRetryInterval)
+	rebuildKeyspaceRetryInterval = 10 * time.Millisecond
+	defer func(prev bool) { initTabletTypeLookup = prev }(initTabletTypeLookup)
+
+	cell := "cell1"
+	ctx := t.Context()
+	topoServer := memorytopo.NewServer(ctx, cell)
+	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}
+
+	// First start: lookup disabled; the topo record is asserted to hold REPLICA afterwards.
+	initTabletTypeLookup = false
+	manager := newTestTM(t, topoServer, int(tabletAlias.Uid), "ks", "0", nil)
+	tablet := manager.Tablet()
+	ensureSrvKeyspace(t, ctx, topoServer, cell, "ks")
+	info, err := topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+	manager.Stop()
+
+	// Mark the tablet record as PRIMARY with a known term start time.
+	termStart := time.Now()
+	_, err = topoServer.UpdateTabletFields(ctx, tabletAlias, func(record *topodatapb.Tablet) error {
+		record.Type = topodatapb.TabletType_PRIMARY
+		record.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
+		return nil
+	})
+	require.NoError(t, err)
+
+	// Make the shard record agree (same primary alias and term start), so the
+	// checkPrimaryShip step of startup is expected to promote this tablet.
+	_, err = topoServer.UpdateShardFields(ctx, "ks", "0", func(shard *topo.ShardInfo) error {
+		shard.PrimaryAlias = tabletAlias
+		shard.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
+		return nil
+	})
+	require.NoError(t, err)
+
+	// Second start: lookup enabled.
+	initTabletTypeLookup = true
+	require.NoError(t, manager.Start(tablet, nil))
+	defer manager.Stop()
+
+	// The record must end up PRIMARY with the term start time written above.
+	info, err = topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_PRIMARY, info.Type)
+	assert.Equal(t, termStart.Unix(), info.GetPrimaryTermStartTime().Unix())
+}
+
+// TestInitTabletTypeLookup_FallbackWhenNoTopoRecord verifies that a first start
+// with the lookup flag enabled, and no prior topo record, yields REPLICA.
+func TestInitTabletTypeLookup_FallbackWhenNoTopoRecord(t *testing.T) {
+	defer func(prev time.Duration) { rebuildKeyspaceRetryInterval = prev }(rebuildKeyspaceRetryInterval)
+	rebuildKeyspaceRetryInterval = 10 * time.Millisecond
+	defer func(prev bool) { initTabletTypeLookup = prev }(initTabletTypeLookup)
+
+	cell := "cell1"
+	ctx := t.Context()
+	topoServer := memorytopo.NewServer(ctx, cell)
+	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}
+
+	// The topo server is fresh, so there is no tablet record to look up.
+	initTabletTypeLookup = true
+	manager := newTestTM(t, topoServer, int(tabletAlias.Uid), "ks", "0", nil)
+	defer manager.Stop()
+	ensureSrvKeyspace(t, ctx, topoServer, cell, "ks")
+
+	// With nothing to restore, the init tablet type (REPLICA) is expected.
+	info, err := topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+}
+
+// TestInitTabletTypeLookup_DisabledUsesInitType verifies that, with the lookup
+// flag off, a restart overwrites a changed topo type with the init type.
+func TestInitTabletTypeLookup_DisabledUsesInitType(t *testing.T) {
+	defer func(prev time.Duration) { rebuildKeyspaceRetryInterval = prev }(rebuildKeyspaceRetryInterval)
+	rebuildKeyspaceRetryInterval = 10 * time.Millisecond
+	defer func(prev bool) { initTabletTypeLookup = prev }(initTabletTypeLookup)
+
+	cell := "cell1"
+	ctx := t.Context()
+	topoServer := memorytopo.NewServer(ctx, cell)
+	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}
+
+	// First start: lookup disabled; the topo record is asserted to hold REPLICA afterwards.
+	initTabletTypeLookup = false
+	manager := newTestTM(t, topoServer, int(tabletAlias.Uid), "ks", "0", nil)
+	tablet := manager.Tablet()
+	ensureSrvKeyspace(t, ctx, topoServer, cell, "ks")
+	info, err := topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+	manager.Stop()
+
+	// Change the stored type to RDONLY while the tablet is down.
+	_, err = topoServer.UpdateTabletFields(ctx, tabletAlias, func(record *topodatapb.Tablet) error {
+		record.Type = topodatapb.TabletType_RDONLY
+		return nil
+	})
+	require.NoError(t, err)
+
+	// Second start: the flag is set to false again explicitly, so the stored
+	// RDONLY must be ignored.
+	initTabletTypeLookup = false
+	require.NoError(t, manager.Start(tablet, nil))
+	defer manager.Stop()
+
+	// The topo record is expected to be back at REPLICA.
+	info, err = topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+}
+
+// TestInitTabletTypeLookup_SkipsTransientTypes verifies that a BACKUP or RESTORE
+// type left in the topo record is not restored by a lookup-enabled restart; the
+// tablet comes back with the init tablet type (REPLICA) instead.
+func TestInitTabletTypeLookup_SkipsTransientTypes(t *testing.T) {
+	defer func(prev time.Duration) { rebuildKeyspaceRetryInterval = prev }(rebuildKeyspaceRetryInterval)
+	rebuildKeyspaceRetryInterval = 10 * time.Millisecond
+	defer func(prev bool) { initTabletTypeLookup = prev }(initTabletTypeLookup)
+
+	// Each case is a transient type written to the topo record between the
+	// two starts.
+	cases := []struct {
+		name          string
+		transientType topodatapb.TabletType
+	}{
+		{name: "BACKUP", transientType: topodatapb.TabletType_BACKUP},
+		{name: "RESTORE", transientType: topodatapb.TabletType_RESTORE},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			cell := "cell1"
+			ctx := t.Context()
+			topoServer := memorytopo.NewServer(ctx, cell)
+			tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}
+
+			// First start: lookup disabled; the topo record is asserted to
+			// hold REPLICA afterwards.
+			initTabletTypeLookup = false
+			manager := newTestTM(t, topoServer, int(tabletAlias.Uid), "ks", "0", nil)
+			tablet := manager.Tablet()
+			ensureSrvKeyspace(t, ctx, topoServer, cell, "ks")
+			info, err := topoServer.GetTablet(ctx, tabletAlias)
+			require.NoError(t, err)
+			assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+			manager.Stop()
+
+			// Leave a transient type in the record, as a crash in the middle
+			// of a backup or restore would.
+			_, err = topoServer.UpdateTabletFields(ctx, tabletAlias, func(record *topodatapb.Tablet) error {
+				record.Type = tc.transientType
+				return nil
+			})
+			require.NoError(t, err)
+
+			// Second start: lookup enabled, with the tablet object captured
+			// from the first start.
+			initTabletTypeLookup = true
+			require.NoError(t, manager.Start(tablet, nil))
+			defer manager.Stop()
+
+			// The transient type must not survive; the record is expected to
+			// be back at the init tablet type.
+			info, err = topoServer.GetTablet(ctx, tabletAlias)
+			require.NoError(t, err)
+			assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+		})
+	}
+}
+
+// TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip verifies that, with lookup enabled and a
+// REPLICA tablet record, a shard record naming this tablet as primary still yields PRIMARY.
+func TestInitTabletTypeLookup_InteractionWithCheckPrimaryShip(t *testing.T) {
+	defer func(prev time.Duration) { rebuildKeyspaceRetryInterval = prev }(rebuildKeyspaceRetryInterval)
+	rebuildKeyspaceRetryInterval = 10 * time.Millisecond
+	defer func(prev bool) { initTabletTypeLookup = prev }(initTabletTypeLookup)
+
+	cell := "cell1"
+	ctx := t.Context()
+	topoServer := memorytopo.NewServer(ctx, cell)
+	tabletAlias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}
+
+	// First start: lookup disabled; the topo record is asserted to hold REPLICA afterwards.
+	initTabletTypeLookup = false
+	manager := newTestTM(t, topoServer, int(tabletAlias.Uid), "ks", "0", nil)
+	tablet := manager.Tablet()
+	ensureSrvKeyspace(t, ctx, topoServer, cell, "ks")
+	info, err := topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_REPLICA, info.Type)
+	manager.Stop()
+
+	// Only the shard record is changed: it now names this tablet as primary.
+	// The tablet record itself is left untouched.
+	termStart := time.Now()
+	_, err = topoServer.UpdateShardFields(ctx, "ks", "0", func(shard *topo.ShardInfo) error {
+		shard.PrimaryAlias = tabletAlias
+		shard.PrimaryTermStartTime = protoutil.TimeToProto(termStart)
+		return nil
+	})
+	require.NoError(t, err)
+
+	// Second start: lookup enabled.
+	initTabletTypeLookup = true
+	require.NoError(t, manager.Start(tablet, nil))
+	defer manager.Stop()
+
+	// The shard record is expected to win: the tablet must end up PRIMARY.
+	info, err = topoServer.GetTablet(ctx, tabletAlias)
+	require.NoError(t, err)
+	assert.Equal(t, topodatapb.TabletType_PRIMARY, info.Type)
+}