From 4ada9da984cac76a6bf22700e9d891cdca654607 Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Fri, 30 Jan 2026 11:44:09 -0800 Subject: [PATCH 1/6] Improve error messages when tablets are unreachable during PlannedReparentShard Add detailed context to errors in verifyAllTabletsReachable to help operators identify which specific tablet is unreachable during PRS. Changes: - Show which tablet alias is unreachable - Include timeout value in error message - Distinguish between timeout and other reachability failures This helps quickly identify the problematic tablet instead of just seeing "context deadline exceeded". Co-Authored-By: Claude Signed-off-by: Tanjin Xu --- go/vt/vtctl/reparentutil/planned_reparenter.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/go/vt/vtctl/reparentutil/planned_reparenter.go b/go/vt/vtctl/reparentutil/planned_reparenter.go index dcd6dc7c590..7ae52080567 100644 --- a/go/vt/vtctl/reparentutil/planned_reparenter.go +++ b/go/vt/vtctl/reparentutil/planned_reparenter.go @@ -753,10 +753,15 @@ func (pr *PlannedReparenter) verifyAllTabletsReachable(ctx context.Context, tabl errorGroup, groupCtx := errgroup.WithContext(verifyCtx) for tblStr, info := range tabletMap { tablet := info.Tablet + tabletAlias := tblStr errorGroup.Go(func() error { statusValues, err := pr.tmc.GetGlobalStatusVars(groupCtx, tablet, []string{InnodbBufferPoolsDataVar}) if err != nil { - return err + if groupCtx.Err() == context.DeadlineExceeded { + return vterrors.Wrapf(err, "timed out verifying tablet %v is reachable (timeout: %v); all tablets must be reachable for PlannedReparent", + tabletAlias, topo.RemoteOperationTimeout) + } + return vterrors.Wrapf(err, "failed to verify tablet %v is reachable", tabletAlias) } // We are ignoring the error in conversion because some MySQL variants might not have this // status variable like MariaDB. From 469c5753ba83394affd4a717759bed5111ddcc19 Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Fri, 30 Jan 2026 12:58:59 -0800 Subject: [PATCH 2/6] Remove duplicate error logging in legacy vtctl commands The error was being logged three times: 1. fmt.Printf in legacy_shim.go (user-friendly output) 2. log.Error in legacy_shim.go (duplicate) 3. log.Error in main.go (catch-all for all commands) Removed the duplicate log.Error call in legacy_shim.go and the unused log import. Now errors are only logged once with timestamp plus the user-friendly Printf output. Before: Error: rpc error... E0130 12:55:57.535295 1766589 legacy_shim.go:97] remote error: rpc error... E0130 12:55:57.539447 1766589 main.go:56] remote error: rpc error... After: Error: rpc error... E0130 12:55:57.539447 1766589 main.go:56] remote error: rpc error... Co-Authored-By: Claude Signed-off-by: Tanjin Xu --- go/cmd/vtctldclient/command/legacy_shim.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/go/cmd/vtctldclient/command/legacy_shim.go b/go/cmd/vtctldclient/command/legacy_shim.go index 71ae9cca73a..cb035ef4bdf 100644 --- a/go/cmd/vtctldclient/command/legacy_shim.go +++ b/go/cmd/vtctldclient/command/legacy_shim.go @@ -25,7 +25,6 @@ import ( "vitess.io/vitess/go/cmd/vtctldclient/cli" "vitess.io/vitess/go/internal/flag" - "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/vtctl/vtctlclient" @@ -94,7 +93,7 @@ func runLegacyCommand(ctx context.Context, args []string) error { errStr := strings.ReplaceAll(err.Error(), "remote error: ", "") fmt.Printf("%s Error: %s\n", flag.Arg(0), errStr) - log.Error(err) + // Don't log here - error will be logged by main.go to avoid duplicate logging } return err From 12fd4600978209976840672e9a4363032356499c Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Fri, 30 Jan 2026 13:01:33 -0800 Subject: [PATCH 3/6] Show error only once in legacy vtctl commands Previously errors were shown twice: 1. fmt.Printf in legacy_shim.go (user-friendly output) 2. log.Error in main.go (timestamped log entry) Now only the user-friendly error message is shown. The command still exits with code 1 on errors by calling exit.Return(1), but returns nil to prevent main.go from logging the error again. Before: Error: rpc error: code = Unknown desc = timed out... E0130 12:55:57.539447 1766589 main.go:56] remote error: rpc error... After: Error: rpc error: code = Unknown desc = timed out... Co-Authored-By: Claude Signed-off-by: Tanjin Xu --- go/cmd/vtctldclient/command/legacy_shim.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/go/cmd/vtctldclient/command/legacy_shim.go b/go/cmd/vtctldclient/command/legacy_shim.go index cb035ef4bdf..de4c459b819 100644 --- a/go/cmd/vtctldclient/command/legacy_shim.go +++ b/go/cmd/vtctldclient/command/legacy_shim.go @@ -24,6 +24,7 @@ import ( "github.com/spf13/cobra" "vitess.io/vitess/go/cmd/vtctldclient/cli" + "vitess.io/vitess/go/exit" "vitess.io/vitess/go/internal/flag" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/vtctl/vtctlclient" @@ -93,10 +94,12 @@ func runLegacyCommand(ctx context.Context, args []string) error { errStr := strings.ReplaceAll(err.Error(), "remote error: ", "") fmt.Printf("%s Error: %s\n", flag.Arg(0), errStr) - // Don't log here - error will be logged by main.go to avoid duplicate logging + // Exit with error code 1, but return nil to prevent duplicate logging in main.go + exit.Return(1) + return nil } - return err + return nil } func init() { From c1dbed08057012576c43bfcafe2079ddd9bf921b Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Fri, 30 Jan 2026 13:05:32 -0800 Subject: [PATCH 4/6] Revert changes to legacy_shim.go Restore the original error logging behavior in legacy vtctl commands. Co-Authored-By: Claude Signed-off-by: Tanjin Xu --- go/cmd/vtctldclient/command/legacy_shim.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/go/cmd/vtctldclient/command/legacy_shim.go b/go/cmd/vtctldclient/command/legacy_shim.go index de4c459b819..71ae9cca73a 100644 --- a/go/cmd/vtctldclient/command/legacy_shim.go +++ b/go/cmd/vtctldclient/command/legacy_shim.go @@ -24,8 +24,8 @@ import ( "github.com/spf13/cobra" "vitess.io/vitess/go/cmd/vtctldclient/cli" - "vitess.io/vitess/go/exit" "vitess.io/vitess/go/internal/flag" + "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" "vitess.io/vitess/go/vt/vtctl/vtctlclient" @@ -94,12 +94,10 @@ func runLegacyCommand(ctx context.Context, args []string) error { errStr := strings.ReplaceAll(err.Error(), "remote error: ", "") fmt.Printf("%s Error: %s\n", flag.Arg(0), errStr) - // Exit with error code 1, but return nil to prevent duplicate logging in main.go - exit.Return(1) - return nil + log.Error(err) } - return nil + return err } func init() { From 6c61df89dce5c93bfd5420818fe0326643540342 Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Fri, 30 Jan 2026 14:16:05 -0800 Subject: [PATCH 5/6] Update test expectation for improved error message Update TestPlannedReparentShard/tablet_unreachable to expect the new improved error message that includes the tablet alias. Before: "global status vars failed" After: "failed to verify tablet zone1-0000000200 is reachable: global status vars failed" This matches the improvement made to verifyAllTabletsReachable in planned_reparenter.go which adds context about which tablet failed. Co-Authored-By: Claude Signed-off-by: Tanjin Xu --- go/vt/vtctl/grpcvtctldserver/server_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtctl/grpcvtctldserver/server_test.go b/go/vt/vtctl/grpcvtctldserver/server_test.go index 180d4b92622..4b91153ab3b 100644 --- a/go/vt/vtctl/grpcvtctldserver/server_test.go +++ b/go/vt/vtctl/grpcvtctldserver/server_test.go @@ -9029,7 +9029,7 @@ func TestPlannedReparentShard(t *testing.T) { WaitReplicasTimeout: protoutil.DurationToProto(time.Millisecond * 10), }, expectEventsToOccur: true, - expectedErr: "global status vars failed", + expectedErr: "failed to verify tablet zone1-0000000200 is reachable: global status vars failed", }, } From 6ec3c93111df7100d69f0ea44d16de5b63729bca Mon Sep 17 00:00:00 2001 From: Tanjin Xu Date: Fri, 30 Jan 2026 15:00:13 -0800 Subject: [PATCH 6/6] Move verbose ERS operational logs from client to server logs Changed verbose operational logs in EmergencyReparentShard from erp.logger.Infof() to log.Infof() so they stay in vtctld server logs instead of streaming back to the CLI client. This dramatically reduces CLI output verbosity by keeping operational details on the server, while still showing important user-facing messages like errors and warnings. Changed logs: - "will initiate emergency reparent shard..." - "Getting a new durability policy..." - "getting replication position from..." - "started finding the intermediate source" - "finding intermediate source - sorted replica..." - "intermediate source selected..." - "intermediate source is ideal candidate..." - "setting up %v as new primary for uninitialized cluster" - "starting promotion for the new primary..." - "populating reparent journal on new primary..." - "setting new primary on replica..." - "found better candidate..." - "Removing %s from list of valid candidates..." (various reasons) - "Setting %s in list of valid candidates taking a backup" - "replica %v thinks it's primary but we failed to demote it..." Co-Authored-By: Claude Signed-off-by: Tanjin Xu --- .../reparentutil/emergency_reparenter.go | 30 +++++++++---------- go/vt/vtctl/reparentutil/replication.go | 4 +-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index f2042ea38df..c702f9c43c8 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -146,7 +146,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet // reparentShardLocked performs Emergency Reparent Shard operation assuming that the shard is already locked func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) { // log the starting of the operation and increment the counter - erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard) + log.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard) var ( stoppedReplicationSnapshot *replicationSnapshot @@ -180,7 +180,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve return err } - erp.logger.Infof("Getting a new durability policy for %v", keyspaceDurability) + log.Infof("Getting a new durability policy for %v", keyspaceDurability) opts.durability, err = policy.GetDurabilityPolicy(keyspaceDurability) if err != nil { return err @@ -251,7 +251,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve if err != nil { return err } - erp.logger.Infof("intermediate source selected - %v", intermediateSource.Alias) + log.Infof("intermediate source selected - %v", intermediateSource.Alias) // After finding the intermediate source, we want to filter the valid candidate list by the following criteria - // 1. Only keep the tablets which can make progress after being promoted (have sufficient reachable semi-sync ackers) @@ -269,7 +269,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve if err != nil { return err } - erp.logger.Infof("intermediate source is ideal candidate- %v", isIdeal) + log.Infof("intermediate source is ideal candidate- %v", isIdeal) // Check (again) we still have the topology lock. if err := topo.CheckShardLocked(ctx, keyspace, shard); err != nil { @@ -399,7 +399,7 @@ func (erp *EmergencyReparenter) findMostAdvanced( tabletMap map[string]*topo.TabletInfo, opts EmergencyReparentOptions, ) (*topodatapb.Tablet, []*topodatapb.Tablet, error) { - erp.logger.Infof("started finding the intermediate source") + log.Infof("started finding the intermediate source") // convert the valid candidates into a list so that we can use it for sorting validTablets, tabletPositions, err := getValidCandidatesAndPositionsAsList(validCandidates, tabletMap) if err != nil { @@ -412,7 +412,7 @@ func (erp *EmergencyReparenter) findMostAdvanced( return nil, nil, err } for _, tablet := range validTablets { - erp.logger.Infof("finding intermediate source - sorted replica: %v", tablet.Alias) + log.Infof("finding intermediate source - sorted replica: %v", tablet.Alias) } // The first tablet in the sorted list will be the most eligible candidate unless explicitly asked for some other tablet @@ -537,18 +537,18 @@ func (erp *EmergencyReparenter) reparentReplicas( var position string var err error if ev.ShardInfo.PrimaryAlias == nil { - erp.logger.Infof("setting up %v as new primary for an uninitialized cluster", alias) + log.Infof("setting up %v as new primary for an uninitialized cluster", alias) // we call InitPrimary when the PrimaryAlias in the ShardInfo is empty. This happens when we have an uninitialized cluster. position, err = erp.tmc.InitPrimary(primaryCtx, tablet, policy.SemiSyncAckers(opts.durability, tablet) > 0) } else { - erp.logger.Infof("starting promotion for the new primary - %v", alias) + log.Infof("starting promotion for the new primary - %v", alias) // we call PromoteReplica which changes the tablet type, fixes the semi-sync, set the primary to read-write and flushes the binlogs position, err = erp.tmc.PromoteReplica(primaryCtx, tablet, policy.SemiSyncAckers(opts.durability, tablet) > 0) } if err != nil { return vterrors.Wrapf(err, "primary-elect tablet %v failed to be upgraded to primary: %v", alias, err) } - erp.logger.Infof("populating reparent journal on new primary %v", alias) + log.Infof("populating reparent journal on new primary %v", alias) err = erp.tmc.PopulateReparentJournal(primaryCtx, tablet, now, opts.lockAction, tablet.Alias, position) if err != nil { return vterrors.Wrapf(err, "failed to PopulateReparentJournal on primary: %v", err) @@ -559,7 +559,7 @@ func (erp *EmergencyReparenter) reparentReplicas( handleReplica := func(alias string, ti *topo.TabletInfo) { defer replWg.Done() - erp.logger.Infof("setting new primary on replica %v", alias) + log.Infof("setting new primary on replica %v", alias) forceStart := false if status, ok := statusMap[alias]; ok { @@ -694,7 +694,7 @@ func (erp *EmergencyReparenter) identifyPrimaryCandidate( ) (candidate *topodatapb.Tablet, err error) { defer func() { if candidate != nil { - erp.logger.Infof("found better candidate - %v", candidate.Alias) + log.Infof("found better candidate - %v", candidate.Alias) } }() @@ -747,7 +747,7 @@ func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb tabletAliasStr := topoproto.TabletAliasString(tablet.Alias) // Remove tablets which have MustNot promote rule since they must never be promoted if policy.PromotionRule(opts.durability, tablet) == promotionrule.MustNot { - erp.logger.Infof("Removing %s from list of valid candidates for promotion because it has the Must Not promote rule", tabletAliasStr) + log.Infof("Removing %s from list of valid candidates for promotion because it has the Must Not promote rule", tabletAliasStr) if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) { return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s has a must not promotion rule", topoproto.TabletAliasString(opts.NewPrimaryAlias)) } @@ -755,7 +755,7 @@ func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb } // If ERS is configured to prevent cross cell promotions, remove any tablet not from the same cell as the previous primary if opts.PreventCrossCellPromotion && prevPrimary != nil && tablet.Alias.Cell != prevPrimary.Alias.Cell { - erp.logger.Infof("Removing %s from list of valid candidates for promotion because it isn't in the same cell as the previous primary", tabletAliasStr) + log.Infof("Removing %s from list of valid candidates for promotion because it isn't in the same cell as the previous primary", tabletAliasStr) if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) { return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s is is a different cell as the previous primary", topoproto.TabletAliasString(opts.NewPrimaryAlias)) } @@ -763,7 +763,7 @@ func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb } // Remove any tablet which cannot make forward progress using the list of tablets we have reached if !canEstablishForTablet(opts.durability, tablet, tabletsReachable) { - erp.logger.Infof("Removing %s from list of valid candidates for promotion because it will not be able to make forward progress on promotion with the tablets currently reachable", tabletAliasStr) + log.Infof("Removing %s from list of valid candidates for promotion because it will not be able to make forward progress on promotion with the tablets currently reachable", tabletAliasStr) if opts.NewPrimaryAlias != nil && topoproto.TabletAliasEqual(opts.NewPrimaryAlias, tablet.Alias) { return nil, vterrors.Errorf(vtrpc.Code_ABORTED, "proposed primary %s will not be able to make forward progress on being promoted", topoproto.TabletAliasString(opts.NewPrimaryAlias)) } @@ -772,7 +772,7 @@ func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb // Put candidates that are running a backup in a separate list backingUp, ok := tabletsBackupState[tabletAliasStr] if ok && backingUp { - erp.logger.Infof("Setting %s in list of valid candidates taking a backup", tabletAliasStr) + log.Infof("Setting %s in list of valid candidates taking a backup", tabletAliasStr) notPreferredValidTablets = append(notPreferredValidTablets, tablet) } else { restrictedValidTablets = append(restrictedValidTablets, tablet) diff --git a/go/vt/vtctl/reparentutil/replication.go b/go/vt/vtctl/reparentutil/replication.go index 096cb7166ee..8624a6bcf9c 100644 --- a/go/vt/vtctl/reparentutil/replication.go +++ b/go/vt/vtctl/reparentutil/replication.go @@ -247,7 +247,7 @@ func stopReplicationAndBuildStatusMaps( errChan <- concurrencyErr }() - logger.Infof("getting replication position from %v", alias) + log.Infof("getting replication position from %v", alias) stopReplicationStatus, err := tmc.StopReplicationAndGetStatus(groupCtx, tabletInfo.Tablet, replicationdatapb.StopReplicationMode_IOTHREADONLY) if err != nil { @@ -260,7 +260,7 @@ func stopReplicationAndBuildStatusMaps( msg := "replica %v thinks it's primary but we failed to demote it: %v" err = vterrors.Wrapf(err, msg, alias, err) - logger.Warningf(msg, alias, err) + log.Warningf(msg, alias, err) return }