diff --git a/go/vt/vtctl/reparentutil/util.go b/go/vt/vtctl/reparentutil/util.go index 4b8a4cbc431..5d2f2e4fddf 100644 --- a/go/vt/vtctl/reparentutil/util.go +++ b/go/vt/vtctl/reparentutil/util.go @@ -128,12 +128,14 @@ func ElectNewPrimary( tb := tablet errorGroup.Go(func() error { // find and store the positions for the tablet - pos, replLag, takingBackup, err := findTabletPositionLagBackupStatus(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout) + pos, replLag, takingBackup, replUnknown, err := findTabletPositionLagBackupStatus(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout) mu.Lock() defer mu.Unlock() if err == nil && (opts.TolerableReplLag == 0 || opts.TolerableReplLag >= replLag) { if takingBackup { reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v is taking a backup", topoproto.TabletAliasString(tablet.Alias))) + } else if replUnknown { + reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v position known but unknown replication status", topoproto.TabletAliasString(tablet.Alias))) } else { validTablets = append(validTablets, tb) tabletPositions = append(tabletPositions, pos) @@ -167,7 +169,7 @@ func ElectNewPrimary( // findTabletPositionLagBackupStatus processes the replication position and lag for a single tablet and // returns it. It is safe to call from multiple goroutines. -func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, error) { +func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, bool, error) { logger.Infof("getting replication position from %v", topoproto.TabletAliasString(tablet.Alias)) ctx, cancel := context.WithTimeout(ctx, waitTimeout) @@ -178,10 +180,10 @@ func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.T sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError) if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica { logger.Warningf("no replication statue from %v, using empty gtid set", topoproto.TabletAliasString(tablet.Alias)) - return replication.Position{}, 0, false, nil + return replication.Position{}, 0, false, false, nil } logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", topoproto.TabletAliasString(tablet.Alias), err) - return replication.Position{}, 0, false, err + return replication.Position{}, 0, false, false, err } // Use the relay log position if available, otherwise use the executed GTID set (binary log position). @@ -192,10 +194,10 @@ func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.T pos, err := replication.DecodePosition(positionString) if err != nil { logger.Warningf("cannot decode replica position %v for tablet %v, ignoring tablet: %v", positionString, topoproto.TabletAliasString(tablet.Alias), err) - return replication.Position{}, 0, status.BackupRunning, err + return replication.Position{}, 0, status.BackupRunning, false, err } - return pos, time.Second * time.Duration(status.ReplicationLagSeconds), status.BackupRunning, nil + return pos, time.Second * time.Duration(status.ReplicationLagSeconds), status.BackupRunning, status.ReplicationLagUnknown, nil } // FindCurrentPrimary returns the current primary tablet of a shard, if any. The diff --git a/go/vt/vtctl/reparentutil/util_test.go b/go/vt/vtctl/reparentutil/util_test.go index 276bab2e443..5bded8e226c 100644 --- a/go/vt/vtctl/reparentutil/util_test.go +++ b/go/vt/vtctl/reparentutil/util_test.go @@ -140,6 +140,114 @@ func TestElectNewPrimary(t *testing.T) { }, errContains: nil, }, + { + name: "more advanced replica has an unknown replication lag", + tmc: &chooseNewPrimaryTestTMClient{ + // zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101 + replicationStatuses: map[string]*replicationdatapb.Status{ + "zone1-0000000101": { + Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1", + ReplicationLagSeconds: 10, + }, + "zone1-0000000102": { + Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5", + ReplicationLagUnknown: true, + }, + }, + }, + tolerableReplLag: 50 * time.Second, + shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{ + PrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + }, nil), + tabletMap: map[string]*topo.TabletInfo{ + "primary": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "replica1": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + "replica2": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + avoidPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 0, + }, + expected: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + errContains: nil, + }, + { + name: "replica with unknown replication lag", + tmc: &chooseNewPrimaryTestTMClient{ + // zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101 + replicationStatuses: map[string]*replicationdatapb.Status{ + "zone1-0000000101": { + Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1", + ReplicationLagUnknown: true, + }, + }, + }, + tolerableReplLag: 50 * time.Second, + shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{ + PrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + }, nil), + tabletMap: map[string]*topo.TabletInfo{ + "primary": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Type: topodatapb.TabletType_PRIMARY, + }, + }, + "replica1": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + Type: topodatapb.TabletType_REPLICA, + }, + }, + }, + avoidPrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 0, + }, + expected: nil, + errContains: []string{ + "zone1-0000000101 position known but unknown replication status", + }, + }, { name: "Two good replicas, but one of them is taking a backup so we pick the other one", tmc: &chooseNewPrimaryTestTMClient{ @@ -1049,13 +1157,14 @@ func TestFindPositionForTablet(t *testing.T) { ctx := context.Background() logger := logutil.NewMemoryLogger() tests := []struct { - name string - tmc *testutil.TabletManagerClient - tablet *topodatapb.Tablet - expectedPosition string - expectedLag time.Duration - expectedErr string - expectedTakingBackup bool + name string + tmc *testutil.TabletManagerClient + tablet *topodatapb.Tablet + expectedPosition string + expectedLag time.Duration + expectedErr string + expectedTakingBackup bool + expectedUnknownReplLag bool }{ { name: "executed gtid set", @@ -1170,12 +1279,36 @@ func TestFindPositionForTablet(t *testing.T) { }, }, expectedErr: `parse error: unknown GTIDSet flavor ""`, + }, { + name: "unknown replication lag", + tmc: &testutil.TabletManagerClient{ + ReplicationStatusResults: map[string]struct { + Position *replicationdatapb.Status + Error error + }{ + "zone1-0000000100": { + Position: &replicationdatapb.Status{ + RelayLogPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + ReplicationLagUnknown: true, + }, + }, + }, + }, + tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + }, + expectedLag: 0, + expectedPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + expectedUnknownReplLag: true, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - pos, lag, takingBackup, err := findTabletPositionLagBackupStatus(ctx, test.tablet, logger, test.tmc, 10*time.Second) + pos, lag, takingBackup, replUnknown, err := findTabletPositionLagBackupStatus(ctx, test.tablet, logger, test.tmc, 10*time.Second) if test.expectedErr != "" { require.EqualError(t, err, test.expectedErr) return @@ -1185,6 +1318,7 @@ func TestFindPositionForTablet(t *testing.T) { require.Equal(t, test.expectedPosition, posString) require.Equal(t, test.expectedLag, lag) require.Equal(t, test.expectedTakingBackup, takingBackup) + require.Equal(t, test.expectedUnknownReplLag, replUnknown) }) } }