Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions go/vt/vtctl/reparentutil/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,12 +128,14 @@ func ElectNewPrimary(
tb := tablet
errorGroup.Go(func() error {
// find and store the positions for the tablet
pos, replLag, takingBackup, err := findTabletPositionLagBackupStatus(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout)
pos, replLag, takingBackup, replUnknown, err := findTabletPositionLagBackupStatus(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout)
mu.Lock()
defer mu.Unlock()
if err == nil && (opts.TolerableReplLag == 0 || opts.TolerableReplLag >= replLag) {
if takingBackup {
reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v is taking a backup", topoproto.TabletAliasString(tablet.Alias)))
} else if replUnknown {
reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v position known but unknown replication status", topoproto.TabletAliasString(tablet.Alias)))
} else {
validTablets = append(validTablets, tb)
tabletPositions = append(tabletPositions, pos)
Expand Down Expand Up @@ -167,7 +169,7 @@ func ElectNewPrimary(

// findTabletPositionLagBackupStatus processes the replication position and lag for a single tablet and
// returns it. It is safe to call from multiple goroutines.
func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, error) {
func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, bool, error) {
logger.Infof("getting replication position from %v", topoproto.TabletAliasString(tablet.Alias))

ctx, cancel := context.WithTimeout(ctx, waitTimeout)
Expand All @@ -178,10 +180,10 @@ func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.T
sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError)
if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica {
logger.Warningf("no replication statue from %v, using empty gtid set", topoproto.TabletAliasString(tablet.Alias))
return replication.Position{}, 0, false, nil
return replication.Position{}, 0, false, false, nil
}
logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", topoproto.TabletAliasString(tablet.Alias), err)
return replication.Position{}, 0, false, err
return replication.Position{}, 0, false, false, err
}

// Use the relay log position if available, otherwise use the executed GTID set (binary log position).
Expand All @@ -192,10 +194,10 @@ func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.T
pos, err := replication.DecodePosition(positionString)
if err != nil {
logger.Warningf("cannot decode replica position %v for tablet %v, ignoring tablet: %v", positionString, topoproto.TabletAliasString(tablet.Alias), err)
return replication.Position{}, 0, status.BackupRunning, err
return replication.Position{}, 0, status.BackupRunning, false, err
}

return pos, time.Second * time.Duration(status.ReplicationLagSeconds), status.BackupRunning, nil
return pos, time.Second * time.Duration(status.ReplicationLagSeconds), status.BackupRunning, status.ReplicationLagUnknown, nil
}

// FindCurrentPrimary returns the current primary tablet of a shard, if any. The
Expand Down
150 changes: 142 additions & 8 deletions go/vt/vtctl/reparentutil/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,114 @@ func TestElectNewPrimary(t *testing.T) {
},
errContains: nil,
},
{
name: "more advanced replica has an unknown replication lag",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101
replicationStatuses: map[string]*replicationdatapb.Status{
"zone1-0000000101": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1",
ReplicationLagSeconds: 10,
},
"zone1-0000000102": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
ReplicationLagUnknown: true,
},
},
},
tolerableReplLag: 50 * time.Second,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
"replica2": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 102,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
avoidPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 0,
},
expected: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
errContains: nil,
},
{
name: "replica with unknown replication lag",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101
replicationStatuses: map[string]*replicationdatapb.Status{
"zone1-0000000101": {
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1",
ReplicationLagUnknown: true,
},
},
},
tolerableReplLag: 50 * time.Second,
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
PrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, nil),
tabletMap: map[string]*topo.TabletInfo{
"primary": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
Type: topodatapb.TabletType_PRIMARY,
},
},
"replica1": {
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
Type: topodatapb.TabletType_REPLICA,
},
},
},
avoidPrimaryAlias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 0,
},
expected: nil,
errContains: []string{
"zone1-0000000101 position known but unknown replication status",
},
},
{
name: "Two good replicas, but one of them is taking a backup so we pick the other one",
tmc: &chooseNewPrimaryTestTMClient{
Expand Down Expand Up @@ -1049,13 +1157,14 @@ func TestFindPositionForTablet(t *testing.T) {
ctx := context.Background()
logger := logutil.NewMemoryLogger()
tests := []struct {
name string
tmc *testutil.TabletManagerClient
tablet *topodatapb.Tablet
expectedPosition string
expectedLag time.Duration
expectedErr string
expectedTakingBackup bool
name string
tmc *testutil.TabletManagerClient
tablet *topodatapb.Tablet
expectedPosition string
expectedLag time.Duration
expectedErr string
expectedTakingBackup bool
expectedUnknownReplLag bool
}{
{
name: "executed gtid set",
Expand Down Expand Up @@ -1170,12 +1279,36 @@ func TestFindPositionForTablet(t *testing.T) {
},
},
expectedErr: `parse error: unknown GTIDSet flavor ""`,
}, {
name: "unknown replication lag",
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
Error error
}{
"zone1-0000000100": {
Position: &replicationdatapb.Status{
RelayLogPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
ReplicationLagUnknown: true,
},
},
},
},
tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
expectedLag: 0,
expectedPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
expectedUnknownReplLag: true,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
pos, lag, takingBackup, err := findTabletPositionLagBackupStatus(ctx, test.tablet, logger, test.tmc, 10*time.Second)
pos, lag, takingBackup, replUnknown, err := findTabletPositionLagBackupStatus(ctx, test.tablet, logger, test.tmc, 10*time.Second)
if test.expectedErr != "" {
require.EqualError(t, err, test.expectedErr)
return
Expand All @@ -1185,6 +1318,7 @@ func TestFindPositionForTablet(t *testing.T) {
require.Equal(t, test.expectedPosition, posString)
require.Equal(t, test.expectedLag, lag)
require.Equal(t, test.expectedTakingBackup, takingBackup)
require.Equal(t, test.expectedUnknownReplLag, replUnknown)
})
}
}
Expand Down
Loading