diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 38aab5b9dc2..fd23ccbfcda 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -387,27 +387,37 @@ Cleanup: func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error) { // If the tablet is not replicating from anyone, then it could be the previous primary. // We should check for errant GTIDs by finding the difference with the shard's current primary. - if instance.primaryExecutedGtidSet == "" && instance.SourceHost == "" { - var primaryInstance *Instance - primaryAlias, _, _ := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard) - if primaryAlias != "" { - // Check if the current tablet is the primary. - // If it is, then we don't need to run errant gtid detection on it. - if primaryAlias == instance.InstanceAlias { - return nil - } - primaryInstance, _, _ = ReadInstance(primaryAlias) + primaryAlias, _, err := ReadShardPrimaryInformation(tablet.Keyspace, tablet.Shard) + if err != nil { + return fmt.Errorf("failed to read shard primary for %s/%s: %w", tablet.Keyspace, tablet.Shard, err) + } + + // Check if the current tablet is the primary. If it is, then we don't need to + // run errant GTID detection on it. + if primaryAlias == instance.InstanceAlias { + return nil + } + + var primaryInstance *Instance + if primaryAlias != "" { + primaryInstance, _, err = ReadInstance(primaryAlias) + if err != nil { + return fmt.Errorf("failed to read primary instance %q: %w", primaryAlias, err) } - // Only run errant GTID detection, if we are sure that the data read of the current primary - // is up-to-date enough to reflect that it has been promoted. This is needed to prevent - // flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs - // accepted by the old primary (this tablet) that don't show in the new primary's set. - if primaryInstance != nil { - if primaryInstance.SourceHost == "" { - instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet - } + } + + // Only run errant GTID detection if we are sure that the data read of the current primary + // is up-to-date enough to reflect that it has been promoted. This is needed to prevent + // flagging incorrect errant GTIDs. If we were to use old data, we could have some GTIDs + // accepted by the old primary (this tablet) that don't show in the new primary's set. + if primaryInstance != nil && primaryInstance.SourceHost == "" { + // If the instance has no replication source and no primary GTID set yet, or if the instance's replication + // source is not the primary, use the shard primary's executed GTID set for comparison. + if (instance.SourceHost == "" && instance.primaryExecutedGtidSet == "") || !sourceIsPrimary(instance, primaryInstance) { + instance.primaryExecutedGtidSet = primaryInstance.ExecutedGtidSet } } + if instance.ExecutedGtidSet != "" && instance.primaryExecutedGtidSet != "" { // Compare primary & replica GTID sets, but ignore the sets that present the primary's UUID. // This is because vtorc may pool primary and replica at an inconvenient timing, @@ -445,6 +455,19 @@ func detectErrantGTIDs(instance *Instance, tablet *topodatapb.Tablet) (err error return err } +// sourceIsPrimary returns true if the instance's replication source is the given primary. +func sourceIsPrimary(instance *Instance, primaryInstance *Instance) bool { + if instance.SourceHost == "" { + return false + } + + if instance.SourceUUID != "" && primaryInstance.ServerUUID != "" { + return instance.SourceUUID == primaryInstance.ServerUUID + } + + return instance.SourceHost == primaryInstance.Hostname && instance.SourcePort == primaryInstance.Port +} + // getKeyspaceShardName returns a single string having both the keyspace and shard func getKeyspaceShardName(keyspace, shard string) string { return fmt.Sprintf("%v:%v", keyspace, shard) diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index f6d5aa77ec7..1b9a7c98611 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -891,6 +891,29 @@ func TestExpireTableData(t *testing.T) { } func TestDetectErrantGTIDs(t *testing.T) { + keyspaceName := "ks" + shardName := "0" + tablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Keyspace: keyspaceName, + Shard: shardName, + } + primaryTablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 101, + }, + Keyspace: keyspaceName, + Shard: shardName, + Type: topodatapb.TabletType_PRIMARY, + + MysqlHostname: "primary-host", + MysqlPort: 6714, + } + tests := []struct { name string instance *Instance @@ -940,6 +963,8 @@ func TestDetectErrantGTIDs(t *testing.T) { primaryInstance: &Instance{ SourceHost: "", ExecutedGtidSet: "230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10589,8bc65c84-3fe4-11ed-a912-257f0fcdd6c9:1-34,316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-341", + Hostname: primaryTablet.MysqlHostname, + Port: int(primaryTablet.MysqlPort), }, wantErrantGTID: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:342", }, { @@ -956,24 +981,6 @@ func TestDetectErrantGTIDs(t *testing.T) { }, } - keyspaceName := "ks" - shardName := "0" - tablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone-1", - Uid: 100, - }, - Keyspace: keyspaceName, - Shard: shardName, - } - primaryTablet := &topodatapb.Tablet{ - Alias: &topodatapb.TabletAlias{ - Cell: "zone-1", - Uid: 101, - }, - Keyspace: keyspaceName, - Shard: shardName, - } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Clear the database after the test. The easiest way to do that is to run all the initialization commands again. @@ -990,6 +997,7 @@ func TestDetectErrantGTIDs(t *testing.T) { if tt.primaryInstance != nil { tt.primaryInstance.InstanceAlias = topoproto.TabletAliasString(primaryTablet.Alias) + err = SaveTablet(primaryTablet) require.NoError(t, err) err = WriteInstance(tt.primaryInstance, true, nil) @@ -1008,6 +1016,109 @@ func TestDetectErrantGTIDs(t *testing.T) { } } +// TestDetectErrantGTIDsWithWrongPrimarySource ensures that errant GTIDs are +// detected by comparing against the shard primary when a replica is pointed at +// a different source. +func TestDetectErrantGTIDsWithWrongPrimarySource(t *testing.T) { + defer func() { + db.ClearVTOrcDatabase() + }() + db.ClearVTOrcDatabase() + + keyspaceName := "ks" + shardName := "0" + + primaryUUID := "a5ce8e8a-4e56-11ef-9f7d-92339a7d9f6c" + wrongPrimaryUUID := "bb1db5f6-4e56-11ef-8bda-3ae65c7f7f5e" + replicaUUID := "cc946b58-4e56-11ef-9e6f-3e527c1a9e9d" + + primaryTablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 101, + }, + Keyspace: keyspaceName, + Shard: shardName, + Type: topodatapb.TabletType_PRIMARY, + + MysqlHostname: "primary-host", + MysqlPort: 6714, + } + + replicaTablet := &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone-1", + Uid: 100, + }, + Keyspace: keyspaceName, + Shard: shardName, + } + + err := SaveShard(topo.NewShardInfo(keyspaceName, shardName, &topodatapb.Shard{ + PrimaryAlias: primaryTablet.Alias, + }, nil)) + require.NoError(t, err) + + err = SaveTablet(primaryTablet) + require.NoError(t, err) + + // Create the real shard primary instance and give it a GTID set that does + // not include the replica UUID. This represents the authoritative primary. + primaryInstance := &Instance{ + InstanceAlias: topoproto.TabletAliasString(primaryTablet.Alias), + Hostname: "primary-host", + Port: 6714, + SourceHost: "", + ExecutedGtidSet: primaryUUID + ":1-10", + ServerUUID: primaryUUID, + } + + err = WriteInstance(primaryInstance, true, nil) + require.NoError(t, err) + + // Create a wrong primary instance that contains the replica UUID in its + // executed GTID set. This masks errant GTIDs if the replica uses it for + // comparison + wrongPrimaryInstance := &Instance{ + InstanceAlias: "zone-1-0000000102", + Hostname: "wrong-primary", + Port: 6720, + SourceHost: "", + ExecutedGtidSet: fmt.Sprintf("%s:1-10,%s:1-5", wrongPrimaryUUID, replicaUUID), + ServerUUID: wrongPrimaryUUID, + AncestryUUID: wrongPrimaryUUID, + } + + err = WriteInstance(wrongPrimaryInstance, true, nil) + require.NoError(t, err) + + // Create a replica instance that is pointed at the wrong primary. The replica + // contains errant GTIDs that should be caught. + replicaInstance := &Instance{ + InstanceAlias: topoproto.TabletAliasString(replicaTablet.Alias), + Hostname: "replica-host", + Port: 6711, + SourceHost: wrongPrimaryInstance.Hostname, + SourcePort: wrongPrimaryInstance.Port, + SourceUUID: wrongPrimaryInstance.ServerUUID, + ExecutedGtidSet: fmt.Sprintf("%s:1-10,%s:1-5", wrongPrimaryUUID, replicaUUID), + ServerUUID: replicaUUID, + } + + err = ReadInstanceClusterAttributes(replicaInstance) + require.NoError(t, err) + require.Equal(t, wrongPrimaryInstance.ExecutedGtidSet, replicaInstance.primaryExecutedGtidSet) + require.Equal(t, wrongPrimaryInstance.AncestryUUID, replicaInstance.AncestryUUID) + + // Run errant GTID detection. We should find some. + err = detectErrantGTIDs(replicaInstance, replicaTablet) + require.NoError(t, err) + + // The replica's own UUID entries should be reported as errant because the + // real shard primary does not include them. + require.Equal(t, replicaUUID+":1-5", replicaInstance.GtidErrant) +} + // TestPrimaryErrantGTIDs tests that we don't run Errant GTID detection on the primary tablet itself! func TestPrimaryErrantGTIDs(t *testing.T) { // Clear the database after the test. The easiest way to do that is to run all the initialization commands again.