diff --git a/go/vt/discovery/fake_healthcheck.go b/go/vt/discovery/fake_healthcheck.go index 345ea4a5351..f4346b40f2e 100644 --- a/go/vt/discovery/fake_healthcheck.go +++ b/go/vt/discovery/fake_healthcheck.go @@ -21,6 +21,7 @@ import ( "sync" "vitess.io/vitess/go/vt/topo" + "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/vttablet/queryservice" "vitess.io/vitess/go/vt/vttablet/sandboxconn" @@ -104,6 +105,17 @@ func (fhc *FakeHealthCheck) RemoveTablet(tablet *topodatapb.Tablet) { fhc.mu.Lock() defer fhc.mu.Unlock() key := TabletToMapKey(tablet) + item, ok := fhc.items[key] + if !ok { + return + } + // Make sure the key still corresponds to the tablet we want to delete. + // If it doesn't match, we should do nothing. The tablet we were asked to + // delete is already gone, and some other tablet is using the key + // (host:port) that the original tablet used to use, which is fine. + if !topoproto.TabletAliasEqual(tablet.Alias, item.ts.Tablet.Alias) { + return + } delete(fhc.items, key) } diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 2e5e527b9f8..fbc322b8e1b 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -688,6 +688,17 @@ func (hc *HealthCheckImpl) deleteConn(tablet *topodatapb.Tablet) { if !ok { return } + // Make sure the key still corresponds to the tablet we want to delete. + // If it doesn't match, we should do nothing. The tablet we were asked to + // delete is already gone, and some other tablet is using the key + // (host:port) that the original tablet used to use, which is fine. + if !topoproto.TabletAliasEqual(tablet.Alias, th.latestTabletStats.Tablet.Alias) { + return + } + hc.deleteConnLocked(key, th) +} + +func (hc *HealthCheckImpl) deleteConnLocked(key string, th *tabletHealth) { th.latestTabletStats.Up = false th.cancelFunc() delete(hc.addrToHealth, key) @@ -733,10 +744,18 @@ func (hc *HealthCheckImpl) AddTablet(tablet *topodatapb.Tablet, name string) { hc.mu.Unlock() return } - if _, ok := hc.addrToHealth[key]; ok { - hc.mu.Unlock() - log.Warningf("adding duplicate tablet %v for %v: %+v", name, tablet.Alias.Cell, tablet) - return + if th, ok := hc.addrToHealth[key]; ok { + // Something already exists at this key. + // If it's the same tablet, something is wrong. + if topoproto.TabletAliasEqual(th.latestTabletStats.Tablet.Alias, tablet.Alias) { + hc.mu.Unlock() + log.Warningf("refusing to add duplicate tablet %v for %v: %+v", name, tablet.Alias.Cell, tablet) + return + } + // If it's a different tablet, then we trust this new tablet that claims + // it has taken over the host:port that the old tablet used to be on. + // Remove the old tablet to clear the way. + hc.deleteConnLocked(key, th) } hc.addrToHealth[key] = &tabletHealth{ cancelFunc: cancelFunc, @@ -752,15 +771,13 @@ func (hc *HealthCheckImpl) AddTablet(tablet *topodatapb.Tablet, name string) { // RemoveTablet removes the tablet, and stops the health check. // It does not block. func (hc *HealthCheckImpl) RemoveTablet(tablet *topodatapb.Tablet) { - go hc.deleteConn(tablet) + hc.deleteConn(tablet) } // ReplaceTablet removes the old tablet and adds the new tablet. func (hc *HealthCheckImpl) ReplaceTablet(old, new *topodatapb.Tablet, name string) { - go func() { - hc.deleteConn(old) - hc.AddTablet(new, name) - }() + hc.deleteConn(old) + hc.AddTablet(new, name) } // WaitForInitialStatsUpdates waits until all tablets added via AddTablet() call diff --git a/go/vt/discovery/topology_watcher_test.go b/go/vt/discovery/topology_watcher_test.go index 9288cca1bed..315160a5946 100644 --- a/go/vt/discovery/topology_watcher_test.go +++ b/go/vt/discovery/topology_watcher_test.go @@ -234,6 +234,56 @@ func checkWatcher(t *testing.T, cellTablets, refreshKnownTablets bool) { t.Errorf("fhc.GetAllTablets() = %+v; want %v => %+v", allTablets, key, tablet2) } + // Both tablets restart on different hosts. + // tablet2 happens to land on the host:port that tablet 1 used to be on. + // This can only be tested when we refresh known tablets. + if refreshKnownTablets { + origTablet := *tablet + origTablet2 := *tablet2 + + if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error { + t.Hostname = tablet.Hostname + t.PortMap = tablet.PortMap + tablet2 = t + return nil + }); err != nil { + t.Fatalf("UpdateTabletFields failed: %v", err) + } + if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error { + t.Hostname = "host3" + tablet = t + return nil + }); err != nil { + t.Fatalf("UpdateTabletFields failed: %v", err) + } + tw.loadTablets() + counts = checkOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2}) + allTablets = fhc.GetAllTablets() + key2 := TabletToMapKey(tablet2) + if _, ok := allTablets[key2]; !ok { + t.Fatalf("tablet was lost because it's reusing an address recently used by another tablet: %v", key2) + } + + // Change tablets back to avoid altering later tests. + if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error { + t.Hostname = origTablet2.Hostname + t.PortMap = origTablet2.PortMap + tablet2 = t + return nil + }); err != nil { + t.Fatalf("UpdateTabletFields failed: %v", err) + } + if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error { + t.Hostname = origTablet.Hostname + tablet = t + return nil + }); err != nil { + t.Fatalf("UpdateTabletFields failed: %v", err) + } + tw.loadTablets() + counts = checkOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2}) + } + // Remove the tablet and check that it is detected as being gone. if err := ts.DeleteTablet(context.Background(), tablet.Alias); err != nil { t.Fatalf("DeleteTablet failed: %v", err)