Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions go/vt/discovery/fake_healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"sync"

"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/vttablet/queryservice"
"vitess.io/vitess/go/vt/vttablet/sandboxconn"

Expand Down Expand Up @@ -104,6 +105,17 @@ func (fhc *FakeHealthCheck) RemoveTablet(tablet *topodatapb.Tablet) {
fhc.mu.Lock()
defer fhc.mu.Unlock()
key := TabletToMapKey(tablet)
item, ok := fhc.items[key]
if !ok {
return
}
// Make sure the key still corresponds to the tablet we want to delete.
// If it doesn't match, we should do nothing. The tablet we were asked to
// delete is already gone, and some other tablet is using the key
// (host:port) that the original tablet used to use, which is fine.
if !topoproto.TabletAliasEqual(tablet.Alias, item.ts.Tablet.Alias) {
return
}
delete(fhc.items, key)
}

Expand Down
35 changes: 26 additions & 9 deletions go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,17 @@ func (hc *HealthCheckImpl) deleteConn(tablet *topodatapb.Tablet) {
if !ok {
return
}
// Make sure the key still corresponds to the tablet we want to delete.
// If it doesn't match, we should do nothing. The tablet we were asked to
// delete is already gone, and some other tablet is using the key
// (host:port) that the original tablet used to use, which is fine.
if !topoproto.TabletAliasEqual(tablet.Alias, th.latestTabletStats.Tablet.Alias) {
return
}
hc.deleteConnLocked(key, th)
}

func (hc *HealthCheckImpl) deleteConnLocked(key string, th *tabletHealth) {
th.latestTabletStats.Up = false
th.cancelFunc()
delete(hc.addrToHealth, key)
Expand Down Expand Up @@ -733,10 +744,18 @@ func (hc *HealthCheckImpl) AddTablet(tablet *topodatapb.Tablet, name string) {
hc.mu.Unlock()
return
}
if _, ok := hc.addrToHealth[key]; ok {
hc.mu.Unlock()
log.Warningf("adding duplicate tablet %v for %v: %+v", name, tablet.Alias.Cell, tablet)
return
if th, ok := hc.addrToHealth[key]; ok {
// Something already exists at this key.
// If it's the same tablet, something is wrong.
if topoproto.TabletAliasEqual(th.latestTabletStats.Tablet.Alias, tablet.Alias) {
hc.mu.Unlock()
log.Warningf("refusing to add duplicate tablet %v for %v: %+v", name, tablet.Alias.Cell, tablet)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should eventually become an acceptable state, which will allow us to "update if needed" kind of calls. But let the warning remain for now. It will allow us to find out if this really happens in the wild.

The other future improvement on this will be to add a timestamp that tracks when a tablet acquired the address to declare the true winner. Similar to how we're solving the mastership story.

return
}
// If it's a different tablet, then we trust this new tablet that claims
// it has taken over the host:port that the old tablet used to be on.
// Remove the old tablet to clear the way.
hc.deleteConnLocked(key, th)
}
hc.addrToHealth[key] = &tabletHealth{
cancelFunc: cancelFunc,
Expand All @@ -752,15 +771,13 @@ func (hc *HealthCheckImpl) AddTablet(tablet *topodatapb.Tablet, name string) {
// RemoveTablet removes the tablet, and stops the health check.
// It does not block.
func (hc *HealthCheckImpl) RemoveTablet(tablet *topodatapb.Tablet) {
go hc.deleteConn(tablet)
hc.deleteConn(tablet)
}

// ReplaceTablet removes the old tablet and adds the new tablet.
func (hc *HealthCheckImpl) ReplaceTablet(old, new *topodatapb.Tablet, name string) {
go func() {
hc.deleteConn(old)
hc.AddTablet(new, name)
}()
hc.deleteConn(old)
hc.AddTablet(new, name)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice yeah this should hopefully eliminate a huge source of threading issues

}

// WaitForInitialStatsUpdates waits until all tablets added via AddTablet() call
Expand Down
50 changes: 50 additions & 0 deletions go/vt/discovery/topology_watcher_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,56 @@ func checkWatcher(t *testing.T, cellTablets, refreshKnownTablets bool) {
t.Errorf("fhc.GetAllTablets() = %+v; want %v => %+v", allTablets, key, tablet2)
}

// Both tablets restart on different hosts.
// tablet2 happens to land on the host:port that tablet 1 used to be on.
// This can only be tested when we refresh known tablets.
if refreshKnownTablets {
origTablet := *tablet
origTablet2 := *tablet2

if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error {
t.Hostname = tablet.Hostname
t.PortMap = tablet.PortMap
tablet2 = t
return nil
}); err != nil {
t.Fatalf("UpdateTabletFields failed: %v", err)
}
if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error {
t.Hostname = "host3"
tablet = t
return nil
}); err != nil {
t.Fatalf("UpdateTabletFields failed: %v", err)
}
tw.loadTablets()
counts = checkOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2})
allTablets = fhc.GetAllTablets()
key2 := TabletToMapKey(tablet2)
if _, ok := allTablets[key2]; !ok {
t.Fatalf("tablet was lost because it's reusing an address recently used by another tablet: %v", key2)
}

// Change tablets back to avoid altering later tests.
if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error {
t.Hostname = origTablet2.Hostname
t.PortMap = origTablet2.PortMap
tablet2 = t
return nil
}); err != nil {
t.Fatalf("UpdateTabletFields failed: %v", err)
}
if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error {
t.Hostname = origTablet.Hostname
tablet = t
return nil
}); err != nil {
t.Fatalf("UpdateTabletFields failed: %v", err)
}
tw.loadTablets()
counts = checkOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2})
}

// Remove the tablet and check that it is detected as being gone.
if err := ts.DeleteTablet(context.Background(), tablet.Alias); err != nil {
t.Fatalf("DeleteTablet failed: %v", err)
Expand Down