diff --git a/go/vt/discovery/fake_legacy_healthcheck.go b/go/vt/discovery/fake_legacy_healthcheck.go
deleted file mode 100644
index b6b854b5aba..00000000000
--- a/go/vt/discovery/fake_legacy_healthcheck.go
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "sort"
- "sync"
- "time"
-
- "vitess.io/vitess/go/vt/logutil"
-
- "vitess.io/vitess/go/vt/topo"
- "vitess.io/vitess/go/vt/topo/topoproto"
- "vitess.io/vitess/go/vt/vttablet/queryservice"
- "vitess.io/vitess/go/vt/vttablet/sandboxconn"
-
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-)
-
-// This file contains the definitions for a FakeLegacyHealthCheck class to
-// simulate a LegacyHealthCheck module. Note it is not in a sub-package because
-// otherwise it couldn't be used in this package's tests because of
-// circular dependencies.
-
-// NewFakeLegacyHealthCheck returns the fake healthcheck object.
-func NewFakeLegacyHealthCheck() *FakeLegacyHealthCheck {
- return &FakeLegacyHealthCheck{
- items: make(map[string]*flhcItem),
- }
-}
-
-// FakeLegacyHealthCheck implements discovery.LegacyHealthCheck.
-type FakeLegacyHealthCheck struct {
- listener LegacyHealthCheckStatsListener
-
- // mu protects the items map
- mu sync.RWMutex
- items map[string]*flhcItem
-}
-
-type flhcItem struct {
- ts *LegacyTabletStats
- conn queryservice.QueryService
-}
-
-//
-// discovery.LegacyHealthCheck interface methods
-//
-
-// RegisterStats is not implemented.
-func (fhc *FakeLegacyHealthCheck) RegisterStats() {
-}
-
-// SetListener is not implemented.
-func (fhc *FakeLegacyHealthCheck) SetListener(listener LegacyHealthCheckStatsListener, sendDownEvents bool) {
- fhc.listener = listener
-}
-
-// WaitForInitialStatsUpdates is not implemented.
-func (fhc *FakeLegacyHealthCheck) WaitForInitialStatsUpdates() {
-}
-
-// AddTablet adds the tablet and calls the listener.
-func (fhc *FakeLegacyHealthCheck) AddTablet(tablet *topodatapb.Tablet, name string) {
- key := TabletToMapKey(tablet)
- item := &flhcItem{
- ts: &LegacyTabletStats{
- Key: key,
- Tablet: tablet,
- Target: &querypb.Target{
- Keyspace: tablet.Keyspace,
- Shard: tablet.Shard,
- TabletType: tablet.Type,
- },
- Serving: true,
- Up: true,
- Name: name,
- Stats: &querypb.RealtimeStats{},
- },
- }
-
- fhc.mu.Lock()
- defer fhc.mu.Unlock()
- fhc.items[key] = item
-
- if fhc.listener != nil {
- fhc.listener.StatsUpdate(item.ts)
- }
-}
-
-// RemoveTablet removes the tablet.
-func (fhc *FakeLegacyHealthCheck) RemoveTablet(tablet *topodatapb.Tablet) {
- fhc.mu.Lock()
- defer fhc.mu.Unlock()
- key := TabletToMapKey(tablet)
- item, ok := fhc.items[key]
- if !ok {
- return
- }
- // Make sure the key still corresponds to the tablet we want to delete.
- // If it doesn't match, we should do nothing. The tablet we were asked to
- // delete is already gone, and some other tablet is using the key
- // (host:port) that the original tablet used to use, which is fine.
- if !topoproto.TabletAliasEqual(tablet.Alias, item.ts.Tablet.Alias) {
- return
- }
- delete(fhc.items, key)
-}
-
-// ReplaceTablet removes the old tablet and adds the new.
-func (fhc *FakeLegacyHealthCheck) ReplaceTablet(old, new *topodatapb.Tablet, name string) {
- fhc.RemoveTablet(old)
- fhc.AddTablet(new, name)
-}
-
-// GetConnection returns the TabletConn of the given tablet.
-func (fhc *FakeLegacyHealthCheck) GetConnection(key string) queryservice.QueryService {
- fhc.mu.RLock()
- defer fhc.mu.RUnlock()
- if item := fhc.items[key]; item != nil {
- return item.conn
- }
- return nil
-}
-
-// CacheStatus returns the status for each tablet
-func (fhc *FakeLegacyHealthCheck) CacheStatus() LegacyTabletsCacheStatusList {
- fhc.mu.Lock()
- defer fhc.mu.Unlock()
-
- stats := make(LegacyTabletsCacheStatusList, 0, len(fhc.items))
- for _, item := range fhc.items {
- stats = append(stats, &LegacyTabletsCacheStatus{
- Cell: "FakeCell",
- Target: item.ts.Target,
- TabletsStats: LegacyTabletStatsList{item.ts},
- })
- }
- sort.Sort(stats)
- return stats
-}
-
-// Close is not implemented.
-func (fhc *FakeLegacyHealthCheck) Close() error {
- return nil
-}
-
-//
-// Management methods
-//
-
-// Reset cleans up the internal state.
-func (fhc *FakeLegacyHealthCheck) Reset() {
- fhc.mu.Lock()
- defer fhc.mu.Unlock()
-
- fhc.items = make(map[string]*flhcItem)
-}
-
-// AddFakeTablet inserts a fake entry into FakeLegacyHealthCheck.
-// The Tablet can be talked to using the provided connection.
-// The Listener is called, as if AddTablet had been called.
-// For flexibility the connection is created via a connFactory callback
-func (fhc *FakeLegacyHealthCheck) AddFakeTablet(cell, host string, port int32, keyspace, shard string, tabletType topodatapb.TabletType, serving bool, reparentTS int64, err error, connFactory func(*topodatapb.Tablet) queryservice.QueryService) queryservice.QueryService {
- t := topo.NewTablet(0, cell, host)
- t.Keyspace = keyspace
- t.Shard = shard
- t.Type = tabletType
- t.PortMap["vt"] = port
- // reparentTS only has precision to seconds
- t.PrimaryTermStartTime = logutil.TimeToProto(time.Unix(reparentTS, 0))
- key := TabletToMapKey(t)
-
- fhc.mu.Lock()
- defer fhc.mu.Unlock()
- item := fhc.items[key]
- if item == nil {
- item = &flhcItem{
- ts: &LegacyTabletStats{
- Key: key,
- Tablet: t,
- Up: true,
- },
- }
- fhc.items[key] = item
- }
- item.ts.Target = &querypb.Target{
- Keyspace: keyspace,
- Shard: shard,
- TabletType: tabletType,
- }
- item.ts.Serving = serving
- item.ts.TabletExternallyReparentedTimestamp = reparentTS
- item.ts.Stats = &querypb.RealtimeStats{}
- item.ts.LastError = err
- conn := connFactory(t)
- item.conn = conn
-
- if fhc.listener != nil {
- fhc.listener.StatsUpdate(item.ts)
- }
- return conn
-}
-
-// AddTestTablet adds a fake tablet for tests using the SandboxConn and returns
-// the fake connection
-func (fhc *FakeLegacyHealthCheck) AddTestTablet(cell, host string, port int32, keyspace, shard string, tabletType topodatapb.TabletType, serving bool, reparentTS int64, err error) *sandboxconn.SandboxConn {
- conn := fhc.AddFakeTablet(cell, host, port, keyspace, shard, tabletType, serving, reparentTS, err, func(tablet *topodatapb.Tablet) queryservice.QueryService {
- return sandboxconn.NewSandboxConn(tablet)
- })
- return conn.(*sandboxconn.SandboxConn)
-}
-
-// GetAllTablets returns all the tablets we have.
-func (fhc *FakeLegacyHealthCheck) GetAllTablets() map[string]*topodatapb.Tablet {
- res := make(map[string]*topodatapb.Tablet)
- fhc.mu.RLock()
- defer fhc.mu.RUnlock()
- for key, t := range fhc.items {
- res[key] = t.ts.Tablet
- }
- return res
-}
diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go
index c724ddb2870..65bfa5aafc5 100644
--- a/go/vt/discovery/healthcheck.go
+++ b/go/vt/discovery/healthcheck.go
@@ -46,6 +46,7 @@ import (
"time"
"vitess.io/vitess/go/flagutil"
+ "vitess.io/vitess/go/netutil"
"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/proto/query"
@@ -81,12 +82,15 @@ var (
refreshKnownTablets = flag.Bool("tablet_refresh_known_tablets", true, "tablet refresh reloads the tablet address/port map from topo in case it changes")
// topoReadConcurrency tells us how many topo reads are allowed in parallel
topoReadConcurrency = flag.Int("topo_read_concurrency", 32, "concurrent topo reads")
+
+ // How long to sleep between checks.
+ waitAvailableTabletInterval = 100 * time.Millisecond
)
// See the documentation for NewHealthCheck below for an explanation of these parameters.
const (
- defaultHealthCheckRetryDelay = 5 * time.Second
- defaultHealthCheckTimeout = 1 * time.Minute
+ DefaultHealthCheckRetryDelay = 5 * time.Second
+ DefaultHealthCheckTimeout = 1 * time.Minute
// DefaultTopoReadConcurrency is used as the default value for the topoReadConcurrency parameter of a TopologyWatcher.
DefaultTopoReadConcurrency int = 5
@@ -150,22 +154,20 @@ func FilteringKeyspaces() bool {
return len(KeyspacesToWatch) > 0
}
-// TabletRecorder is a sub interface of HealthCheck.
-// It is separated out to enable unit testing.
-type TabletRecorder interface {
+type KeyspaceShardTabletType string
+type tabletAliasString string
+
+// HealthCheck declares what the TabletGateway needs from the HealthCheck
+type HealthCheck interface {
// AddTablet adds the tablet.
AddTablet(tablet *topodata.Tablet)
+
// RemoveTablet removes the tablet.
RemoveTablet(tablet *topodata.Tablet)
+
// ReplaceTablet does an AddTablet and RemoveTablet in one call, effectively replacing the old tablet with the new.
ReplaceTablet(old, new *topodata.Tablet)
-}
-type KeyspaceShardTabletType string
-type tabletAliasString string
-
-// HealthCheck declares what the TabletGateway needs from the HealthCheck
-type HealthCheck interface {
// CacheStatus returns a displayable version of the health check cache.
CacheStatus() TabletsCacheStatusList
@@ -619,17 +621,14 @@ func (hc *HealthCheckImpl) GetHealthyTabletStats(target *query.Target) []*Tablet
var result []*TabletHealth
hc.mu.Lock()
defer hc.mu.Unlock()
- if target.Shard == "" {
- target.Shard = "0"
- }
return append(result, hc.healthy[KeyFromTarget(target)]...)
}
-// getTabletStats returns all tablets for the given target.
+// GetTabletStats returns all tablets for the given target.
// The returned array is owned by the caller.
// For TabletType_PRIMARY, this will only return at most one entry,
// the most recent tablet of type primary.
-func (hc *HealthCheckImpl) getTabletStats(target *query.Target) []*TabletHealth {
+func (hc *HealthCheckImpl) GetTabletStats(target *query.Target) []*TabletHealth {
var result []*TabletHealth
hc.mu.Lock()
defer hc.mu.Unlock()
@@ -698,7 +697,7 @@ func (hc *HealthCheckImpl) waitForTablets(ctx context.Context, targets []*query.
if requireServing {
tabletHealths = hc.GetHealthyTabletStats(target)
} else {
- tabletHealths = hc.getTabletStats(target)
+ tabletHealths = hc.GetTabletStats(target)
}
if len(tabletHealths) == 0 {
allPresent = false
@@ -899,3 +898,15 @@ func (hc *HealthCheckImpl) stateChecksum() int64 {
return int64(crc32.ChecksumIEEE(buf.Bytes()))
}
+
+// TabletToMapKey creates the map key for a tablet: its hostname followed by
+// its sorted port entries, comma-separated. For discovery and related modules.
+func TabletToMapKey(tablet *topodata.Tablet) string {
+	parts := make([]string, 1, 1+len(tablet.PortMap)) // sized once: hostname + one entry per port
+	parts[0] = tablet.Hostname
+	for name, port := range tablet.PortMap {
+		parts = append(parts, netutil.JoinHostPort(name, port))
+	}
+	sort.Strings(parts[1:]) // map order is random; sort only the port entries so the hostname stays first
+	return strings.Join(parts, ",")
+}
diff --git a/go/vt/discovery/healthcheck_test.go b/go/vt/discovery/healthcheck_test.go
index d2f6d25eaf9..8606eb85bd7 100644
--- a/go/vt/discovery/healthcheck_test.go
+++ b/go/vt/discovery/healthcheck_test.go
@@ -51,13 +51,22 @@ import (
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)
-var connMap map[string]*fakeConn
-var connMapMu sync.Mutex
+var (
+ connMap map[string]*fakeConn
+ connMapMu sync.Mutex
+)
+
+func testChecksum(t *testing.T, want, got int64) {
+ t.Helper() // report failures at the caller's line, not inside this helper
+ if want != got {
+ t.Errorf("want checksum %v, got %v", want, got) // Errorf, not Fatalf: the calling test keeps running
+ }
+}
func init() {
tabletconn.RegisterDialer("fake_gateway", tabletDialer)
- //log error
+ // log error
if err := flag.Set("tablet_protocol", "fake_gateway"); err != nil {
log.Errorf("failed to set flag \"tablet_protocol\" to \"fake_gateway\":%v", err)
}
@@ -196,7 +205,7 @@ func TestHealthCheck(t *testing.T) {
}
input <- shr
result = <-resultChan
- //TODO: figure out how to compare objects that contain errors using utils.MustMatch
+ // TODO: figure out how to compare objects that contain errors using utils.MustMatch
assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result)
testChecksum(t, 1027934207, hc.stateChecksum()) // unchanged
@@ -257,7 +266,7 @@ func TestHealthCheckStreamError(t *testing.T) {
LastError: fmt.Errorf("some stream error"),
}
result = <-resultChan
- //TODO: figure out how to compare objects that contain errors using utils.MustMatch
+ // TODO: figure out how to compare objects that contain errors using utils.MustMatch
assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result)
// tablet should be removed from healthy list
a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
@@ -317,7 +326,7 @@ func TestHealthCheckErrorOnPrimary(t *testing.T) {
LastError: fmt.Errorf("some stream error"),
}
result = <-resultChan
- //TODO: figure out how to compare objects that contain errors using utils.MustMatch
+ // TODO: figure out how to compare objects that contain errors using utils.MustMatch
assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result)
// tablet should be removed from healthy list
a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
@@ -1158,7 +1167,7 @@ func TestTemplate(t *testing.T) {
}
func TestDebugURLFormatting(t *testing.T) {
- //log error
+ // log error
if err2 := flag.Set("tablet_url_template", "https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp"); err2 != nil {
log.Errorf("flag.Set(\"tablet_url_template\", \"https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp\") failed : %v", err2)
}
diff --git a/go/vt/discovery/legacy_healthcheck.go b/go/vt/discovery/legacy_healthcheck.go
deleted file mode 100644
index 3154736f60d..00000000000
--- a/go/vt/discovery/legacy_healthcheck.go
+++ /dev/null
@@ -1,974 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-// Package discovery provides a way to discover all tablets e.g. within a
-// specific shard and monitor their current health.
-// Deprecated
-// Use the LegacyHealthCheck object to query for tablets and their health.
-//
-// For an example how to use the LegacyHealthCheck object, see worker/topo_utils.go.
-//
-// Tablets have to be manually added to the LegacyHealthCheck using AddTablet().
-// Alternatively, use a Watcher implementation which will constantly watch
-// a source (e.g. the topology) and add and remove tablets as they are
-// added or removed from the source.
-// For a Watcher example have a look at NewLegacyShardReplicationWatcher().
-//
-// Each LegacyHealthCheck has a LegacyHealthCheckStatsListener that will receive
-// notification of when tablets go up and down.
-// LegacyTabletStatsCache is one implementation, that caches the known tablets
-// and the healthy ones per keyspace/shard/tabletType.
-//
-// Internally, the LegacyHealthCheck module is connected to each tablet and has a
-// streaming RPC (StreamHealth) open to receive periodic health infos.
-package discovery
-
-import (
- "bytes"
- "context"
- "encoding/json"
- "fmt"
- "hash/crc32"
- "html/template"
- "net/http"
- "sort"
- "strings"
- "sync"
- "time"
-
- "google.golang.org/protobuf/proto"
-
- "vitess.io/vitess/go/netutil"
- "vitess.io/vitess/go/stats"
- "vitess.io/vitess/go/sync2"
- "vitess.io/vitess/go/vt/grpcclient"
- "vitess.io/vitess/go/vt/log"
- "vitess.io/vitess/go/vt/servenv"
- "vitess.io/vitess/go/vt/topo/topoproto"
- "vitess.io/vitess/go/vt/topotools"
- "vitess.io/vitess/go/vt/vttablet/queryservice"
- "vitess.io/vitess/go/vt/vttablet/tabletconn"
-
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-)
-
-const (
- // LegacyHealthCheckTemplate is the HTML code to display a TabletsCacheStatusList
- LegacyHealthCheckTemplate = `
-
-
-
- | HealthCheck Tablet Cache |
-
-
- | Cell |
- Keyspace |
- Shard |
- TabletType |
- tabletStats |
-
- {{range $i, $ts := .}}
-
- | {{github_com_vitessio_vitess_vtctld_srv_cell $ts.Cell}} |
- {{github_com_vitessio_vitess_vtctld_srv_keyspace $ts.Cell $ts.Target.Keyspace}} |
- {{$ts.Target.Shard}} |
- {{$ts.Target.TabletType}} |
- {{$ts.StatusAsHTML}} |
-
- {{end}}
-
-`
-)
-
-func init() {
- // Flags are not parsed at this point and the default value of the flag (just the hostname) will be used.
- ParseTabletURLTemplateFromFlag()
-}
-
-// LegacyHealthCheckStatsListener is the listener to receive health check stats update.
-type LegacyHealthCheckStatsListener interface {
- // StatsUpdate is called when:
- // - a new tablet is known to the LegacyHealthCheck, and its first
- // streaming healthcheck is returned. (then ts.Up is true).
- // - a tablet is removed from the list of tablets we watch
- // (then ts.Up is false).
- // - a tablet dynamically changes its type. When registering the
- // listener, if sendDownEvents is true, two events are generated
- // (ts.Up false on the old type, ts.Up true on the new type).
- // If it is false, only one event is sent (ts.Up true on the new
- // type).
- StatsUpdate(*LegacyTabletStats)
-}
-
-// LegacyTabletStats is returned when getting the set of tablets.
-type LegacyTabletStats struct {
- // Key uniquely identifies that serving tablet. It is computed
- // from the Tablet's record Hostname and PortMap. If a tablet
- // is restarted on different ports, its Key will be different.
- // Key is computed using the TabletToMapKey method below.
- // key can be used in GetConnection().
- Key string
- // Tablet is the tablet object that was sent to LegacyHealthCheck.AddTablet.
- Tablet *topodatapb.Tablet
- // Name is an optional tag (e.g. alternative address) for the
- // tablet. It is supposed to represent the tablet as a task,
- // not as a process. For instance, it can be a
- // cell+keyspace+shard+tabletType+taskIndex value.
- Name string
- // Target is the current target as returned by the streaming
- // StreamHealth RPC.
- Target *querypb.Target
- // Up describes whether the tablet is added or removed.
- Up bool
- // Serving describes if the tablet can be serving traffic.
- Serving bool
- // TabletExternallyReparentedTimestamp is the last timestamp
- // that this tablet was either elected the primary, or received
- // a TabletExternallyReparented event. It is set to 0 if the
- // tablet doesn't think it's a primary.
- TabletExternallyReparentedTimestamp int64
- // Stats is the current health status, as received by the
- // StreamHealth RPC (replication lag, ...).
- Stats *querypb.RealtimeStats
- // LastError is the error we last saw when trying to get the
- // tablet's healthcheck.
- LastError error
-}
-
-// String is defined because we want to print a []*LegacyTabletStats array nicely.
-func (e *LegacyTabletStats) String() string {
- return fmt.Sprint(*e)
-}
-
-// DeepEqual compares two LegacyTabletStats. Since we include protos, we
-// need to use proto.Equal on these.
-func (e *LegacyTabletStats) DeepEqual(f *LegacyTabletStats) bool {
- return e.Key == f.Key &&
- proto.Equal(e.Tablet, f.Tablet) &&
- e.Name == f.Name &&
- proto.Equal(e.Target, f.Target) &&
- e.Up == f.Up &&
- e.Serving == f.Serving &&
- e.TabletExternallyReparentedTimestamp == f.TabletExternallyReparentedTimestamp &&
- proto.Equal(e.Stats, f.Stats) &&
- ((e.LastError == nil && f.LastError == nil) ||
- (e.LastError != nil && f.LastError != nil && e.LastError.Error() == f.LastError.Error()))
-}
-
-// Copy produces a copy of LegacyTabletStats.
-func (e *LegacyTabletStats) Copy() *LegacyTabletStats {
- ts := *e
- return &ts
-}
-
-// GetTabletHostPort formats a tablet host port address.
-func (e LegacyTabletStats) GetTabletHostPort() string {
- vtPort := e.Tablet.PortMap["vt"]
- return netutil.JoinHostPort(e.Tablet.Hostname, vtPort)
-}
-
-// GetHostNameLevel returns the specified hostname level. If the level does not exist it will pick the closest level.
-// This seems unused but can be utilized by certain url formatting templates. See getTabletDebugURL for more details.
-func (e LegacyTabletStats) GetHostNameLevel(level int) string {
- chunkedHostname := strings.Split(e.Tablet.Hostname, ".")
-
- if level < 0 {
- return chunkedHostname[0]
- } else if level >= len(chunkedHostname) {
- return chunkedHostname[len(chunkedHostname)-1]
- } else {
- return chunkedHostname[level]
- }
-}
-
-// NamedStatusURL returns the URL for the case where a tablet server is named.
-func (e LegacyTabletStats) NamedStatusURL() string {
- return "/" + topoproto.TabletAliasString(e.Tablet.Alias) + servenv.StatusURLPath()
-}
-
-// getTabletDebugURL formats a debug url to the tablet.
-// It uses a format string that can be passed into the app to format
-// the debug URL to accommodate different network setups. It applies
-// the html/template string defined to a LegacyTabletStats object. The
-// format string can refer to members and functions of LegacyTabletStats
-// like a regular html/template string.
-//
-// For instance given a tablet with hostname:port of host.dc.domain:22
-// could be configured as follows:
-// http://{{.GetTabletHostPort}} -> http://host.dc.domain:22
-// https://{{.Tablet.Hostname}} -> https://host.dc.domain
-// https://{{.GetHostNameLevel 0}}.bastion.corp -> https://host.bastion.corp
-// {{.NamedStatusURL}} -> test-0000000001/debug/status
-func (e LegacyTabletStats) getTabletDebugURL() string {
- var buffer bytes.Buffer
-
- // Error logged
- if err := tabletURLTemplate.Execute(&buffer, e); err != nil {
- log.Errorf("tabletURLTemplate.Execute(&buffer, e) failed: %v", err)
- }
- return buffer.String()
-}
-
-// TrivialStatsUpdate returns true iff the old and new LegacyTabletStats
-// haven't changed enough to warrant re-calling FilterLegacyStatsByReplicationLag.
-func (e *LegacyTabletStats) TrivialStatsUpdate(n *LegacyTabletStats) bool {
- // Skip replag filter when replag remains in the low rep lag range,
- // which should be the case majority of the time.
- lowRepLag := lowReplicationLag.Seconds()
- oldRepLag := float64(e.Stats.ReplicationLagSeconds)
- newRepLag := float64(n.Stats.ReplicationLagSeconds)
- if oldRepLag <= lowRepLag && newRepLag <= lowRepLag {
- return true
- }
-
- // Skip replag filter when replag remains in the high rep lag range,
- // and did not change beyond +/- 10%.
- // when there is a high rep lag, it takes a long time for it to reduce,
- // so it is not necessary to re-calculate every time.
- // In that case, we won't save the new record, so we still
- // remember the original replication lag.
- if oldRepLag > lowRepLag && newRepLag > lowRepLag && newRepLag < oldRepLag*1.1 && newRepLag > oldRepLag*0.9 {
- return true
- }
-
- return false
-}
-
-// LegacyTabletRecorder is the part of the LegacyHealthCheck interface that can
-// add or remove tablets. We define it as a sub-interface here so we
-// can add filters on tablets if needed.
-type LegacyTabletRecorder interface {
- // AddTablet adds the tablet.
- // Name is an alternate name, like an address.
- AddTablet(tablet *topodatapb.Tablet, name string)
-
- // RemoveTablet removes the tablet.
- RemoveTablet(tablet *topodatapb.Tablet)
-
- // ReplaceTablet does an AddTablet and RemoveTablet in one call, effectively replacing the old tablet with the new.
- ReplaceTablet(old, new *topodatapb.Tablet, name string)
-}
-
-// LegacyHealthCheck defines the interface of health checking module.
-// The goal of this object is to maintain a StreamHealth RPC
-// to a lot of tablets. Tablets are added / removed by calling the
-// AddTablet / RemoveTablet methods (other discovery module objects
-// can for instance watch the topology and call these).
-//
-// Updates to the health of all registered tablet can be watched by
-// registering a listener. To get the underlying "TabletConn" object
-// which is used for each tablet, use the "GetConnection()" method
-// below and pass in the Key string which is also sent to the
-// listener in each update (as it is part of LegacyTabletStats).
-type LegacyHealthCheck interface {
- // LegacyTabletRecorder interface adds AddTablet and RemoveTablet methods.
- // AddTablet adds the tablet, and starts health check on it.
- // RemoveTablet removes the tablet, and stops its StreamHealth RPC.
- LegacyTabletRecorder
-
- // RegisterStats registers the connection counts and checksum stats.
- // It can only be called on one Healthcheck object per process.
- RegisterStats()
- // SetListener sets the listener for healthcheck
- // updates. sendDownEvents is used when a tablet changes type
- // (from replica to primary for instance). If the listener
- // wants two events (Up=false on old type, Up=True on new
- // type), sendDownEvents should be set. Otherwise, the
- // healthcheck will only send one event (Up=true on new type).
- //
- // Note that the default implementation requires to set the
- // listener before any tablets are added to the healthcheck.
- SetListener(listener LegacyHealthCheckStatsListener, sendDownEvents bool)
- // WaitForInitialStatsUpdates waits until all tablets added via
- // AddTablet() call were propagated to the listener via correspondingdiscovert
- // StatsUpdate() calls. Note that code path from AddTablet() to
- // corresponding StatsUpdate() is asynchronous but not cancelable, thus
- // this function is also non-cancelable and can't return error. Also
- // note that all AddTablet() calls should happen before calling this
- // method. WaitForInitialStatsUpdates won't wait for StatsUpdate() calls
- // corresponding to AddTablet() calls made during its execution.
- WaitForInitialStatsUpdates()
- // GetConnection returns the TabletConn of the given tablet.
- GetConnection(key string) queryservice.QueryService
- // CacheStatus returns a displayable version of the cache.
- CacheStatus() LegacyTabletsCacheStatusList
- // Close stops the healthcheck.
- Close() error
-}
-
-// LegacyHealthCheckImpl performs health checking and notifies downstream components about any changes.
-// It contains a map of legacyTabletHealth objects, each of which stores the health information for
-// a tablet. A checkConn goroutine is spawned for each legacyTabletHealth, which is responsible for
-// keeping that legacyTabletHealth up-to-date. This is done through callbacks to updateHealth.
-// If checkConn terminates for any reason, it updates legacyTabletHealth.Up as false. If a legacyTabletHealth
-// gets removed from the map, its cancelFunc gets called, which ensures that the associated
-// checkConn goroutine eventually terminates.
-type LegacyHealthCheckImpl struct {
- // Immutable fields set at construction time.
- listener LegacyHealthCheckStatsListener
- sendDownEvents bool
- retryDelay time.Duration
- healthCheckTimeout time.Duration
- // connsWG keeps track of all launched Go routines that monitor tablet connections.
- connsWG sync.WaitGroup
-
- // mu protects all the following fields.
- mu sync.Mutex
-
- // addrToHealth maps from address to legacyTabletHealth.
- addrToHealth map[string]*legacyTabletHealth
-
- // Wait group that's used to wait until all initial StatsUpdate() calls are made after the AddTablet() calls.
- initialUpdatesWG sync.WaitGroup
-}
-
-// legacyHealthCheckConn is a structure that lives within the scope of
-// the checkConn goroutine to maintain its internal state. Therefore,
-// it does not require synchronization. Changes that are relevant to
-// healthcheck are transmitted through calls to LegacyHealthCheckImpl.updateHealth.
-// TODO(sougou): move this and associated functions to a separate file.
-type legacyHealthCheckConn struct {
- ctx context.Context
-
- conn queryservice.QueryService
- tabletStats LegacyTabletStats
- loggedServingState bool
- lastResponseTimestamp time.Time // timestamp of the last healthcheck response
-}
-
-// legacyTabletHealth maintains the health status of a tablet. A map of this
-// structure is maintained in LegacyHealthCheckImpl.
-type legacyTabletHealth struct {
- // cancelFunc must be called before discarding legacyTabletHealth.
- // This will ensure that the associated checkConn goroutine will terminate.
- cancelFunc context.CancelFunc
- // conn is the connection associated with the tablet.
- conn queryservice.QueryService
- // latestTabletStats stores the latest health stats of the tablet.
- latestTabletStats LegacyTabletStats
-}
-
-// NewLegacyDefaultHealthCheck creates a new LegacyHealthCheck object with a default configuration.
-func NewLegacyDefaultHealthCheck() LegacyHealthCheck {
- return NewLegacyHealthCheck(defaultHealthCheckRetryDelay, defaultHealthCheckTimeout)
-}
-
-// NewLegacyHealthCheck creates a new LegacyHealthCheck object.
-// Parameters:
-// retryDelay.
-// The duration to wait before retrying to connect (e.g. after a failed connection
-// attempt).
-// healthCheckTimeout.
-// The duration for which we consider a health check response to be 'fresh'. If we don't get
-// a health check response from a tablet for more than this duration, we consider the tablet
-// not healthy.
-func NewLegacyHealthCheck(retryDelay, healthCheckTimeout time.Duration) LegacyHealthCheck {
- hc := &LegacyHealthCheckImpl{
- addrToHealth: make(map[string]*legacyTabletHealth),
- retryDelay: retryDelay,
- healthCheckTimeout: healthCheckTimeout,
- }
-
- healthcheckOnce.Do(func() {
- http.Handle("/debug/gateway", hc)
- })
-
- return hc
-}
-
-// RegisterStats registers the connection counts stats
-func (hc *LegacyHealthCheckImpl) RegisterStats() {
- stats.NewGaugesFuncWithMultiLabels(
- "HealthcheckConnections",
- "the number of healthcheck connections registered",
- []string{"Keyspace", "ShardName", "TabletType"},
- hc.servingConnStats)
-
- stats.NewGaugeFunc(
- "HealthcheckChecksum",
- "crc32 checksum of the current healthcheck state",
- hc.stateChecksum)
-}
-
-// ServeHTTP is part of the http.Handler interface. It renders the current state of the discovery gateway tablet cache into json.
-func (hc *LegacyHealthCheckImpl) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
- w.Header().Set("Content-Type", "application/json; charset=utf-8")
- status := hc.cacheStatusMap()
- b, err := json.MarshalIndent(status, "", " ")
- if err != nil {
- w.Write([]byte(err.Error()))
- return
- }
-
- buf := bytes.NewBuffer(nil)
- json.HTMLEscape(buf, b)
- w.Write(buf.Bytes())
-}
-
-// servingConnStats returns the number of serving tablets per keyspace/shard/tablet type.
-func (hc *LegacyHealthCheckImpl) servingConnStats() map[string]int64 {
- res := make(map[string]int64)
- hc.mu.Lock()
- defer hc.mu.Unlock()
- for _, th := range hc.addrToHealth {
- if !th.latestTabletStats.Up || !th.latestTabletStats.Serving || th.latestTabletStats.LastError != nil {
- continue
- }
- key := fmt.Sprintf("%s.%s.%s", th.latestTabletStats.Target.Keyspace, th.latestTabletStats.Target.Shard, topoproto.TabletTypeLString(th.latestTabletStats.Target.TabletType))
- res[key]++
- }
- return res
-}
-
-// stateChecksum returns a crc32 checksum of the healthcheck state
-func (hc *LegacyHealthCheckImpl) stateChecksum() int64 {
- // CacheStatus is sorted so this should be stable across vtgates
- cacheStatus := hc.CacheStatus()
- var buf bytes.Buffer
- for _, st := range cacheStatus {
- fmt.Fprintf(&buf,
- "%v%v%v%v\n",
- st.Cell,
- st.Target.Keyspace,
- st.Target.Shard,
- st.Target.TabletType.String(),
- )
- sort.Sort(st.TabletsStats)
- for _, ts := range st.TabletsStats {
- fmt.Fprintf(&buf, "%v%v%v\n", ts.Up, ts.Serving, ts.TabletExternallyReparentedTimestamp)
- }
- }
-
- return int64(crc32.ChecksumIEEE(buf.Bytes()))
-}
-
-// updateHealth updates the legacyTabletHealth record and transmits the tablet stats
-// to the listener.
-func (hc *LegacyHealthCheckImpl) updateHealth(ts *LegacyTabletStats, conn queryservice.QueryService) {
- // Unconditionally send the received update at the end.
- defer func() {
- if hc.listener != nil {
- hc.listener.StatsUpdate(ts)
- }
- }()
-
- hc.mu.Lock()
- th, ok := hc.addrToHealth[ts.Key]
- if !ok {
- // This can happen on delete because the entry is removed first,
- // or if LegacyHealthCheckImpl has been closed.
- hc.mu.Unlock()
- return
- }
- oldts := th.latestTabletStats
- th.latestTabletStats = *ts
- th.conn = conn
- hc.mu.Unlock()
-
- // In the case where a tablet changes type (but not for the
- // initial message), we want to log it, and maybe advertise it too.
- if oldts.Target.TabletType != topodatapb.TabletType_UNKNOWN && oldts.Target.TabletType != ts.Target.TabletType {
- // Log and maybe notify
- log.Infof("HealthCheckUpdate(Type Change): %v, tablet: %s, target %+v => %+v, reparent time: %v",
- oldts.Name, topotools.TabletIdent(oldts.Tablet), topotools.TargetIdent(oldts.Target), topotools.TargetIdent(ts.Target), ts.TabletExternallyReparentedTimestamp)
- if hc.listener != nil && hc.sendDownEvents {
- oldts.Up = false
- hc.listener.StatsUpdate(&oldts)
- }
-
- // Track how often a tablet gets promoted to primary. It is used for
- // comparing against the variables in go/vtgate/buffer/variables.go.
- if oldts.Target.TabletType != topodatapb.TabletType_PRIMARY && ts.Target.TabletType == topodatapb.TabletType_PRIMARY {
- hcPrimaryPromotedCounters.Add([]string{ts.Target.Keyspace, ts.Target.Shard}, 1)
- }
- }
-}
-
-// finalizeConn closes the health checking connection and sends the final
-// notification about the tablet to downstream. To be called only on exit from
-// checkConn().
-func (hc *LegacyHealthCheckImpl) finalizeConn(hcc *legacyHealthCheckConn) {
- hcc.tabletStats.Up = false
- hcc.setServingState(false, "finalizeConn closing connection")
- // Note: checkConn() exits only when hcc.ctx.Done() is closed. Thus it's
- // safe to simply get Err() value here and assign to LastError.
- hcc.tabletStats.LastError = hcc.ctx.Err()
- hc.updateHealth(hcc.tabletStats.Copy(), nil)
- if hcc.conn != nil {
- // Don't use hcc.ctx because it's already closed.
- // Use a separate context, and add a timeout to prevent unbounded waits.
- ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer cancel()
- hcc.conn.Close(ctx)
- hcc.conn = nil
- }
-}
-
-// checkConn performs health checking on the given tablet.
-func (hc *LegacyHealthCheckImpl) checkConn(hcc *legacyHealthCheckConn, name string) {
- defer hc.connsWG.Done()
- defer hc.finalizeConn(hcc)
-
- // Initial notification for downstream about the tablet existence.
- hc.updateHealth(hcc.tabletStats.Copy(), hcc.conn)
- hc.initialUpdatesWG.Done()
-
- retryDelay := hc.retryDelay
- for {
- streamCtx, streamCancel := context.WithCancel(hcc.ctx)
-
- // Setup a watcher that restarts the timer every time an update is received.
- // If a timeout occurs for a serving tablet, we make it non-serving and send
- // a status update. The stream is also terminated so it can be retried.
- // servingStatus feeds into the serving var, which keeps track of the serving
- // status transmitted by the tablet.
- servingStatus := make(chan bool, 1)
- // timedout is accessed atomically because there could be a race
- // between the goroutine that sets it and the check for its value
- // later.
- timedout := sync2.NewAtomicBool(false)
- go func() {
- for {
- select {
- case <-servingStatus:
- continue
- case <-time.After(hc.healthCheckTimeout):
- timedout.Set(true)
- streamCancel()
- return
- case <-streamCtx.Done():
- // If the stream is done, stop watching.
- return
- }
- }
- }()
-
- // Read stream health responses.
- hcc.stream(streamCtx, hc, func(shr *querypb.StreamHealthResponse) error {
- // We received a message. Reset the back-off.
- retryDelay = hc.retryDelay
- // Don't block on send to avoid deadlocks.
- select {
- case servingStatus <- shr.Serving:
- default:
- }
- return hcc.processResponse(hc, shr)
- })
-
- // streamCancel to make sure the watcher goroutine terminates.
- streamCancel()
-
- // If there was a timeout send an error. We do this after stream has returned.
- // This will ensure that this update prevails over any previous message that
- // stream could have sent.
- if timedout.Get() {
- hcc.tabletStats.LastError = fmt.Errorf("healthcheck timed out (latest %v)", hcc.lastResponseTimestamp)
- hcc.setServingState(false, hcc.tabletStats.LastError.Error())
- hc.updateHealth(hcc.tabletStats.Copy(), hcc.conn)
- hcErrorCounters.Add([]string{hcc.tabletStats.Target.Keyspace, hcc.tabletStats.Target.Shard, topoproto.TabletTypeLString(hcc.tabletStats.Target.TabletType)}, 1)
- }
-
- // Streaming RPC failed e.g. because vttablet was restarted or took too long.
- // Sleep until the next retry is up or the context is done/canceled.
- select {
- case <-hcc.ctx.Done():
- return
- case <-time.After(retryDelay):
- // Exponentially back-off to prevent tight-loop.
- retryDelay *= 2
- // Limit the retry delay backoff to the health check timeout
- if retryDelay > hc.healthCheckTimeout {
- retryDelay = hc.healthCheckTimeout
- }
- }
- }
-}
-
-// setServingState sets the tablet state to the given value.
-//
-// If the state changes, it logs the change so that failures
-// from the health check connection are logged the first time,
-// but don't continue to log if the connection stays down.
-//
-// hcc.mu must be locked before calling this function
-func (hcc *legacyHealthCheckConn) setServingState(serving bool, reason string) {
- if !hcc.loggedServingState || (serving != hcc.tabletStats.Serving) {
- // Emit the log from a separate goroutine to avoid holding
- // the hcc lock while logging is happening
- go log.Infof("HealthCheckUpdate(Serving State): %v, tablet: %v serving => %v for %v/%v (%v) reason: %s",
- hcc.tabletStats.Name,
- topotools.TabletIdent(hcc.tabletStats.Tablet),
- serving,
- hcc.tabletStats.Tablet.GetKeyspace(),
- hcc.tabletStats.Tablet.GetShard(),
- hcc.tabletStats.Target.GetTabletType(),
- reason,
- )
- hcc.loggedServingState = true
- }
-
- hcc.tabletStats.Serving = serving
-}
-
-// stream streams healthcheck responses to callback.
-func (hcc *legacyHealthCheckConn) stream(ctx context.Context, hc *LegacyHealthCheckImpl, callback func(*querypb.StreamHealthResponse) error) {
- if hcc.conn == nil {
- conn, err := tabletconn.GetDialer()(hcc.tabletStats.Tablet, grpcclient.FailFast(true))
- if err != nil {
- hcc.tabletStats.LastError = err
- return
- }
- hcc.conn = conn
- hcc.tabletStats.LastError = nil
- }
-
- if err := hcc.conn.StreamHealth(ctx, callback); err != nil {
- log.Warningf("tablet %v healthcheck stream error: %v", hcc.tabletStats.Tablet.Alias, err)
- hcc.setServingState(false, err.Error())
- hcc.tabletStats.LastError = err
- // Send nil because we intend to close the connection.
- hc.updateHealth(hcc.tabletStats.Copy(), nil)
- hcc.conn.Close(ctx)
- hcc.conn = nil
- }
-}
-
-// processResponse reads one health check response, and notifies LegacyHealthCheckStatsListener.
-func (hcc *legacyHealthCheckConn) processResponse(hc *LegacyHealthCheckImpl, shr *querypb.StreamHealthResponse) error {
- select {
- case <-hcc.ctx.Done():
- return hcc.ctx.Err()
- default:
- }
-
- // Check for invalid data, better than panicking.
- if shr.Target == nil || shr.RealtimeStats == nil {
- return fmt.Errorf("health stats is not valid: %v", shr)
- }
-
- // an app-level error from tablet, force serving state.
- var healthErr error
- serving := shr.Serving
- if shr.RealtimeStats.HealthError != "" {
- healthErr = fmt.Errorf("vttablet error: %v", shr.RealtimeStats.HealthError)
- serving = false
- }
-
- // hcc.LegacyTabletStats.Tablet.Alias.Uid may be 0 because the youtube internal mechanism uses a different
- // code path to initialize this value. If so, we should skip this check.
- if shr.TabletAlias != nil && hcc.tabletStats.Tablet.Alias.Uid != 0 && !proto.Equal(shr.TabletAlias, hcc.tabletStats.Tablet.Alias) {
- return fmt.Errorf("health stats mismatch, tablet %+v alias does not match response alias %v", hcc.tabletStats.Tablet, shr.TabletAlias)
- }
-
- // In this case where a new tablet is initialized or a tablet type changes, we want to
- // initialize the counter so the rate can be calculated correctly.
- if hcc.tabletStats.Target.TabletType != shr.Target.TabletType {
- hcErrorCounters.Add([]string{shr.Target.Keyspace, shr.Target.Shard, topoproto.TabletTypeLString(shr.Target.TabletType)}, 0)
- }
-
- // Update our record, and notify downstream for tabletType and
- // realtimeStats change.
- hcc.lastResponseTimestamp = time.Now()
- hcc.tabletStats.Target = shr.Target
- hcc.tabletStats.TabletExternallyReparentedTimestamp = shr.TabletExternallyReparentedTimestamp
- hcc.tabletStats.Stats = shr.RealtimeStats
- hcc.tabletStats.LastError = healthErr
- reason := "healthCheck update"
- if healthErr != nil {
- reason = "healthCheck update error: " + healthErr.Error()
- }
- hcc.setServingState(serving, reason)
- hc.updateHealth(hcc.tabletStats.Copy(), hcc.conn)
- return nil
-}
-
-func (hc *LegacyHealthCheckImpl) deleteConn(tablet *topodatapb.Tablet) {
- hc.mu.Lock()
- defer hc.mu.Unlock()
-
- key := TabletToMapKey(tablet)
- th, ok := hc.addrToHealth[key]
- if !ok {
- return
- }
- // Make sure the key still corresponds to the tablet we want to delete.
- // If it doesn't match, we should do nothing. The tablet we were asked to
- // delete is already gone, and some other tablet is using the key
- // (host:port) that the original tablet used to use, which is fine.
- if !topoproto.TabletAliasEqual(tablet.Alias, th.latestTabletStats.Tablet.Alias) {
- return
- }
- hc.deleteConnLocked(key, th)
-}
-
-func (hc *LegacyHealthCheckImpl) deleteConnLocked(key string, th *legacyTabletHealth) {
- th.latestTabletStats.Up = false
- th.cancelFunc()
- delete(hc.addrToHealth, key)
-}
-
-// SetListener sets the listener for healthcheck updates.
-// It must be called after NewLegacyHealthCheck and before any tablets are added
-// (either through AddTablet or through a Watcher).
-func (hc *LegacyHealthCheckImpl) SetListener(listener LegacyHealthCheckStatsListener, sendDownEvents bool) {
- if hc.listener != nil {
- panic("must not call SetListener twice")
- }
-
- hc.mu.Lock()
- defer hc.mu.Unlock()
- if len(hc.addrToHealth) > 0 {
- panic("must not call SetListener after tablets were added")
- }
-
- hc.listener = listener
- hc.sendDownEvents = sendDownEvents
-}
-
-// AddTablet adds the tablet, and starts health check.
-// It does not block on making connection.
-// name is an optional tag for the tablet, e.g. an alternative address.
-func (hc *LegacyHealthCheckImpl) AddTablet(tablet *topodatapb.Tablet, name string) {
- ctx, cancelFunc := context.WithCancel(context.Background())
- key := TabletToMapKey(tablet)
- hcc := &legacyHealthCheckConn{
- ctx: ctx,
- tabletStats: LegacyTabletStats{
- Key: key,
- Tablet: tablet,
- Name: name,
- Target: &querypb.Target{},
- Up: true,
- },
- }
- hc.mu.Lock()
- if hc.addrToHealth == nil {
- // already closed.
- hc.mu.Unlock()
- cancelFunc()
- return
- }
- if th, ok := hc.addrToHealth[key]; ok {
- // Something already exists at this key.
- // If it's the same tablet, something is wrong.
- if topoproto.TabletAliasEqual(th.latestTabletStats.Tablet.Alias, tablet.Alias) {
- hc.mu.Unlock()
- log.Warningf("refusing to add duplicate tablet %v for %v: %+v", name, tablet.Alias.Cell, tablet)
- cancelFunc()
- return
- }
- // If it's a different tablet, then we trust this new tablet that claims
- // it has taken over the host:port that the old tablet used to be on.
- // Remove the old tablet to clear the way.
- hc.deleteConnLocked(key, th)
- }
- hc.addrToHealth[key] = &legacyTabletHealth{
- cancelFunc: cancelFunc,
- latestTabletStats: hcc.tabletStats,
- }
- hc.initialUpdatesWG.Add(1)
- hc.connsWG.Add(1)
- hc.mu.Unlock()
-
- go hc.checkConn(hcc, name)
-}
-
-// RemoveTablet removes the tablet, and stops the health check.
-// It does not block.
-func (hc *LegacyHealthCheckImpl) RemoveTablet(tablet *topodatapb.Tablet) {
- hc.deleteConn(tablet)
-}
-
-// ReplaceTablet removes the old tablet and adds the new tablet.
-func (hc *LegacyHealthCheckImpl) ReplaceTablet(old, new *topodatapb.Tablet, name string) {
- hc.deleteConn(old)
- hc.AddTablet(new, name)
-}
-
-// WaitForInitialStatsUpdates waits until all tablets added via AddTablet() call
-// were propagated to downstream via corresponding StatsUpdate() calls.
-func (hc *LegacyHealthCheckImpl) WaitForInitialStatsUpdates() {
- hc.initialUpdatesWG.Wait()
-}
-
-// GetConnection returns the TabletConn of the given tablet.
-func (hc *LegacyHealthCheckImpl) GetConnection(key string) queryservice.QueryService {
- hc.mu.Lock()
- defer hc.mu.Unlock()
-
- th := hc.addrToHealth[key]
- if th == nil {
- return nil
- }
- return th.conn
-}
-
-// LegacyTabletsCacheStatus is the current tablets for a cell/target.
-type LegacyTabletsCacheStatus struct {
- Cell string
- Target *querypb.Target
- TabletsStats LegacyTabletStatsList
-}
-
-// LegacyTabletStatsList is used for sorting.
-type LegacyTabletStatsList []*LegacyTabletStats
-
-// Len is part of sort.Interface.
-func (tsl LegacyTabletStatsList) Len() int {
- return len(tsl)
-}
-
-// Less is part of sort.Interface
-func (tsl LegacyTabletStatsList) Less(i, j int) bool {
- name1 := tsl[i].Name
- if name1 == "" {
- name1 = tsl[i].Key
- }
- name2 := tsl[j].Name
- if name2 == "" {
- name2 = tsl[j].Key
- }
- return name1 < name2
-}
-
-// Swap is part of sort.Interface
-func (tsl LegacyTabletStatsList) Swap(i, j int) {
- tsl[i], tsl[j] = tsl[j], tsl[i]
-}
-
-// StatusAsHTML returns an HTML version of the status.
-func (tcs *LegacyTabletsCacheStatus) StatusAsHTML() template.HTML {
- tLinks := make([]string, 0, 1)
- if tcs.TabletsStats != nil {
- sort.Sort(tcs.TabletsStats)
- }
- for _, ts := range tcs.TabletsStats {
- color := "green"
- extra := ""
- if ts.LastError != nil {
- color = "red"
- extra = fmt.Sprintf(" (%v)", ts.LastError)
- } else if !ts.Serving {
- color = "red"
- extra = " (Not Serving)"
- } else if !ts.Up {
- color = "red"
- extra = " (Down)"
- } else if ts.Target.TabletType == topodatapb.TabletType_PRIMARY {
- extra = fmt.Sprintf(" (PrimaryTermStartTime: %v)", ts.TabletExternallyReparentedTimestamp)
- } else {
- extra = fmt.Sprintf(" (RepLag: %v)", ts.Stats.ReplicationLagSeconds)
- }
- name := ts.Name
- if name == "" {
- name = ts.GetTabletHostPort()
- }
- tLinks = append(tLinks, fmt.Sprintf(`<a href="%v" style="color:%v">%v</a>%v`, ts.getTabletDebugURL(), color, name, extra))
- }
- return template.HTML(strings.Join(tLinks, "<br>"))
-}
-
-// LegacyTabletsCacheStatusList is used for sorting.
-type LegacyTabletsCacheStatusList []*LegacyTabletsCacheStatus
-
-// Len is part of sort.Interface.
-func (tcsl LegacyTabletsCacheStatusList) Len() int {
- return len(tcsl)
-}
-
-// Less is part of sort.Interface
-func (tcsl LegacyTabletsCacheStatusList) Less(i, j int) bool {
- return tcsl[i].Cell+"."+tcsl[i].Target.Keyspace+"."+tcsl[i].Target.Shard+"."+string(tcsl[i].Target.TabletType) <
- tcsl[j].Cell+"."+tcsl[j].Target.Keyspace+"."+tcsl[j].Target.Shard+"."+string(tcsl[j].Target.TabletType)
-}
-
-// Swap is part of sort.Interface
-func (tcsl LegacyTabletsCacheStatusList) Swap(i, j int) {
- tcsl[i], tcsl[j] = tcsl[j], tcsl[i]
-}
-
-// CacheStatus returns a displayable version of the cache.
-func (hc *LegacyHealthCheckImpl) CacheStatus() LegacyTabletsCacheStatusList {
- tcsMap := hc.cacheStatusMap()
- tcsl := make(LegacyTabletsCacheStatusList, 0, len(tcsMap))
- for _, tcs := range tcsMap {
- tcsl = append(tcsl, tcs)
- }
- sort.Sort(tcsl)
- return tcsl
-}
-
-func (hc *LegacyHealthCheckImpl) cacheStatusMap() map[string]*LegacyTabletsCacheStatus {
- tcsMap := make(map[string]*LegacyTabletsCacheStatus)
- hc.mu.Lock()
- defer hc.mu.Unlock()
- for _, th := range hc.addrToHealth {
- key := fmt.Sprintf("%v.%v.%v.%v", th.latestTabletStats.Tablet.Alias.Cell, th.latestTabletStats.Target.Keyspace, th.latestTabletStats.Target.Shard, th.latestTabletStats.Target.TabletType.String())
- var tcs *LegacyTabletsCacheStatus
- var ok bool
- if tcs, ok = tcsMap[key]; !ok {
- tcs = &LegacyTabletsCacheStatus{
- Cell: th.latestTabletStats.Tablet.Alias.Cell,
- Target: th.latestTabletStats.Target,
- }
- tcsMap[key] = tcs
- }
- tabletStats := th.latestTabletStats
- tcs.TabletsStats = append(tcs.TabletsStats, &tabletStats)
- }
- return tcsMap
-}
-
-// Close stops the healthcheck.
-// After Close() returned, it's guaranteed that the listener isn't
-// currently executing and won't be called again.
-func (hc *LegacyHealthCheckImpl) Close() error {
- hc.mu.Lock()
- for _, th := range hc.addrToHealth {
- th.cancelFunc()
- }
- hc.addrToHealth = nil
- // Release the lock early or a pending checkHealthCheckTimeout
- // cannot get a read lock on it.
- hc.mu.Unlock()
-
- // Wait for the checkHealthCheckTimeout Go routine and each Go
- // routine per tablet.
- hc.connsWG.Wait()
-
- return nil
-}
-
-// TabletToMapKey creates a key to the map from tablet's host and ports.
-// It should only be used in discovery and related module.
-func TabletToMapKey(tablet *topodatapb.Tablet) string {
- parts := make([]string, 0, 1)
- for name, port := range tablet.PortMap {
- parts = append(parts, netutil.JoinHostPort(name, port))
- }
- sort.Strings(parts)
- parts = append([]string{tablet.Hostname}, parts...)
- return strings.Join(parts, ",")
-}
diff --git a/go/vt/discovery/legacy_healthcheck_flaky_test.go b/go/vt/discovery/legacy_healthcheck_flaky_test.go
deleted file mode 100644
index 33ac51c3247..00000000000
--- a/go/vt/discovery/legacy_healthcheck_flaky_test.go
+++ /dev/null
@@ -1,615 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "bytes"
- "flag"
- "fmt"
- "html/template"
- "strings"
- "testing"
- "time"
-
- "vitess.io/vitess/go/test/utils"
-
- "context"
-
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
- "vitess.io/vitess/go/vt/status"
- "vitess.io/vitess/go/vt/topo"
-)
-
-func testChecksum(t *testing.T, want, got int64) {
- t.Helper()
- if want != got {
- t.Errorf("want checksum %v, got %v", want, got)
- }
-}
-
-func TestLegacyHealthCheck(t *testing.T) {
- hcErrorCounters.ResetAll()
- tablet := topo.NewTablet(0, "cell", "a")
- tablet.PortMap["vt"] = 1
- input := make(chan *querypb.StreamHealthResponse)
- createFakeConn(tablet, input)
- t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`)
- l := newListener()
- hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl)
- hc.SetListener(l, true)
- testChecksum(t, 0, hc.stateChecksum())
- hc.AddTablet(tablet, "")
- t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`)
-
- // Immediately after AddTablet() there will be the first notification.
- want := &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{},
- Up: true,
- Serving: false,
- }
- res := <-l.output
- utils.MustMatch(t, want, res)
- testChecksum(t, 401258919, hc.stateChecksum())
-
- // one tablet after receiving a StreamHealthResponse
- shr := &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Serving: true,
-
- TabletExternallyReparentedTimestamp: 10,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
-
- TabletExternallyReparentedTimestamp: 10,
- }
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`)
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- // Verify that the error count is initialized to 0 after the first tablet response.
- if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 0); err != nil {
- t.Errorf("%v", err)
- }
-
- tcsl := hc.CacheStatus()
- tcslWant := LegacyTabletsCacheStatusList{{
- Cell: "cell",
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- TabletsStats: LegacyTabletStatsList{{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
-
- TabletExternallyReparentedTimestamp: 10,
- }},
- }}
- utils.MustMatch(t, tcslWant, tcsl)
- testChecksum(t, 4163049392, hc.stateChecksum())
-
- // TabletType changed, should get both old and new event
- shr = &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Serving: true,
-
- TabletExternallyReparentedTimestamp: 0,
-
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
- }
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: REPLICA}, Serving: true, TabletExternallyReparentedTimestamp: 0, {ReplicationLagSeconds: 1, CpuUsage: 0.5}}`)
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Up: false,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- TabletExternallyReparentedTimestamp: 10,
- }
- res = <-l.output
- utils.MustMatch(t, want, res)
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
- TabletExternallyReparentedTimestamp: 0,
- }
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- if err := checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 0); err != nil {
- t.Errorf("%v", err)
- }
- testChecksum(t, 1906892404, hc.stateChecksum())
-
- // Serving & RealtimeStats changed
- shr = &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Serving: false,
- TabletExternallyReparentedTimestamp: 0,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
- }
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: false,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
- TabletExternallyReparentedTimestamp: 0,
- }
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: REPLICA}, TabletExternallyReparentedTimestamp: 0, {ReplicationLagSeconds: 1, CpuUsage: 0.3}}`)
- res = <-l.output
- utils.MustMatch(t, want, res)
- testChecksum(t, 1200695592, hc.stateChecksum())
-
- // HealthError
- shr = &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Serving: true,
- TabletExternallyReparentedTimestamp: 0,
- RealtimeStats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3},
- }
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: false,
- Stats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3},
- TabletExternallyReparentedTimestamp: 0,
- LastError: fmt.Errorf("vttablet error: some error"),
- }
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: REPLICA}, Serving: true, TabletExternallyReparentedTimestamp: 0, {HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}}`)
- res = <-l.output
- utils.MustMatch(t, want, res)
- testChecksum(t, 1200695592, hc.stateChecksum()) // unchanged
-
- // remove tablet
- hc.deleteConn(tablet)
- t.Logf(`hc.RemoveTablet({Host: "a", PortMap: {"vt": 1}})`)
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: false,
- Serving: false,
- Stats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3},
- TabletExternallyReparentedTimestamp: 0,
- LastError: context.Canceled,
- }
- res = <-l.output
- utils.MustMatch(t, want, res)
- testChecksum(t, 0, hc.stateChecksum())
-
- // close healthcheck
- hc.Close()
-}
-
-func TestLegacyHealthCheckStreamError(t *testing.T) {
- tablet := topo.NewTablet(0, "cell", "a")
- tablet.PortMap["vt"] = 1
- input := make(chan *querypb.StreamHealthResponse)
- fc := createFakeConn(tablet, input)
- fc.errCh = make(chan error)
- t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`)
- l := newListener()
- hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl)
- hc.SetListener(l, true)
- hc.AddTablet(tablet, "")
- t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`)
-
- // Immediately after AddTablet() there will be the first notification.
- want := &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{},
- Up: true,
- Serving: false,
- }
- res := <-l.output
- utils.MustMatch(t, want, res)
-
- // one tablet after receiving a StreamHealthResponse
- shr := &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Serving: true,
- TabletExternallyReparentedTimestamp: 0,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- TabletExternallyReparentedTimestamp: 0,
- }
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`)
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- // Stream error
- fc.errCh <- fmt.Errorf("some stream error")
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: false,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- TabletExternallyReparentedTimestamp: 0,
- LastError: fmt.Errorf("some stream error"),
- }
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- // close healthcheck
- hc.Close()
-}
-
-func TestLegacyHealthCheckVerifiesTabletAlias(t *testing.T) {
- t.Logf("starting")
- tablet := topo.NewTablet(1, "cell", "a")
- tablet.PortMap["vt"] = 1
- input := make(chan *querypb.StreamHealthResponse, 1)
- fc := createFakeConn(tablet, input)
-
- t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`)
-
- l := newListener()
- hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl)
- hc.SetListener(l, false)
- hc.AddTablet(tablet, "")
- t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`)
-
- // Immediately after AddTablet() there will be the first notification.
- want := &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{},
- Up: true,
- Serving: false,
- }
- res := <-l.output
- utils.MustMatch(t, want, res)
-
- input <- &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- TabletAlias: &topodatapb.TabletAlias{Uid: 20, Cell: "cellb"},
- Serving: true,
- TabletExternallyReparentedTimestamp: 10,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
-
- select {
- case err := <-fc.cbErrCh:
- t.Logf("<-fc.cbErrCh: %v", err)
- if prefix := "health stats mismatch"; !strings.HasPrefix(err.Error(), prefix) {
- t.Fatalf("wrong error, got %v; want prefix %v", err, prefix)
- }
- case <-l.output:
- t.Fatalf("StreamHealth should have returned a health stats mismatch error")
- }
-
- input <- &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- TabletAlias: &topodatapb.TabletAlias{Uid: 1, Cell: "cell"},
- Serving: true,
- TabletExternallyReparentedTimestamp: 10,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
-
- select {
- case err := <-fc.cbErrCh:
- t.Fatalf("wanted listener output, got error: %v", err)
- case res := <-l.output:
- t.Logf("<-l.output: %+v", res)
- }
-
- // close healthcheck
- hc.Close()
-}
-
-// TestLegacyHealthCheckCloseWaitsForGoRoutines tests that Close() waits for all Go
-// routines to finish and the listener won't be called anymore.
-func TestLegacyHealthCheckCloseWaitsForGoRoutines(t *testing.T) {
- tablet := topo.NewTablet(0, "cell", "a")
- tablet.PortMap["vt"] = 1
- input := make(chan *querypb.StreamHealthResponse, 1)
- createFakeConn(tablet, input)
-
- t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`)
-
- l := newListener()
- hc := NewLegacyHealthCheck(1*time.Millisecond, time.Hour).(*LegacyHealthCheckImpl)
- hc.SetListener(l, false)
- hc.AddTablet(tablet, "")
- t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`)
-
- // Immediately after AddTablet() there will be the first notification.
- want := &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{},
- Up: true,
- Serving: false,
- }
- res := <-l.output
- utils.MustMatch(t, want, res)
-
- // Verify that the listener works in general.
- shr := &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Serving: true,
- TabletExternallyReparentedTimestamp: 10,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- TabletExternallyReparentedTimestamp: 10,
- }
- input <- shr
- t.Logf(`input <- %v`, shr)
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- // Change input to distinguish between stats sent before and after Close().
- shr.TabletExternallyReparentedTimestamp = 11
- // Close the healthcheck. Tablet connections are closed asynchronously and
- // Close() will block until all Go routines (one per connection) are done.
- hc.Close()
-
- // Try to send more updates. They should be ignored and the listener should
- // not be called from any Go routine anymore.
- // Note that this code is racy by nature. If there is a regression, it should
- // fail in some cases.
- input <- shr
- t.Logf(`input <- %v`, shr)
-
- // After Close() we'll receive one or two notifications with Serving == false.
- res = <-l.output
- if res.Serving {
- t.Errorf(`Received one more notification with Serving == true: %+v`, res)
- }
-
- select {
- case res = <-l.output:
- if res.TabletExternallyReparentedTimestamp == 10 && res.LastError == context.Canceled {
- // LegacyHealthCheck repeats the previous stats if there is an error.
- // This is expected.
- break
- }
- t.Fatalf("healthCheck still running after Close(): listener received: %v but should not have been called", res)
- case <-time.After(1 * time.Millisecond):
- // No response after timeout. Close probably closed all Go routines
- // properly and won't use the listener anymore.
- }
-
- // The last notification should have Up = false.
- if res.Up || res.Serving {
- t.Errorf(`Last notification doesn't have Up == false and Serving == false: %+v`, res)
- }
-
- // Check if there are more updates than the one emitted during Close().
- select {
- case res := <-l.output:
- t.Fatalf("healthCheck still running after Close(): listener received: %v but should not have been called", res)
- case <-time.After(1 * time.Millisecond):
- // No response after timeout. Listener probably not called again. Success.
- }
-}
-
-func TestLegacyHealthCheckTimeout(t *testing.T) {
- timeout := 500 * time.Millisecond
- tablet := topo.NewTablet(0, "cell", "a")
- tablet.PortMap["vt"] = 1
- input := make(chan *querypb.StreamHealthResponse)
- fc := createFakeConn(tablet, input)
- t.Logf(`createFakeConn({Host: "a", PortMap: {"vt": 1}}, c)`)
- l := newListener()
- hc := NewLegacyHealthCheck(1*time.Millisecond, timeout).(*LegacyHealthCheckImpl)
- hc.SetListener(l, false)
- hc.AddTablet(tablet, "")
- t.Logf(`hc = LegacyHealthCheck(); hc.AddTablet({Host: "a", PortMap: {"vt": 1}}, "")`)
-
- // Immediately after AddTablet() there will be the first notification.
- want := &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{},
- Up: true,
- Serving: false,
- }
- res := <-l.output
- utils.MustMatch(t, want, res)
-
- // one tablet after receiving a StreamHealthResponse
- shr := &querypb.StreamHealthResponse{
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Serving: true,
- TabletExternallyReparentedTimestamp: 10,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
- want = &LegacyTabletStats{
- Key: "a,vt:1",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- TabletExternallyReparentedTimestamp: 10,
- }
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`)
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 0); err != nil {
- t.Errorf("%v", err)
- }
-
- // wait for timeout period
- time.Sleep(2 * timeout)
- t.Logf(`Sleep(2 * timeout)`)
- res = <-l.output
- if res.Serving {
- t.Errorf(`<-l.output: %+v; want not serving`, res)
- }
-
- if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 1); err != nil {
- t.Errorf("%v", err)
- }
-
- if !fc.isCanceled() {
- t.Errorf("StreamHealth should be canceled after timeout, but is not")
- }
-
- // repeat the wait. It will timeout one more time trying to get the connection.
- fc.resetCanceledFlag()
- time.Sleep(timeout)
- t.Logf(`Sleep(2 * timeout)`)
-
- res = <-l.output
- if res.Serving {
- t.Errorf(`<-l.output: %+v; want not serving`, res)
- }
-
- if err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 2); err != nil {
- t.Errorf("%v", err)
- }
-
- if !fc.isCanceled() {
- t.Errorf("StreamHealth should be canceled again after timeout")
- }
-
- // send a healthcheck response, it should be serving again
- fc.resetCanceledFlag()
- input <- shr
- t.Logf(`input <- {{Keyspace: "k", Shard: "s", TabletType: PRIMARY}, Serving: true, TabletExternallyReparentedTimestamp: 10, {ReplicationLagSeconds: 1, CpuUsage: 0.2}}`)
-
- // wait for the exponential backoff to wear off and health monitoring to resume.
- time.Sleep(timeout)
- res = <-l.output
- utils.MustMatch(t, want, res)
-
- // close healthcheck
- hc.Close()
-}
-
-func TestLegacyTemplate(t *testing.T) {
- tablet := topo.NewTablet(0, "cell", "a")
- ts := []*LegacyTabletStats{
- {
- Key: "a",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: false,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
- TabletExternallyReparentedTimestamp: 0,
- },
- }
- tcs := &LegacyTabletsCacheStatus{
- Cell: "cell",
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- TabletsStats: ts,
- }
- templ := template.New("").Funcs(status.StatusFuncs)
- templ, err := templ.Parse(LegacyHealthCheckTemplate)
- if err != nil {
- t.Fatalf("error parsing template: %v", err)
- }
- wr := &bytes.Buffer{}
- if err := templ.Execute(wr, []*LegacyTabletsCacheStatus{tcs}); err != nil {
- t.Fatalf("error executing template: %v", err)
- }
-}
-
-func TestLegacyDebugURLFormatting(t *testing.T) {
- flag.Set("tablet_url_template", "https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp")
- ParseTabletURLTemplateFromFlag()
-
- tablet := topo.NewTablet(0, "cell", "host.dc.domain")
- ts := []*LegacyTabletStats{
- {
- Key: "a",
- Tablet: tablet,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: false,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
- TabletExternallyReparentedTimestamp: 0,
- },
- }
- tcs := &LegacyTabletsCacheStatus{
- Cell: "cell",
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- TabletsStats: ts,
- }
- templ := template.New("").Funcs(status.StatusFuncs)
- templ, err := templ.Parse(LegacyHealthCheckTemplate)
- if err != nil {
- t.Fatalf("error parsing template: %v", err)
- }
- wr := &bytes.Buffer{}
- if err := templ.Execute(wr, []*LegacyTabletsCacheStatus{tcs}); err != nil {
- t.Fatalf("error executing template: %v", err)
- }
- expectedURL := `"https://host.bastion.cell.corp"`
- if !strings.Contains(wr.String(), expectedURL) {
- t.Fatalf("output missing formatted URL, expectedURL: %s , output: %s", expectedURL, wr.String())
- }
-}
-
-type listener struct {
- output chan *LegacyTabletStats
-}
-
-func newListener() *listener {
- return &listener{output: make(chan *LegacyTabletStats, 2)}
-}
-
-func (l *listener) StatsUpdate(ts *LegacyTabletStats) {
- l.output <- ts
-}
diff --git a/go/vt/discovery/legacy_replicationlag.go b/go/vt/discovery/legacy_replicationlag.go
deleted file mode 100644
index 31dd5eaa4e8..00000000000
--- a/go/vt/discovery/legacy_replicationlag.go
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "fmt"
- "sort"
-)
-
-// LegacyIsReplicationLagHigh verifies that the given LegacyTabletStats refers to a tablet with high
-// replication lag, i.e. higher than the configured discovery_low_replication_lag flag.
-func LegacyIsReplicationLagHigh(tabletStats *LegacyTabletStats) bool {
- return float64(tabletStats.Stats.ReplicationLagSeconds) > lowReplicationLag.Seconds()
-}
-
-// LegacyIsReplicationLagVeryHigh verifies that the given LegacyTabletStats refers to a tablet with very high
-// replication lag, i.e. higher than the configured discovery_high_replication_lag_minimum_serving flag.
-func LegacyIsReplicationLagVeryHigh(tabletStats *LegacyTabletStats) bool {
- return float64(tabletStats.Stats.ReplicationLagSeconds) > highReplicationLagMinServing.Seconds()
-}
-
-// FilterLegacyStatsByReplicationLag filters the list of LegacyTabletStats by LegacyTabletStats.Stats.ReplicationLagSeconds.
-// Note that LegacyTabletStats that is non-serving or has error is ignored.
-//
-// The simplified logic:
-// - Return tablets that have lag <= lowReplicationLag.
-// - Make sure we return at least minNumTablets tablets, if there are enough one with lag <= highReplicationLagMinServing.
-// For example, with the default of 30s / 2h / 2, this means:
-// - lags of (5s, 10s, 15s, 120s) return the first three
-// - lags of (30m, 35m, 40m, 45m) return the first two
-// - lags of (2h, 3h, 4h, 5h) return the first one
-//
-// The legacy algorithm (default for now):
-// - Return the list if there is 0 or 1 tablet.
-// - Return the list if all tablets have <=30s lag.
-// - Filter by replication lag: for each tablet, if the mean value without it is more than 0.7 of the mean value across all tablets, it is valid.
-// - Make sure we return at least minNumTablets tablets (if there are enough one with only low replication lag).
-// - If one tablet is removed, run above steps again in case there are two tablets with high replication lag. (It should cover most cases.)
-// For example, lags of (5s, 10s, 15s, 120s) return the first three;
-// lags of (30m, 35m, 40m, 45m) return all.
-//
-// One thing to know about this code: vttablet also has a couple flags that impact the logic here:
-// * unhealthy_threshold: if replication lag is higher than this, a tablet will be reported as unhealthy.
-// The default for this is 2h, same as the discovery_high_replication_lag_minimum_serving here.
-// * degraded_threshold: this is only used by vttablet for display. It should match
-// discovery_low_replication_lag here, so the vttablet status display matches what vtgate will do of it.
-func FilterLegacyStatsByReplicationLag(tabletStatsList []*LegacyTabletStats) []*LegacyTabletStats {
- if !*legacyReplicationLagAlgorithm {
- return filterLegacyStatsByLag(tabletStatsList)
- }
-
- res := filterLegacyStatsByLagWithLegacyAlgorithm(tabletStatsList)
- // run the filter again if exactly one tablet is removed,
- // and we have spare tablets.
- if len(res) > *minNumTablets && len(res) == len(tabletStatsList)-1 {
- res = filterLegacyStatsByLagWithLegacyAlgorithm(res)
- }
- return res
-}
-
-func filterLegacyStatsByLag(tabletStatsList []*LegacyTabletStats) []*LegacyTabletStats {
- list := make([]legacyTabletLagSnapshot, 0, len(tabletStatsList))
- // filter non-serving tablets and those with very high replication lag
- for _, ts := range tabletStatsList {
- if !ts.Serving || ts.LastError != nil || ts.Stats == nil || LegacyIsReplicationLagVeryHigh(ts) {
- continue
- }
- // Pull the current replication lag for a stable sort later.
- list = append(list, legacyTabletLagSnapshot{
- ts: ts,
- replag: ts.Stats.ReplicationLagSeconds})
- }
-
- // Sort by replication lag.
- sort.Sort(byLegacyReplag(list))
-
- // Pick those with low replication lag, but at least minNumTablets tablets regardless.
- res := make([]*LegacyTabletStats, 0, len(list))
- for i := 0; i < len(list); i++ {
- if !LegacyIsReplicationLagHigh(list[i].ts) || i < *minNumTablets {
- res = append(res, list[i].ts)
- }
- }
- return res
-}
-
-func filterLegacyStatsByLagWithLegacyAlgorithm(tabletStatsList []*LegacyTabletStats) []*LegacyTabletStats {
- list := make([]*LegacyTabletStats, 0, len(tabletStatsList))
- // filter non-serving tablets
- for _, ts := range tabletStatsList {
- if !ts.Serving || ts.LastError != nil || ts.Stats == nil {
- continue
- }
- list = append(list, ts)
- }
- if len(list) <= 1 {
- return list
- }
- // if all have low replication lag (<=30s), return all tablets.
- allLowLag := true
- for _, ts := range list {
- if LegacyIsReplicationLagHigh(ts) {
- allLowLag = false
- break
- }
- }
- if allLowLag {
- return list
- }
- // filter those affecting "mean" lag significantly
- // calculate mean for all tablets
- res := make([]*LegacyTabletStats, 0, len(list))
- m, _ := legacyMean(list, -1)
- for i, ts := range list {
- // calculate mean by excluding ith tablet
- mi, _ := legacyMean(list, i)
- if float64(mi) > float64(m)*0.7 {
- res = append(res, ts)
- }
- }
- if len(res) >= *minNumTablets {
- return res
- }
- // return at least minNumTablets tablets to avoid over loading,
- // if there is enough tablets with replication lag < highReplicationLagMinServing.
- // Pull the current replication lag for a stable sort.
- snapshots := make([]legacyTabletLagSnapshot, 0, len(list))
- for _, ts := range list {
- if !LegacyIsReplicationLagVeryHigh(ts) {
- snapshots = append(snapshots, legacyTabletLagSnapshot{
- ts: ts,
- replag: ts.Stats.ReplicationLagSeconds})
- }
- }
- if len(snapshots) == 0 {
- // We get here if all tablets are over the high
- // replication lag threshold, and their lag is
- // different enough that the 70% mean computation up
- // there didn't find them all in a group. For
- // instance, if *minNumTablets = 2, and we have two
- // tablets with lag of 3h and 30h. In that case, we
- // just use them all.
- for _, ts := range list {
- snapshots = append(snapshots, legacyTabletLagSnapshot{
- ts: ts,
- replag: ts.Stats.ReplicationLagSeconds})
- }
- }
-
- // Sort by replication lag.
- sort.Sort(byLegacyReplag(snapshots))
-
- // Pick the first minNumTablets tablets.
- res = make([]*LegacyTabletStats, 0, *minNumTablets)
- for i := 0; i < min(*minNumTablets, len(snapshots)); i++ {
- res = append(res, snapshots[i].ts)
- }
- return res
-}
-
-type legacyTabletLagSnapshot struct {
- ts *LegacyTabletStats
- replag uint32
-}
-type byLegacyReplag []legacyTabletLagSnapshot
-
-func (a byLegacyReplag) Len() int { return len(a) }
-func (a byLegacyReplag) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
-func (a byLegacyReplag) Less(i, j int) bool { return a[i].replag < a[j].replag }
-
-// mean calculates the mean value over the given list,
-// while excluding the item with the specified index.
-func legacyMean(tabletStatsList []*LegacyTabletStats, idxExclude int) (uint64, error) {
- var sum uint64
- var count uint64
- for i, ts := range tabletStatsList {
- if i == idxExclude {
- continue
- }
- sum = sum + uint64(ts.Stats.ReplicationLagSeconds)
- count++
- }
- if count == 0 {
- return 0, fmt.Errorf("empty list")
- }
- return sum / count, nil
-}
diff --git a/go/vt/discovery/legacy_replicationlag_test.go b/go/vt/discovery/legacy_replicationlag_test.go
deleted file mode 100644
index 0033b0370ee..00000000000
--- a/go/vt/discovery/legacy_replicationlag_test.go
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "fmt"
- "testing"
-
- querypb "vitess.io/vitess/go/vt/proto/query"
- "vitess.io/vitess/go/vt/topo"
-)
-
-// testSetLegacyReplicationLagAlgorithm is a test helper function, if this is used by a production code path, something is wrong.
-func testSetLegacyReplicationLagAlgorithm(newLegacy bool) {
- *legacyReplicationLagAlgorithm = newLegacy
-}
-
-func TestFilterLegacyStatsByReplicationLagUnhealthy(t *testing.T) {
- // 1 healthy serving tablet, 1 not healhty
- ts1 := &LegacyTabletStats{
- Tablet: topo.NewTablet(1, "cell", "host1"),
- Serving: true,
- Stats: &querypb.RealtimeStats{},
- }
- ts2 := &LegacyTabletStats{
- Tablet: topo.NewTablet(2, "cell", "host2"),
- Serving: false,
- Stats: &querypb.RealtimeStats{},
- }
- got := FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2})
- if len(got) != 1 {
- t.Errorf("len(FilterLegacyStatsByReplicationLag([{Tablet: {Uid: 1}, Serving: true}, {Tablet: {Uid: 2}, Serving: false}])) = %v, want 1", len(got))
- }
- if len(got) > 0 && !got[0].DeepEqual(ts1) {
- t.Errorf("FilterLegacyStatsByReplicationLag([{Tablet: {Uid: 1}, Serving: true}, {Tablet: {Uid: 2}, Serving: false}]) = %+v, want %+v", got[0], ts1)
- }
-}
-
-func TestFilterLegacyStatsByReplicationLag(t *testing.T) {
- // Use simplified logic
- testSetLegacyReplicationLagAlgorithm(false)
-
- cases := []struct {
- description string
- input []uint32
- output []uint32
- }{
- {
- "0 tablet",
- []uint32{},
- []uint32{},
- },
- {
- "lags of (1s) - return all items with low lag.",
- []uint32{1},
- []uint32{1},
- },
- {
- "lags of (1s, 1s, 1s, 30s) - return all items with low lag.",
- []uint32{1, 1, 1, 30},
- []uint32{1, 1, 1, 30},
- },
- {
- "lags of (1s, 1s, 1s, 40m, 40m, 40m) - return all items with low lag.",
- []uint32{1, 1, 1, 40 * 60, 40 * 60, 40 * 60},
- []uint32{1, 1, 1},
- },
- {
- "lags of (1s, 40m, 40m, 40m) - return at least 2 items if they don't have very high lag.",
- []uint32{1, 40 * 60, 40 * 60, 40 * 60},
- []uint32{1, 40 * 60},
- },
- {
- "lags of (30m, 35m, 40m, 45m) - return at least 2 items if they don't have very high lag.",
- []uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60},
- []uint32{30 * 60, 35 * 60},
- },
- {
- "lags of (2h, 3h, 4h, 5h) - return <2 items if the others have very high lag.",
- []uint32{2 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60, 5 * 60 * 60},
- []uint32{2 * 60 * 60},
- },
- {
- "lags of (3h, 30h) - return nothing if all have very high lag.",
- []uint32{3 * 60 * 60, 30 * 60 * 60},
- []uint32{},
- },
- }
-
- for _, tc := range cases {
- lts := make([]*LegacyTabletStats, len(tc.input))
- for i, lag := range tc.input {
- lts[i] = &LegacyTabletStats{
- Tablet: topo.NewTablet(uint32(i+1), "cell", fmt.Sprintf("host-%vs-behind", lag)),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: lag},
- }
- }
- got := FilterLegacyStatsByReplicationLag(lts)
- if len(got) != len(tc.output) {
- t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected: %v", tc.description, got, tc.output)
- continue
- }
- for i, elag := range tc.output {
- if got[i].Stats.ReplicationLagSeconds != elag {
- t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected value index %v to be %v", tc.description, got, i, elag)
- }
- }
- }
-
- // Reset to the default
- testSetLegacyReplicationLagAlgorithm(true)
-}
-
-func TestFilterLegacyStatysByReplicationLagWithLegacyAlgorithm(t *testing.T) {
- // Use legacy algorithm by default for now
-
- cases := []struct {
- description string
- input []uint32
- output []uint32
- }{
- {
- "0 tablet",
- []uint32{},
- []uint32{},
- },
- {
- "1 serving tablet",
- []uint32{1},
- []uint32{1},
- },
- {
- "lags of (1s, 1s, 1s, 30s)",
- []uint32{1, 1, 1, 30},
- []uint32{1, 1, 1, 30},
- },
- {
- "lags of (30m, 35m, 40m, 45m)",
- []uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60},
- []uint32{30 * 60, 35 * 60, 40 * 60, 45 * 60},
- },
- {
- "lags of (1s, 1s, 1m, 40m, 40m) - not run filter the second time as first run removed two items.",
- []uint32{1, 1, 60, 40 * 60, 40 * 60},
- []uint32{1, 1, 60},
- },
- {
- "lags of (1s, 1s, 10m, 40m) - run filter twice to remove two items",
- []uint32{1, 1, 10 * 60, 40 * 60},
- []uint32{1, 1},
- },
- {
- "lags of (1m, 100m) - return at least 2 items to avoid overloading if the 2nd one is not delayed too much.",
- []uint32{1 * 60, 100 * 60},
- []uint32{1 * 60, 100 * 60},
- },
- {
- "lags of (1m, 3h) - return 1 if the 2nd one is delayed too much.",
- []uint32{1 * 60, 3 * 60 * 60},
- []uint32{1 * 60},
- },
- {
- "lags of (3h) - return 1 as they're all delayed too much.",
- []uint32{3 * 60 * 60},
- []uint32{3 * 60 * 60},
- },
- {
- "lags of (3h, 4h) - return 2 as they're all delayed too much, but still in a good group.",
- []uint32{3 * 60 * 60, 4 * 60 * 60},
- []uint32{3 * 60 * 60, 4 * 60 * 60},
- },
- {
- "lags of (3h, 3h, 4h) - return 3 as they're all delayed too much, but still in a good group.",
- []uint32{3 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60},
- []uint32{3 * 60 * 60, 3 * 60 * 60, 4 * 60 * 60},
- },
- {
- "lags of (3h, 15h, 18h) - return 3 as they're all delayed too much, but still in a good group." +
- "(different test case than above to show how absurb the good group logic is)",
- []uint32{3 * 60 * 60, 15 * 60 * 60, 18 * 60 * 60},
- []uint32{3 * 60 * 60, 15 * 60 * 60, 18 * 60 * 60},
- },
- {
- "lags of (3h, 12h, 18h) - return 2 as they're all delayed too much, but 18h is now considered an outlier." +
- "(different test case than above to show how absurb the good group logic is)",
- []uint32{3 * 60 * 60, 12 * 60 * 60, 18 * 60 * 60},
- []uint32{3 * 60 * 60, 12 * 60 * 60},
- },
- {
- "lags of (3h, 30h) - return 2 as they're all delayed too much." +
- "(different test case that before, as both tablet stats are" +
- "widely different, not within 70% of eachother)",
- []uint32{3 * 60 * 60, 30 * 60 * 60},
- []uint32{3 * 60 * 60, 30 * 60 * 60},
- },
- }
-
- for _, tc := range cases {
- lts := make([]*LegacyTabletStats, len(tc.input))
- for i, lag := range tc.input {
- lts[i] = &LegacyTabletStats{
- Tablet: topo.NewTablet(uint32(i+1), "cell", fmt.Sprintf("host-%vs-behind", lag)),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: lag},
- }
- }
- got := FilterLegacyStatsByReplicationLag(lts)
- if len(got) != len(tc.output) {
- t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected: %v", tc.description, got, tc.output)
- continue
- }
- for i, elag := range tc.output {
- if got[i].Stats.ReplicationLagSeconds != elag {
- t.Errorf("FilterLegacyStatsByReplicationLag(%v) failed: got output:\n%v\nExpected value index %v to be %v", tc.description, got, i, elag)
- }
- }
- }
-}
-
-func TestFilterLegacyStatsByReplicationLagThreeTabletMin(t *testing.T) {
- // Use at least 3 tablets if possible
- testSetMinNumTablets(3)
- // lags of (1s, 1s, 10m, 11m) - returns at least32 items where the slightly delayed ones that are returned are the 10m and 11m ones.
- ts1 := &LegacyTabletStats{
- Tablet: topo.NewTablet(1, "cell", "host1"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1},
- }
- ts2 := &LegacyTabletStats{
- Tablet: topo.NewTablet(2, "cell", "host2"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1},
- }
- ts3 := &LegacyTabletStats{
- Tablet: topo.NewTablet(3, "cell", "host3"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10 * 60},
- }
- ts4 := &LegacyTabletStats{
- Tablet: topo.NewTablet(4, "cell", "host4"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 11 * 60},
- }
- got := FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2, ts3, ts4})
- if len(got) != 3 || !got[0].DeepEqual(ts1) || !got[1].DeepEqual(ts2) || !got[2].DeepEqual(ts3) {
- t.Errorf("FilterLegacyStatsByReplicationLag([1s, 1s, 10m, 11m]) = %+v, want [1s, 1s, 10m]", got)
- }
- // lags of (11m, 10m, 1s, 1s) - reordered tablets returns the same 3 items where the slightly delayed one that is returned is the 10m and 11m ones.
- ts1 = &LegacyTabletStats{
- Tablet: topo.NewTablet(1, "cell", "host1"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 11 * 60},
- }
- ts2 = &LegacyTabletStats{
- Tablet: topo.NewTablet(2, "cell", "host2"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10 * 60},
- }
- ts3 = &LegacyTabletStats{
- Tablet: topo.NewTablet(3, "cell", "host3"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1},
- }
- ts4 = &LegacyTabletStats{
- Tablet: topo.NewTablet(4, "cell", "host4"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1},
- }
- got = FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2, ts3, ts4})
- if len(got) != 3 || !got[0].DeepEqual(ts3) || !got[1].DeepEqual(ts4) || !got[2].DeepEqual(ts2) {
- t.Errorf("FilterLegacyStatsByReplicationLag([1s, 1s, 10m, 11m]) = %+v, want [1s, 1s, 10m]", got)
- }
- // Reset to the default
- testSetMinNumTablets(2)
-}
-
-func TestFilterByReplicationLagOneTabletMin(t *testing.T) {
- // Use at least 1 tablets if possible
- testSetMinNumTablets(1)
- // lags of (1s, 100m) - return only healthy tablet if that is all that is available.
- ts1 := &LegacyTabletStats{
- Tablet: topo.NewTablet(1, "cell", "host1"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1},
- }
- ts2 := &LegacyTabletStats{
- Tablet: topo.NewTablet(2, "cell", "host2"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 100 * 60},
- }
- got := FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2})
- if len(got) != 1 || !got[0].DeepEqual(ts1) {
- t.Errorf("FilterLegacyStatsByReplicationLag([1s, 100m]) = %+v, want [1s]", got)
- }
- // lags of (1m, 100m) - return only healthy tablet if that is all that is healthy enough.
- ts1 = &LegacyTabletStats{
- Tablet: topo.NewTablet(1, "cell", "host1"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1 * 60},
- }
- ts2 = &LegacyTabletStats{
- Tablet: topo.NewTablet(2, "cell", "host2"),
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 100 * 60},
- }
- got = FilterLegacyStatsByReplicationLag([]*LegacyTabletStats{ts1, ts2})
- if len(got) != 1 || !got[0].DeepEqual(ts1) {
- t.Errorf("FilterLegacyStatsByReplicationLag([1m, 100m]) = %+v, want [1m]", got)
- }
- // Reset to the default
- testSetMinNumTablets(2)
-}
-
-func TestTrivialLegacyStatsUpdate(t *testing.T) {
- // Note the healthy threshold is set to 30s.
- cases := []struct {
- o uint32
- n uint32
- expected bool
- }{
- // both are under 30s
- {o: 0, n: 1, expected: true},
- {o: 15, n: 20, expected: true},
-
- // one is under 30s, the other isn't
- {o: 2, n: 40, expected: false},
- {o: 40, n: 10, expected: false},
-
- // both are over 30s, but close enough
- {o: 100, n: 100, expected: true},
- {o: 100, n: 105, expected: true},
- {o: 105, n: 100, expected: true},
-
- // both are over 30s, but too far
- {o: 100, n: 120, expected: false},
- {o: 120, n: 100, expected: false},
- }
-
- for _, c := range cases {
- o := &LegacyTabletStats{
- Stats: &querypb.RealtimeStats{
- ReplicationLagSeconds: c.o,
- },
- }
- n := &LegacyTabletStats{
- Stats: &querypb.RealtimeStats{
- ReplicationLagSeconds: c.n,
- },
- }
- got := o.TrivialStatsUpdate(n)
- if got != c.expected {
- t.Errorf("TrivialStatsUpdate(%v, %v) = %v, expected %v", c.o, c.n, got, c.expected)
- }
- }
-}
diff --git a/go/vt/discovery/legacy_tablet_stats_cache.go b/go/vt/discovery/legacy_tablet_stats_cache.go
deleted file mode 100644
index fb642c596a9..00000000000
--- a/go/vt/discovery/legacy_tablet_stats_cache.go
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "sync"
-
- "context"
-
- "vitess.io/vitess/go/vt/log"
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
- "vitess.io/vitess/go/vt/topo"
- "vitess.io/vitess/go/vt/topo/topoproto"
-)
-
-// LegacyTabletStatsCache is a LegacyHealthCheckStatsListener that keeps both the
-// current list of available LegacyTabletStats, and a serving list:
-// - for primary tablets, only the current primary is kept.
-// - for non-primary tablets, we filter the list using FilterLegacyStatsByReplicationLag.
-// It keeps entries for all tablets in the cell(s) it's configured to serve for,
-// and for the primary independently of which cell it's in.
-// Note the healthy tablet computation is done when we receive a tablet
-// update only, not at serving time.
-// Also note the cache may not have the last entry received by the tablet.
-// For instance, if a tablet was healthy, and is still healthy, we do not
-// keep its new update.
-type LegacyTabletStatsCache struct {
- // cell is the cell we are keeping all tablets for.
- // Note we keep track of all primary tablets in all cells.
- cell string
- // ts is the topo server in use.
- ts *topo.Server
- // mu protects the following fields. It does not protect individual
- // entries in the entries map.
- mu sync.RWMutex
- // entries maps from keyspace/shard/tabletType to our cache.
- entries map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry
- // cellAliases is a cache of cell aliases
- cellAliases map[string]string
-}
-
-// legacyTabletStatsCacheEntry is the per keyspace/shard/tabletType
-// entry of the in-memory map for LegacyTabletStatsCache.
-type legacyTabletStatsCacheEntry struct {
- // mu protects the rest of this structure.
- mu sync.RWMutex
- // all has the valid tablets, indexed by TabletToMapKey(ts.Tablet),
- // as it is the index used by LegacyHealthCheck.
- all map[string]*LegacyTabletStats
- // healthy only has the healthy ones.
- healthy []*LegacyTabletStats
-}
-
-func (e *legacyTabletStatsCacheEntry) updateHealthyMapForPrimary(ts *LegacyTabletStats) {
- if ts.Up {
- // We have an Up primary.
- if len(e.healthy) == 0 {
- // We have a new Up server, just remember it.
- e.healthy = append(e.healthy, ts)
- return
- }
-
- // We already have one up server, see if we
- // need to replace it.
- if ts.TabletExternallyReparentedTimestamp < e.healthy[0].TabletExternallyReparentedTimestamp {
- log.Warningf("not marking healthy primary %s as Up for %s because its externally reparented timestamp is smaller than the highest known timestamp from previous MASTERs %s: %d < %d ",
- topoproto.TabletAliasString(ts.Tablet.Alias),
- topoproto.KeyspaceShardString(ts.Target.Keyspace, ts.Target.Shard),
- topoproto.TabletAliasString(e.healthy[0].Tablet.Alias),
- ts.TabletExternallyReparentedTimestamp,
- e.healthy[0].TabletExternallyReparentedTimestamp)
- return
- }
-
- // Just replace it.
- e.healthy[0] = ts
- return
- }
-
- // We have a Down primary, remove it only if it's exactly the same.
- if len(e.healthy) != 0 {
- if ts.Key == e.healthy[0].Key {
- // Same guy, remove it.
- e.healthy = nil
- }
- }
-}
-
-// NewLegacyTabletStatsCache creates a LegacyTabletStatsCache, and registers
-// it as LegacyHealthCheckStatsListener of the provided healthcheck.
-// Note we do the registration in this code to guarantee we call
-// SetListener with sendDownEvents=true, as we need these events
-// to maintain the integrity of our cache.
-func NewLegacyTabletStatsCache(hc LegacyHealthCheck, ts *topo.Server, cell string) *LegacyTabletStatsCache {
- return newLegacyTabletStatsCache(hc, ts, cell, true /* setListener */)
-}
-
-// NewTabletStatsCacheDoNotSetListener is identical to NewLegacyTabletStatsCache
-// but does not automatically set the returned object as listener for "hc".
-// Instead, it's up to the caller to ensure that LegacyTabletStatsCache.StatsUpdate()
-// gets called properly. This is useful for chaining multiple listeners.
-// When the caller sets its own listener on "hc", they must make sure that they
-// set the parameter "sendDownEvents" to "true" or this cache won't properly
-// remove tablets whose tablet type changes.
-func NewTabletStatsCacheDoNotSetListener(ts *topo.Server, cell string) *LegacyTabletStatsCache {
- return newLegacyTabletStatsCache(nil, ts, cell, false /* setListener */)
-}
-
-func newLegacyTabletStatsCache(hc LegacyHealthCheck, ts *topo.Server, cell string, setListener bool) *LegacyTabletStatsCache {
- tc := &LegacyTabletStatsCache{
- cell: cell,
- ts: ts,
- entries: make(map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry),
- cellAliases: make(map[string]string),
- }
-
- if setListener {
- // We need to set sendDownEvents=true to get the deletes from the map
- // upon type change.
- hc.SetListener(tc, true /*sendDownEvents*/)
- }
- return tc
-}
-
-// getEntry returns an existing legacyTabletStatsCacheEntry in the cache, or nil
-// if the entry does not exist. It only takes a Read lock on mu.
-func (tc *LegacyTabletStatsCache) getEntry(keyspace, shard string, tabletType topodatapb.TabletType) *legacyTabletStatsCacheEntry {
- tc.mu.RLock()
- defer tc.mu.RUnlock()
-
- if s, ok := tc.entries[keyspace]; ok {
- if t, ok := s[shard]; ok {
- if e, ok := t[tabletType]; ok {
- return e
- }
- }
- }
- return nil
-}
-
-// getOrCreateEntry returns an existing legacyTabletStatsCacheEntry from the cache,
-// or creates it if it doesn't exist.
-func (tc *LegacyTabletStatsCache) getOrCreateEntry(target *querypb.Target) *legacyTabletStatsCacheEntry {
- // Fast path (most common path too): Read-lock, return the entry.
- if e := tc.getEntry(target.Keyspace, target.Shard, target.TabletType); e != nil {
- return e
- }
-
- // Slow path: Lock, will probably have to add the entry at some level.
- tc.mu.Lock()
- defer tc.mu.Unlock()
-
- s, ok := tc.entries[target.Keyspace]
- if !ok {
- s = make(map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry)
- tc.entries[target.Keyspace] = s
- }
- t, ok := s[target.Shard]
- if !ok {
- t = make(map[topodatapb.TabletType]*legacyTabletStatsCacheEntry)
- s[target.Shard] = t
- }
- e, ok := t[target.TabletType]
- if !ok {
- e = &legacyTabletStatsCacheEntry{
- all: make(map[string]*LegacyTabletStats),
- }
- t[target.TabletType] = e
- }
- return e
-}
-
-func (tc *LegacyTabletStatsCache) getAliasByCell(cell string) string {
- tc.mu.Lock()
- defer tc.mu.Unlock()
-
- if alias, ok := tc.cellAliases[cell]; ok {
- return alias
- }
-
- alias := topo.GetAliasByCell(context.Background(), tc.ts, cell)
- tc.cellAliases[cell] = alias
-
- return alias
-}
-
-// StatsUpdate is part of the LegacyHealthCheckStatsListener interface.
-func (tc *LegacyTabletStatsCache) StatsUpdate(ts *LegacyTabletStats) {
- if ts.Target.TabletType != topodatapb.TabletType_PRIMARY &&
- ts.Tablet.Alias.Cell != tc.cell &&
- tc.getAliasByCell(ts.Tablet.Alias.Cell) != tc.getAliasByCell(tc.cell) {
- // this is for a non-primary tablet in a different cell and a different alias, drop it
- return
- }
-
- e := tc.getOrCreateEntry(ts.Target)
- e.mu.Lock()
- defer e.mu.Unlock()
-
- // Update our full map.
- trivialNonPrimaryUpdate := false
- if existing, ok := e.all[ts.Key]; ok {
- if ts.Up {
- // We have an existing entry, and a new entry.
- // Remember if they are both good (most common case).
- trivialNonPrimaryUpdate = existing.LastError == nil && existing.Serving && ts.LastError == nil &&
- ts.Serving && ts.Target.TabletType != topodatapb.TabletType_PRIMARY && existing.TrivialStatsUpdate(ts)
-
- // We already have the entry, update the
- // values if necessary. (will update both
- // 'all' and 'healthy' as they use pointers).
- if !trivialNonPrimaryUpdate {
- *existing = *ts
- }
- } else {
- // We have an entry which we shouldn't. Remove it.
- delete(e.all, ts.Key)
- }
- } else {
- if ts.Up {
- // Add the entry.
- e.all[ts.Key] = ts
- } else {
- // We were told to remove an entry which we
- // didn't have anyway, nothing should happen.
- return
- }
- }
-
- // Update our healthy list.
- var allArray []*LegacyTabletStats
- if ts.Target.TabletType == topodatapb.TabletType_PRIMARY {
- // The healthy list is different for TabletType_PRIMARY: we
- // only keep the most recent one.
- e.updateHealthyMapForPrimary(ts)
- } else {
- // For non-primary, if it is a trivial update,
- // we just skip everything else. We don't even update the
- // aggregate stats.
- if trivialNonPrimaryUpdate {
- return
- }
-
- // Now we need to do some work. Recompute our healthy list.
- allArray = make([]*LegacyTabletStats, 0, len(e.all))
- for _, s := range e.all {
- allArray = append(allArray, s)
- }
- e.healthy = FilterLegacyStatsByReplicationLag(allArray)
- }
-}
-
-// GetTabletStats returns the full list of available targets.
-// The returned array is owned by the caller.
-func (tc *LegacyTabletStatsCache) GetTabletStats(keyspace, shard string, tabletType topodatapb.TabletType) []LegacyTabletStats {
- e := tc.getEntry(keyspace, shard, tabletType)
- if e == nil {
- return nil
- }
-
- e.mu.RLock()
- defer e.mu.RUnlock()
- result := make([]LegacyTabletStats, 0, len(e.all))
- for _, s := range e.all {
- result = append(result, *s)
- }
- return result
-}
-
-// GetHealthyTabletStats returns only the healthy targets.
-// The returned array is owned by the caller.
-// For TabletType_PRIMARY, this will only return at most one entry,
-// the most recent tablet of type primary.
-func (tc *LegacyTabletStatsCache) GetHealthyTabletStats(keyspace, shard string, tabletType topodatapb.TabletType) []LegacyTabletStats {
- e := tc.getEntry(keyspace, shard, tabletType)
- if e == nil {
- return nil
- }
-
- e.mu.RLock()
- defer e.mu.RUnlock()
- result := make([]LegacyTabletStats, len(e.healthy))
- for i, ts := range e.healthy {
- result[i] = *ts
- }
- return result
-}
-
-// ResetForTesting is for use in tests only.
-func (tc *LegacyTabletStatsCache) ResetForTesting() {
- tc.mu.Lock()
- defer tc.mu.Unlock()
-
- tc.entries = make(map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry)
-}
-
-// Compile-time interface check.
-var _ LegacyHealthCheckStatsListener = (*LegacyTabletStatsCache)(nil)
diff --git a/go/vt/discovery/legacy_tablet_stats_cache_test.go b/go/vt/discovery/legacy_tablet_stats_cache_test.go
deleted file mode 100644
index eabbb38ffa5..00000000000
--- a/go/vt/discovery/legacy_tablet_stats_cache_test.go
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "context"
- "testing"
-
- "vitess.io/vitess/go/vt/log"
-
- "vitess.io/vitess/go/vt/topo"
- "vitess.io/vitess/go/vt/topo/memorytopo"
-
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-)
-
-// TestTabletStatsCache tests the functionality of the LegacyTabletStatsCache class.
-func TestLegacyTabletStatsCache(t *testing.T) {
- ts := memorytopo.NewServer("cell", "cell1", "cell2")
-
- cellsAlias := &topodatapb.CellsAlias{
- Cells: []string{"cell", "cell1"},
- }
-
- if err := ts.CreateCellsAlias(context.Background(), "region1", cellsAlias); err != nil {
- log.Errorf("creating cellsAlias \"region1\" failed: %v", err)
- }
-
- defer deleteCellsAlias(t, ts, "region1")
-
- cellsAlias = &topodatapb.CellsAlias{
- Cells: []string{"cell2"},
- }
-
- if err := ts.CreateCellsAlias(context.Background(), "region2", cellsAlias); err != nil {
- log.Errorf("creating cellsAlias \"region2\" failed: %v", err)
- }
-
- defer deleteCellsAlias(t, ts, "region2")
-
- // We want to unit test LegacyTabletStatsCache without a full-blown
- // LegacyHealthCheck object, so we can't call NewLegacyTabletStatsCache.
- // So we just construct this object here.
- tsc := &LegacyTabletStatsCache{
- cell: "cell",
- ts: ts,
- entries: make(map[string]map[string]map[topodatapb.TabletType]*legacyTabletStatsCacheEntry),
- cellAliases: make(map[string]string),
- }
-
- // empty
- a := tsc.GetTabletStats("k", "s", topodatapb.TabletType_PRIMARY)
- if len(a) != 0 {
- t.Errorf("wrong result, expected empty list: %v", a)
- }
-
- // add a tablet
- tablet1 := topo.NewTablet(10, "cell", "host1")
- ts1 := &LegacyTabletStats{
- Key: "t1",
- Tablet: tablet1,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
- tsc.StatsUpdate(ts1)
-
- // check it's there
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // update stats with a change that won't change health array
- stillHealthyTs1 := &LegacyTabletStats{
- Key: "t1",
- Tablet: tablet1,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.2},
- }
- tsc.StatsUpdate(stillHealthyTs1)
-
- // check the previous ts1 is still there, as the new one is ignored.
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // update stats with a change that will change arrays
- notHealthyTs1 := &LegacyTabletStats{
- Key: "t1",
- Tablet: tablet1,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2},
- }
- tsc.StatsUpdate(notHealthyTs1)
-
- // check it's there
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !notHealthyTs1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !notHealthyTs1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // add a second tablet
- tablet2 := topo.NewTablet(11, "cell", "host2")
- ts2 := &LegacyTabletStats{
- Key: "t2",
- Tablet: tablet2,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
- }
- tsc.StatsUpdate(ts2)
-
- // check it's there
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 2 {
- t.Errorf("unexpected result: %v", a)
- } else {
- if a[0].Tablet.Alias.Uid == 11 {
- a[0], a[1] = a[1], a[0]
- }
- if !ts1.DeepEqual(&a[0]) || !ts2.DeepEqual(&a[1]) {
- t.Errorf("unexpected result: %v", a)
- }
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 2 {
- t.Errorf("unexpected result: %v", a)
- } else {
- if a[0].Tablet.Alias.Uid == 11 {
- a[0], a[1] = a[1], a[0]
- }
- if !ts1.DeepEqual(&a[0]) || !ts2.DeepEqual(&a[1]) {
- t.Errorf("unexpected result: %v", a)
- }
- }
-
- // one tablet goes unhealthy
- ts2.Serving = false
- tsc.StatsUpdate(ts2)
-
- // check we only have one left in healthy version
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 2 {
- t.Errorf("unexpected result: %v", a)
- } else {
- if a[0].Tablet.Alias.Uid == 11 {
- a[0], a[1] = a[1], a[0]
- }
- if !ts1.DeepEqual(&a[0]) || !ts2.DeepEqual(&a[1]) {
- t.Errorf("unexpected result: %v", a)
- }
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // second tablet turns into a primary, we receive down + up
- ts2.Serving = true
- ts2.Up = false
- tsc.StatsUpdate(ts2)
- ts2.Up = true
- ts2.Target.TabletType = topodatapb.TabletType_PRIMARY
- ts2.TabletExternallyReparentedTimestamp = 10
- tsc.StatsUpdate(ts2)
-
- // check we only have one replica left
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // check we have a primary now
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_PRIMARY)
- if len(a) != 1 || !ts2.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // reparent: old replica goes into primary
- ts1.Up = false
- tsc.StatsUpdate(ts1)
- ts1.Up = true
- ts1.Target.TabletType = topodatapb.TabletType_PRIMARY
- ts1.TabletExternallyReparentedTimestamp = 20
- tsc.StatsUpdate(ts1)
-
- // check we lost all replicas, and primary is new one
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 0 {
- t.Errorf("unexpected result: %v", a)
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_PRIMARY)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // old primary sending an old ping should be ignored
- tsc.StatsUpdate(ts2)
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_PRIMARY)
- if len(a) != 1 || !ts1.DeepEqual(&a[0]) {
- t.Errorf("unexpected result: %v", a)
- }
-
- // add a third tablet as replica in diff cell, same region
- tablet3 := topo.NewTablet(12, "cell1", "host3")
- ts3 := &LegacyTabletStats{
- Key: "t3",
- Tablet: tablet3,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
- }
- tsc.StatsUpdate(ts3)
- // check it's there
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 {
- t.Errorf("unexpected result: %v", a)
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 {
- t.Errorf("unexpected result: %v", a)
- }
-
- // add a 4th replica tablet in a diff cell, diff region
- tablet4 := topo.NewTablet(13, "cell2", "host4")
- ts4 := &LegacyTabletStats{
- Key: "t4",
- Tablet: tablet4,
- Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
- Up: true,
- Serving: true,
- Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
- }
- tsc.StatsUpdate(ts4)
- // check it's *NOT* there
- a = tsc.GetTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 {
- t.Errorf("unexpected result: %v", a)
- }
- a = tsc.GetHealthyTabletStats("k", "s", topodatapb.TabletType_REPLICA)
- if len(a) != 1 {
- t.Errorf("unexpected result: %v", a)
- }
-}
diff --git a/go/vt/discovery/legacy_tablet_stats_cache_wait.go b/go/vt/discovery/legacy_tablet_stats_cache_wait.go
deleted file mode 100644
index 8590051be88..00000000000
--- a/go/vt/discovery/legacy_tablet_stats_cache_wait.go
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "time"
-
- "context"
-
- "vitess.io/vitess/go/vt/log"
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-)
-
-var (
- // How much to sleep between each check.
- waitAvailableTabletInterval = 100 * time.Millisecond
-)
-
-// WaitForTablets waits for at least one tablet in the given
-// keyspace / shard / tablet type before returning. The tablets do not
-// have to be healthy. It will return ctx.Err() if the context is canceled.
-func (tc *LegacyTabletStatsCache) WaitForTablets(ctx context.Context, keyspace, shard string, tabletType topodatapb.TabletType) error {
- targets := []*querypb.Target{
- {
- Keyspace: keyspace,
- Shard: shard,
- TabletType: tabletType,
- },
- }
- return tc.waitForTablets(ctx, targets, false)
-}
-
-// WaitForAllServingTablets waits for at least one healthy serving tablet in
-// each given target before returning.
-// It will return ctx.Err() if the context is canceled.
-// It will return an error if it can't read the necessary topology records.
-func (tc *LegacyTabletStatsCache) WaitForAllServingTablets(ctx context.Context, targets []*querypb.Target) error {
- return tc.waitForTablets(ctx, targets, true)
-}
-
-// waitForTablets is the internal method that polls for tablets.
-func (tc *LegacyTabletStatsCache) waitForTablets(ctx context.Context, targets []*querypb.Target, requireServing bool) error {
- for {
- // We nil targets as we find them.
- allPresent := true
- for i, target := range targets {
- if target == nil {
- continue
- }
-
- var stats []LegacyTabletStats
- if requireServing {
- stats = tc.GetHealthyTabletStats(target.Keyspace, target.Shard, target.TabletType)
- } else {
- stats = tc.GetTabletStats(target.Keyspace, target.Shard, target.TabletType)
- }
- if len(stats) == 0 {
- allPresent = false
- } else {
- targets[i] = nil
- }
- }
-
- if allPresent {
- // we found everything we needed
- return nil
- }
-
- // Unblock after the sleep or when the context has expired.
- timer := time.NewTimer(waitAvailableTabletInterval)
- select {
- case <-ctx.Done():
- for _, target := range targets {
- if target != nil {
- log.Infof("couldn't find tablets for target: %v", target)
- }
- }
- timer.Stop()
- return ctx.Err()
- case <-timer.C:
- }
- }
-}
-
-// WaitByFilter waits for at least one tablet based on the filter function.
-func (tc *LegacyTabletStatsCache) WaitByFilter(ctx context.Context, keyspace, shard string, tabletTypes []topodatapb.TabletType, filter func([]LegacyTabletStats) []LegacyTabletStats) error {
- for {
- for _, tt := range tabletTypes {
- stats := tc.GetTabletStats(keyspace, shard, tt)
- stats = filter(stats)
- if len(stats) > 0 {
- return nil
- }
- }
-
- // Unblock after the sleep or when the context has expired.
- timer := time.NewTimer(waitAvailableTabletInterval)
- select {
- case <-ctx.Done():
- timer.Stop()
- return ctx.Err()
- case <-timer.C:
- }
- }
-}
diff --git a/go/vt/discovery/legacy_tablet_stats_cache_wait_test.go b/go/vt/discovery/legacy_tablet_stats_cache_wait_test.go
deleted file mode 100644
index ff29c0e3570..00000000000
--- a/go/vt/discovery/legacy_tablet_stats_cache_wait_test.go
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "testing"
- "time"
-
- "context"
-
- "vitess.io/vitess/go/vt/topo"
-
- querypb "vitess.io/vitess/go/vt/proto/query"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-)
-
-func TestWaitForTablets(t *testing.T) {
- shortCtx, shortCancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
- defer shortCancel()
- waitAvailableTabletInterval = 20 * time.Millisecond
-
- tablet := topo.NewTablet(0, "cell", "a")
- tablet.PortMap["vt"] = 1
- input := make(chan *querypb.StreamHealthResponse)
- createFakeConn(tablet, input)
-
- hc := NewLegacyHealthCheck(1*time.Millisecond, 1*time.Hour)
- tsc := NewLegacyTabletStatsCache(hc, nil, "cell")
- hc.AddTablet(tablet, "")
-
- // this should time out
- if err := tsc.WaitForTablets(shortCtx, "keyspace", "shard", topodatapb.TabletType_REPLICA); err != context.DeadlineExceeded {
- t.Errorf("got wrong error: %v", err)
- }
-
- // this should fail, but return a non-timeout error
- cancelledCtx, cancel := context.WithCancel(context.Background())
- cancel()
- if err := tsc.WaitForTablets(cancelledCtx, "keyspace", "shard", topodatapb.TabletType_REPLICA); err == nil || err == context.DeadlineExceeded {
- t.Errorf("want: non-timeout error, got: %v", err)
- }
-
- // send the tablet in
- shr := &querypb.StreamHealthResponse{
- Target: &querypb.Target{
- Keyspace: "keyspace",
- Shard: "shard",
- TabletType: topodatapb.TabletType_REPLICA,
- },
- Serving: true,
- RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
- }
- input <- shr
-
- // and ask again, with longer time outs so it's not flaky
- longCtx, longCancel := context.WithTimeout(context.Background(), 10*time.Second)
- defer longCancel()
- waitAvailableTabletInterval = 10 * time.Millisecond
- if err := tsc.WaitForTablets(longCtx, "keyspace", "shard", topodatapb.TabletType_REPLICA); err != nil {
- t.Errorf("got error: %v", err)
- }
-}
diff --git a/go/vt/discovery/legacy_topology_watcher.go b/go/vt/discovery/legacy_topology_watcher.go
deleted file mode 100644
index 194396df0c1..00000000000
--- a/go/vt/discovery/legacy_topology_watcher.go
+++ /dev/null
@@ -1,457 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "bytes"
- "fmt"
- "hash/crc32"
- "sort"
- "strings"
- "sync"
- "time"
-
- "context"
-
- "vitess.io/vitess/go/trace"
-
- "vitess.io/vitess/go/vt/key"
- "vitess.io/vitess/go/vt/log"
- "vitess.io/vitess/go/vt/topo"
- "vitess.io/vitess/go/vt/topo/topoproto"
-
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-)
-
-// tabletInfo is used internally by the TopologyWatcher class
-type legacyTabletInfo struct {
- alias string
- key string
- tablet *topodatapb.Tablet
-}
-
-// NewLegacyCellTabletsWatcher returns a LegacyTopologyWatcher that monitors all
-// the tablets in a cell, and starts refreshing.
-func NewLegacyCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, tr LegacyTabletRecorder, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *LegacyTopologyWatcher {
- return NewLegacyTopologyWatcher(ctx, topoServer, tr, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error) {
- return tw.topoServer.GetTabletAliasesByCell(ctx, tw.cell)
- })
-}
-
-// NewLegacyShardReplicationWatcher returns a LegacyTopologyWatcher that
-// monitors the tablets in a cell/keyspace/shard, and starts refreshing.
-func NewLegacyShardReplicationWatcher(ctx context.Context, topoServer *topo.Server, tr LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) *LegacyTopologyWatcher {
- return NewLegacyTopologyWatcher(ctx, topoServer, tr, cell, refreshInterval, true /* refreshKnownTablets */, topoReadConcurrency, func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error) {
- sri, err := tw.topoServer.GetShardReplication(ctx, tw.cell, keyspace, shard)
- switch {
- case err == nil:
- // we handle this case after this switch block
- case topo.IsErrType(err, topo.NoNode):
- // this is not an error
- return nil, nil
- default:
- return nil, err
- }
-
- result := make([]*topodatapb.TabletAlias, len(sri.Nodes))
- for i, node := range sri.Nodes {
- result[i] = node.TabletAlias
- }
- return result, nil
- })
-}
-
-// LegacyTopologyWatcher polls tablet from a configurable set of tablets
-// periodically. When tablets are added / removed, it calls
-// the LegacyTabletRecorder AddTablet / RemoveTablet interface appropriately.
-type LegacyTopologyWatcher struct {
- // set at construction time
- topoServer *topo.Server
- tr LegacyTabletRecorder
- cell string
- refreshInterval time.Duration
- refreshKnownTablets bool
- getTablets func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error)
- sem chan int
- ctx context.Context
- cancelFunc context.CancelFunc
- // wg keeps track of all launched Go routines.
- wg sync.WaitGroup
-
- // mu protects all variables below
- mu sync.Mutex
- // tablets contains a map of alias -> tabletInfo for all known tablets
- tablets map[string]*legacyTabletInfo
- // topoChecksum stores a crc32 of the tablets map and is exported as a metric
- topoChecksum uint32
- // lastRefresh records the timestamp of the last topo refresh
- lastRefresh time.Time
- // firstLoadDone is true when first load of the topology data is done.
- firstLoadDone bool
- // firstLoadChan is closed when the initial loading of topology data is done.
- firstLoadChan chan struct{}
-}
-
-// NewLegacyTopologyWatcher returns a LegacyTopologyWatcher that monitors all
-// the tablets in a cell, and starts refreshing.
-func NewLegacyTopologyWatcher(ctx context.Context, topoServer *topo.Server, tr LegacyTabletRecorder, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *LegacyTopologyWatcher) ([]*topodatapb.TabletAlias, error)) *LegacyTopologyWatcher {
- tw := &LegacyTopologyWatcher{
- topoServer: topoServer,
- tr: tr,
- cell: cell,
- refreshInterval: refreshInterval,
- refreshKnownTablets: refreshKnownTablets,
- getTablets: getTablets,
- sem: make(chan int, topoReadConcurrency),
- tablets: make(map[string]*legacyTabletInfo),
- }
- tw.firstLoadChan = make(chan struct{})
-
- // We want the span from the context, but not the cancelation that comes with it
- spanContext := trace.CopySpan(context.Background(), ctx)
- tw.ctx, tw.cancelFunc = context.WithCancel(spanContext)
- tw.wg.Add(1)
- go tw.watch()
- return tw
-}
-
-// watch polls all tablets and notifies LegacyTabletRecorder by adding/removing tablets.
-func (tw *LegacyTopologyWatcher) watch() {
- defer tw.wg.Done()
- ticker := time.NewTicker(tw.refreshInterval)
- defer ticker.Stop()
- for {
- tw.loadTablets()
- select {
- case <-tw.ctx.Done():
- return
- case <-ticker.C:
- }
- }
-}
-
-// loadTablets reads all tablets from topology, and updates LegacyTabletRecorder.
-func (tw *LegacyTopologyWatcher) loadTablets() {
- var wg sync.WaitGroup
- newTablets := make(map[string]*legacyTabletInfo)
- replacedTablets := make(map[string]*legacyTabletInfo)
-
- tabletAliases, err := tw.getTablets(tw)
- topologyWatcherOperations.Add(topologyWatcherOpListTablets, 1)
- if err != nil {
- topologyWatcherErrors.Add(topologyWatcherOpListTablets, 1)
- select {
- case <-tw.ctx.Done():
- return
- default:
- }
- log.Errorf("cannot get tablets for cell: %v: %v", tw.cell, err)
- return
- }
-
- // Accumulate a list of all known alias strings to use later
- // when sorting
- tabletAliasStrs := make([]string, 0, len(tabletAliases))
-
- tw.mu.Lock()
- for _, tAlias := range tabletAliases {
- aliasStr := topoproto.TabletAliasString(tAlias)
- tabletAliasStrs = append(tabletAliasStrs, aliasStr)
-
- if !tw.refreshKnownTablets {
- if val, ok := tw.tablets[aliasStr]; ok {
- newTablets[aliasStr] = val
- continue
- }
- }
-
- wg.Add(1)
- go func(alias *topodatapb.TabletAlias) {
- defer wg.Done()
- tw.sem <- 1 // Wait for active queue to drain.
- tablet, err := tw.topoServer.GetTablet(tw.ctx, alias)
- topologyWatcherOperations.Add(topologyWatcherOpGetTablet, 1)
- <-tw.sem // Done; enable next request to run
- if err != nil {
- topologyWatcherErrors.Add(topologyWatcherOpGetTablet, 1)
- select {
- case <-tw.ctx.Done():
- return
- default:
- }
- log.Errorf("cannot get tablet for alias %v: %v", alias, err)
- return
- }
- tw.mu.Lock()
- aliasStr := topoproto.TabletAliasString(alias)
- newTablets[aliasStr] = &legacyTabletInfo{
- alias: aliasStr,
- key: TabletToMapKey(tablet.Tablet),
- tablet: tablet.Tablet,
- }
- tw.mu.Unlock()
- }(tAlias)
- }
-
- tw.mu.Unlock()
- wg.Wait()
- tw.mu.Lock()
-
- for alias, newVal := range newTablets {
- if val, ok := tw.tablets[alias]; !ok {
- // Check if there's a tablet with the same address key but a
- // different alias. If so, replace it and keep track of the
- // replaced alias to make sure it isn't removed later.
- found := false
- for _, otherVal := range tw.tablets {
- if newVal.key == otherVal.key {
- found = true
- tw.tr.ReplaceTablet(otherVal.tablet, newVal.tablet, alias)
- topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1)
- replacedTablets[otherVal.alias] = newVal
- }
- }
- if !found {
- tw.tr.AddTablet(newVal.tablet, alias)
- topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1)
- }
-
- } else if val.key != newVal.key {
- // Handle the case where the same tablet alias is now reporting
- // a different address key.
- replacedTablets[alias] = newVal
- tw.tr.ReplaceTablet(val.tablet, newVal.tablet, alias)
- topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1)
- }
- }
-
- for _, val := range tw.tablets {
- if _, ok := newTablets[val.alias]; !ok {
- if _, ok2 := replacedTablets[val.alias]; !ok2 {
- tw.tr.RemoveTablet(val.tablet)
- topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1)
- }
- }
- }
- tw.tablets = newTablets
- if !tw.firstLoadDone {
- tw.firstLoadDone = true
- close(tw.firstLoadChan)
- }
-
- // iterate through the tablets in a stable order and compute a
- // checksum of the tablet map
- sort.Strings(tabletAliasStrs)
- var buf bytes.Buffer
- for _, alias := range tabletAliasStrs {
- tabletInfo, ok := tw.tablets[alias]
- if ok {
- buf.WriteString(alias)
- buf.WriteString(tabletInfo.key)
- }
- }
- tw.topoChecksum = crc32.ChecksumIEEE(buf.Bytes())
- tw.lastRefresh = time.Now()
-
- tw.mu.Unlock()
-}
-
-// WaitForInitialTopology waits until the watcher reads all of the topology data
-// for the first time and transfers the information to LegacyTabletRecorder via its
-// AddTablet() method.
-func (tw *LegacyTopologyWatcher) WaitForInitialTopology() error {
- select {
- case <-tw.ctx.Done():
- return tw.ctx.Err()
- case <-tw.firstLoadChan:
- return nil
- }
-}
-
-// Stop stops the watcher. It does not clean up the tablets added to LegacyTabletRecorder.
-func (tw *LegacyTopologyWatcher) Stop() {
- tw.cancelFunc()
- // wait for watch goroutine to finish.
- tw.wg.Wait()
-}
-
-// RefreshLag returns the time since the last refresh
-func (tw *LegacyTopologyWatcher) RefreshLag() time.Duration {
- tw.mu.Lock()
- defer tw.mu.Unlock()
-
- return time.Since(tw.lastRefresh)
-}
-
-// TopoChecksum returns the checksum of the current state of the topo
-func (tw *LegacyTopologyWatcher) TopoChecksum() uint32 {
- tw.mu.Lock()
- defer tw.mu.Unlock()
-
- return tw.topoChecksum
-}
-
-// LegacyFilterByShard is a LegacyTabletRecorder filter that filters tablets by
-// keyspace/shard.
-type LegacyFilterByShard struct {
- // tr is the underlying LegacyTabletRecorder to forward requests too
- tr LegacyTabletRecorder
-
- // filters is a map of keyspace to filters for shards
- filters map[string][]*filterShard
-}
-
-// NewLegacyFilterByShard creates a new LegacyFilterByShard on top of an existing
-// LegacyTabletRecorder. Each filter is a keyspace|shard entry, where shard
-// can either be a shard name, or a keyrange. All tablets that match
-// at least one keyspace|shard tuple will be forwarded to the
-// underlying LegacyTabletRecorder.
-func NewLegacyFilterByShard(tr LegacyTabletRecorder, filters []string) (*LegacyFilterByShard, error) {
- m := make(map[string][]*filterShard)
- for _, filter := range filters {
- parts := strings.Split(filter, "|")
- if len(parts) != 2 {
- return nil, fmt.Errorf("invalid LegacyFilterByShard parameter: %v", filter)
- }
-
- keyspace := parts[0]
- shard := parts[1]
-
- // extract keyrange if it's a range
- canonical, kr, err := topo.ValidateShardName(shard)
- if err != nil {
- return nil, fmt.Errorf("error parsing shard name %v: %v", shard, err)
- }
-
- // check for duplicates
- for _, c := range m[keyspace] {
- if c.shard == canonical {
- return nil, fmt.Errorf("duplicate %v/%v entry", keyspace, shard)
- }
- }
-
- m[keyspace] = append(m[keyspace], &filterShard{
- keyspace: keyspace,
- shard: canonical,
- keyRange: kr,
- })
- }
-
- return &LegacyFilterByShard{
- tr: tr,
- filters: m,
- }, nil
-}
-
-// AddTablet is part of the LegacyTabletRecorder interface.
-func (fbs *LegacyFilterByShard) AddTablet(tablet *topodatapb.Tablet, name string) {
- if fbs.isIncluded(tablet) {
- fbs.tr.AddTablet(tablet, name)
- }
-}
-
-// RemoveTablet is part of the LegacyTabletRecorder interface.
-func (fbs *LegacyFilterByShard) RemoveTablet(tablet *topodatapb.Tablet) {
- if fbs.isIncluded(tablet) {
- fbs.tr.RemoveTablet(tablet)
- }
-}
-
-// ReplaceTablet is part of the LegacyTabletRecorder interface.
-func (fbs *LegacyFilterByShard) ReplaceTablet(old, new *topodatapb.Tablet, name string) {
- if fbs.isIncluded(old) && fbs.isIncluded(new) {
- fbs.tr.ReplaceTablet(old, new, name)
- }
-}
-
-// isIncluded returns true iff the tablet's keyspace and shard should be
-// forwarded to the underlying LegacyTabletRecorder.
-func (fbs *LegacyFilterByShard) isIncluded(tablet *topodatapb.Tablet) bool {
- canonical, kr, err := topo.ValidateShardName(tablet.Shard)
- if err != nil {
- log.Errorf("Error parsing shard name %v, will ignore tablet: %v", tablet.Shard, err)
- return false
- }
-
- for _, c := range fbs.filters[tablet.Keyspace] {
- if canonical == c.shard {
- // Exact match (probably a non-sharded keyspace).
- return true
- }
- if kr != nil && c.keyRange != nil && key.KeyRangeIncludes(c.keyRange, kr) {
- // Our filter's KeyRange includes the provided KeyRange
- return true
- }
- }
- return false
-}
-
-// LegacyFilterByKeyspace is a LegacyTabletRecorder filter that filters tablets by
-// keyspace
-type LegacyFilterByKeyspace struct {
- tr LegacyTabletRecorder
-
- keyspaces map[string]bool
-}
-
-// NewLegacyFilterByKeyspace creates a new LegacyFilterByKeyspace on top of an existing
-// LegacyTabletRecorder. Each filter is a keyspace entry. All tablets that match
-// a keyspace will be forwarded to the underlying LegacyTabletRecorder.
-func NewLegacyFilterByKeyspace(tr LegacyTabletRecorder, selectedKeyspaces []string) *LegacyFilterByKeyspace {
- m := make(map[string]bool)
- for _, keyspace := range selectedKeyspaces {
- m[keyspace] = true
- }
-
- return &LegacyFilterByKeyspace{
- tr: tr,
- keyspaces: m,
- }
-}
-
-// AddTablet is part of the LegacyTabletRecorder interface.
-func (fbk *LegacyFilterByKeyspace) AddTablet(tablet *topodatapb.Tablet, name string) {
- if fbk.isIncluded(tablet) {
- fbk.tr.AddTablet(tablet, name)
- }
-}
-
-// RemoveTablet is part of the LegacyTabletRecorder interface.
-func (fbk *LegacyFilterByKeyspace) RemoveTablet(tablet *topodatapb.Tablet) {
- if fbk.isIncluded(tablet) {
- fbk.tr.RemoveTablet(tablet)
- }
-}
-
-// ReplaceTablet is part of the LegacyTabletRecorder interface.
-func (fbk *LegacyFilterByKeyspace) ReplaceTablet(old *topodatapb.Tablet, new *topodatapb.Tablet, name string) {
- if old.Keyspace != new.Keyspace {
- log.Errorf("Error replacing old tablet in %v with new tablet in %v", old.Keyspace, new.Keyspace)
- return
- }
-
- if fbk.isIncluded(new) {
- fbk.tr.ReplaceTablet(old, new, name)
- }
-}
-
-// isIncluded returns true if the tablet's keyspace should be
-// forwarded to the underlying LegacyTabletRecorder.
-func (fbk *LegacyFilterByKeyspace) isIncluded(tablet *topodatapb.Tablet) bool {
- _, exist := fbk.keyspaces[tablet.Keyspace]
- return exist
-}
diff --git a/go/vt/discovery/legacy_topology_watcher_test.go b/go/vt/discovery/legacy_topology_watcher_test.go
deleted file mode 100644
index ed831eddeda..00000000000
--- a/go/vt/discovery/legacy_topology_watcher_test.go
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
-Copyright 2019 The Vitess Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package discovery
-
-import (
- "math/rand"
- "testing"
- "time"
-
- "context"
-
- "google.golang.org/protobuf/proto"
-
- "vitess.io/vitess/go/vt/logutil"
- topodatapb "vitess.io/vitess/go/vt/proto/topodata"
- "vitess.io/vitess/go/vt/topo"
- "vitess.io/vitess/go/vt/topo/memorytopo"
-)
-
-func checkLegacyOpCounts(t *testing.T, tw *LegacyTopologyWatcher, prevCounts, deltas map[string]int64) map[string]int64 {
- t.Helper()
- newCounts := topologyWatcherOperations.Counts()
- for key, prevVal := range prevCounts {
- delta, ok := deltas[key]
- if !ok {
- delta = 0
- }
- newVal, ok := newCounts[key]
- if !ok {
- newVal = 0
- }
-
- if newVal != prevVal+delta {
- t.Errorf("expected %v to increase by %v, got %v -> %v", key, delta, prevVal, newVal)
- }
- }
- return newCounts
-}
-
-func checkLegacyChecksum(t *testing.T, tw *LegacyTopologyWatcher, want uint32) {
- t.Helper()
- got := tw.TopoChecksum()
- if want != got {
- t.Errorf("want checksum %v got %v", want, got)
- }
-}
-
-func TestLegacyCellTabletsWatcher(t *testing.T) {
- checkLegacyWatcher(t, true, true)
-}
-
-func TestLegacyCellTabletsWatcherNoRefreshKnown(t *testing.T) {
- checkLegacyWatcher(t, true, false)
-}
-
-func TestLegacyShardReplicationWatcher(t *testing.T) {
- checkLegacyWatcher(t, false, true)
-}
-
-func checkLegacyWatcher(t *testing.T, cellTablets, refreshKnownTablets bool) {
- ts := memorytopo.NewServer("aa")
- fhc := NewFakeLegacyHealthCheck()
- logger := logutil.NewMemoryLogger()
- topologyWatcherOperations.ZeroAll()
- counts := topologyWatcherOperations.Counts()
- var tw *LegacyTopologyWatcher
- if cellTablets {
- tw = NewLegacyCellTabletsWatcher(context.Background(), ts, fhc, "aa", 10*time.Minute, refreshKnownTablets, 5)
- } else {
- tw = NewLegacyShardReplicationWatcher(context.Background(), ts, fhc, "aa", "keyspace", "shard", 10*time.Minute, 5)
- }
-
- // Wait for the initial topology load to finish. Otherwise we
- // have a background loadTablets() that's running, and it can
- // interact with our tests in weird ways.
- if err := tw.WaitForInitialTopology(); err != nil {
- t.Fatalf("initial WaitForInitialTopology failed")
- }
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1})
- checkLegacyChecksum(t, tw, 0)
-
- // Add a tablet to the topology.
- tablet := &topodatapb.Tablet{
- Alias: &topodatapb.TabletAlias{
- Cell: "aa",
- Uid: 0,
- },
- Hostname: "host1",
- PortMap: map[string]int32{
- "vt": 123,
- },
- Keyspace: "keyspace",
- Shard: "shard",
- }
- if err := ts.CreateTablet(context.Background(), tablet); err != nil {
- t.Fatalf("CreateTablet failed: %v", err)
- }
- tw.loadTablets()
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1})
- checkLegacyChecksum(t, tw, 1261153186)
-
- // Check the tablet is returned by GetAllTablets().
- allTablets := fhc.GetAllTablets()
- key := TabletToMapKey(tablet)
- if _, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(allTablets[key], tablet) {
- t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet)
- }
-
- // Add a second tablet to the topology.
- tablet2 := &topodatapb.Tablet{
- Alias: &topodatapb.TabletAlias{
- Cell: "aa",
- Uid: 2,
- },
- Hostname: "host2",
- PortMap: map[string]int32{
- "vt": 789,
- },
- Keyspace: "keyspace",
- Shard: "shard",
- }
- if err := ts.CreateTablet(context.Background(), tablet2); err != nil {
- t.Fatalf("CreateTablet failed: %v", err)
- }
- tw.loadTablets()
-
- // If refreshKnownTablets is disabled, only the new tablet is read
- // from the topo
- if refreshKnownTablets {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "AddTablet": 1})
- } else {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "AddTablet": 1})
- }
- checkLegacyChecksum(t, tw, 832404892)
-
- // Check the new tablet is returned by GetAllTablets().
- allTablets = fhc.GetAllTablets()
- key = TabletToMapKey(tablet2)
- if _, ok := allTablets[key]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[key], tablet2) {
- t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet2)
- }
-
- // Load the tablets again to show that when refreshKnownTablets is disabled,
- // only the list is read from the topo and the checksum doesn't change
- tw.loadTablets()
- if refreshKnownTablets {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2})
- } else {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1})
- }
- checkLegacyChecksum(t, tw, 832404892)
-
- // same tablet, different port, should update (previous
- // one should go away, new one be added)
- //
- // if refreshKnownTablets is disabled, this case is *not*
- // detected and the tablet remains in the topo using the
- // old key
- origTablet := proto.Clone(tablet).(*topodatapb.Tablet)
- origKey := TabletToMapKey(tablet)
- tablet.PortMap["vt"] = 456
- if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error {
- t.PortMap["vt"] = 456
- return nil
- }); err != nil {
- t.Fatalf("UpdateTabletFields failed: %v", err)
- }
- tw.loadTablets()
- allTablets = fhc.GetAllTablets()
- key = TabletToMapKey(tablet)
-
- if refreshKnownTablets {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 1})
-
- if _, ok := allTablets[key]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[key], tablet) {
- t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet)
- }
- if _, ok := allTablets[origKey]; ok {
- t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, origKey)
- }
- checkLegacyChecksum(t, tw, 698548794)
- } else {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1})
-
- if _, ok := allTablets[origKey]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[origKey], origTablet) {
- t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, origTablet)
- }
- if _, ok := allTablets[key]; ok {
- t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key)
- }
- checkLegacyChecksum(t, tw, 832404892)
- }
-
- // Remove the second tablet and re-add with a new uid. This should
- // trigger a ReplaceTablet in loadTablets because the uid does not
- // match.
- //
- // This case *is* detected even if refreshKnownTablets is false
- // because the delete tablet / create tablet sequence causes the
- // list of tablets to change and therefore the change is detected.
- if err := ts.DeleteTablet(context.Background(), tablet2.Alias); err != nil {
- t.Fatalf("DeleteTablet failed: %v", err)
- }
- tablet2.Alias.Uid = 3
- if err := ts.CreateTablet(context.Background(), tablet2); err != nil {
- t.Fatalf("CreateTablet failed: %v", err)
- }
- if _, err := topo.FixShardReplication(context.Background(), ts, logger, "aa", "keyspace", "shard"); err != nil {
- t.Fatalf("FixShardReplication failed: %v", err)
- }
- tw.loadTablets()
- allTablets = fhc.GetAllTablets()
-
- if refreshKnownTablets {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 1})
- checkLegacyChecksum(t, tw, 4097170367)
- } else {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "ReplaceTablet": 1})
- checkLegacyChecksum(t, tw, 3960185881)
- }
- key = TabletToMapKey(tablet2)
- if _, ok := allTablets[key]; !ok || len(allTablets) != 2 || !proto.Equal(allTablets[key], tablet2) {
- t.Errorf("fhc.GetAllTablets() = %+v; want %v => %+v", allTablets, key, tablet2)
- }
-
- // Both tablets restart on different hosts.
- // tablet2 happens to land on the host:port that tablet 1 used to be on.
- // This can only be tested when we refresh known tablets.
- if refreshKnownTablets {
- origTablet := proto.Clone(tablet).(*topodatapb.Tablet)
- origTablet2 := proto.Clone(tablet2).(*topodatapb.Tablet)
-
- if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error {
- t.Hostname = tablet.Hostname
- t.PortMap = tablet.PortMap
- tablet2 = t
- return nil
- }); err != nil {
- t.Fatalf("UpdateTabletFields failed: %v", err)
- }
- if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error {
- t.Hostname = "host3"
- tablet = t
- return nil
- }); err != nil {
- t.Fatalf("UpdateTabletFields failed: %v", err)
- }
- tw.loadTablets()
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2})
- allTablets = fhc.GetAllTablets()
- key2 := TabletToMapKey(tablet2)
- if _, ok := allTablets[key2]; !ok {
- t.Fatalf("tablet was lost because it's reusing an address recently used by another tablet: %v", key2)
- }
-
- // Change tablets back to avoid altering later tests.
- if _, err := ts.UpdateTabletFields(context.Background(), tablet2.Alias, func(t *topodatapb.Tablet) error {
- t.Hostname = origTablet2.Hostname
- t.PortMap = origTablet2.PortMap
- tablet2 = t
- return nil
- }); err != nil {
- t.Fatalf("UpdateTabletFields failed: %v", err)
- }
- if _, err := ts.UpdateTabletFields(context.Background(), tablet.Alias, func(t *topodatapb.Tablet) error {
- t.Hostname = origTablet.Hostname
- tablet = t
- return nil
- }); err != nil {
- t.Fatalf("UpdateTabletFields failed: %v", err)
- }
- tw.loadTablets()
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 2, "ReplaceTablet": 2})
- }
-
- // Remove the tablet and check that it is detected as being gone.
- if err := ts.DeleteTablet(context.Background(), tablet.Alias); err != nil {
- t.Fatalf("DeleteTablet failed: %v", err)
- }
- if _, err := topo.FixShardReplication(context.Background(), ts, logger, "aa", "keyspace", "shard"); err != nil {
- t.Fatalf("FixShardReplication failed: %v", err)
- }
- tw.loadTablets()
- if refreshKnownTablets {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 1, "RemoveTablet": 1})
- } else {
- counts = checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "RemoveTablet": 1})
- }
- checkLegacyChecksum(t, tw, 1725545897)
-
- allTablets = fhc.GetAllTablets()
- key = TabletToMapKey(tablet)
- if _, ok := allTablets[key]; ok || len(allTablets) != 1 {
- t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key)
- }
- key = TabletToMapKey(tablet2)
- if _, ok := allTablets[key]; !ok || len(allTablets) != 1 || !proto.Equal(allTablets[key], tablet2) {
- t.Errorf("fhc.GetAllTablets() = %+v; want %+v", allTablets, tablet2)
- }
-
- // Remove the other and check that it is detected as being gone.
- if err := ts.DeleteTablet(context.Background(), tablet2.Alias); err != nil {
- t.Fatalf("DeleteTablet failed: %v", err)
- }
- if _, err := topo.FixShardReplication(context.Background(), ts, logger, "aa", "keyspace", "shard"); err != nil {
- t.Fatalf("FixShardReplication failed: %v", err)
- }
- tw.loadTablets()
- checkLegacyOpCounts(t, tw, counts, map[string]int64{"ListTablets": 1, "GetTablet": 0, "RemoveTablet": 1})
- checkLegacyChecksum(t, tw, 0)
-
- allTablets = fhc.GetAllTablets()
- key = TabletToMapKey(tablet)
- if _, ok := allTablets[key]; ok || len(allTablets) != 0 {
- t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key)
- }
- key = TabletToMapKey(tablet2)
- if _, ok := allTablets[key]; ok || len(allTablets) != 0 {
- t.Errorf("fhc.GetAllTablets() = %+v; don't want %v", allTablets, key)
- }
-
- tw.Stop()
-}
-
-func TestLegacyFilterByShard(t *testing.T) {
- testcases := []struct {
- filters []string
- keyspace string
- shard string
- included bool
- }{
- // un-sharded keyspaces
- {
- filters: []string{"ks1|0"},
- keyspace: "ks1",
- shard: "0",
- included: true,
- },
- {
- filters: []string{"ks1|0"},
- keyspace: "ks2",
- shard: "0",
- included: false,
- },
- // custom sharding, different shard
- {
- filters: []string{"ks1|0"},
- keyspace: "ks1",
- shard: "1",
- included: false,
- },
- // keyrange based sharding
- {
- filters: []string{"ks1|-80"},
- keyspace: "ks1",
- shard: "0",
- included: false,
- },
- {
- filters: []string{"ks1|-80"},
- keyspace: "ks1",
- shard: "-40",
- included: true,
- },
- {
- filters: []string{"ks1|-80"},
- keyspace: "ks1",
- shard: "-80",
- included: true,
- },
- {
- filters: []string{"ks1|-80"},
- keyspace: "ks1",
- shard: "80-",
- included: false,
- },
- {
- filters: []string{"ks1|-80"},
- keyspace: "ks1",
- shard: "c0-",
- included: false,
- },
- }
-
- for _, tc := range testcases {
- fbs, err := NewLegacyFilterByShard(nil, tc.filters)
- if err != nil {
- t.Errorf("cannot create LegacyFilterByShard for filters %v: %v", tc.filters, err)
- }
-
- tablet := &topodatapb.Tablet{
- Keyspace: tc.keyspace,
- Shard: tc.shard,
- }
-
- got := fbs.isIncluded(tablet)
- if got != tc.included {
- t.Errorf("isIncluded(%v,%v) for filters %v returned %v but expected %v", tc.keyspace, tc.shard, tc.filters, got, tc.included)
- }
- }
-}
-
-func TestLegacyFilterByKeyspace(t *testing.T) {
- hc := NewFakeLegacyHealthCheck()
- tr := NewLegacyFilterByKeyspace(hc, testKeyspacesToWatch)
- ts := memorytopo.NewServer(testCell)
- tw := NewLegacyCellTabletsWatcher(context.Background(), ts, tr, testCell, 10*time.Minute, true, 5)
-
- for _, test := range testFilterByKeyspace {
- // Add a new tablet to the topology.
- port := rand.Int31n(1000)
- tablet := &topodatapb.Tablet{
- Alias: &topodatapb.TabletAlias{
- Cell: testCell,
- Uid: rand.Uint32(),
- },
- Hostname: testHostName,
- PortMap: map[string]int32{
- "vt": port,
- },
- Keyspace: test.keyspace,
- Shard: testShard,
- }
-
- got := tr.isIncluded(tablet)
- if got != test.expected {
- t.Errorf("isIncluded(%v) for keyspace %v returned %v but expected %v", test.keyspace, test.keyspace, got, test.expected)
- }
-
- if err := ts.CreateTablet(context.Background(), tablet); err != nil {
- t.Errorf("CreateTablet failed: %v", err)
- }
-
- tw.loadTablets()
- key := TabletToMapKey(tablet)
- allTablets := hc.GetAllTablets()
-
- if _, ok := allTablets[key]; ok != test.expected && proto.Equal(allTablets[key], tablet) != test.expected {
- t.Errorf("Error adding tablet - got %v; want %v", ok, test.expected)
- }
-
- // Replace the tablet we added above
- tabletReplacement := &topodatapb.Tablet{
- Alias: &topodatapb.TabletAlias{
- Cell: testCell,
- Uid: rand.Uint32(),
- },
- Hostname: testHostName,
- PortMap: map[string]int32{
- "vt": port,
- },
- Keyspace: test.keyspace,
- Shard: testShard,
- }
- got = tr.isIncluded(tabletReplacement)
- if got != test.expected {
- t.Errorf("isIncluded(%v) for keyspace %v returned %v but expected %v", test.keyspace, test.keyspace, got, test.expected)
- }
- if err := ts.CreateTablet(context.Background(), tabletReplacement); err != nil {
- t.Errorf("CreateTablet failed: %v", err)
- }
-
- tw.loadTablets()
- key = TabletToMapKey(tabletReplacement)
- allTablets = hc.GetAllTablets()
-
- if _, ok := allTablets[key]; ok != test.expected && proto.Equal(allTablets[key], tabletReplacement) != test.expected {
- t.Errorf("Error replacing tablet - got %v; want %v", ok, test.expected)
- }
-
- // Delete the tablet
- if err := ts.DeleteTablet(context.Background(), tabletReplacement.Alias); err != nil {
- t.Fatalf("DeleteTablet failed: %v", err)
- }
- }
-}
diff --git a/go/vt/discovery/replicationlag_test.go b/go/vt/discovery/replicationlag_test.go
index bd879b01d3d..16354a4d54f 100644
--- a/go/vt/discovery/replicationlag_test.go
+++ b/go/vt/discovery/replicationlag_test.go
@@ -26,6 +26,11 @@ import (
"vitess.io/vitess/go/vt/topo"
)
+// testSetLegacyReplicationLagAlgorithm is a test helper function. If this is used by a production code path, something is wrong.
+func testSetLegacyReplicationLagAlgorithm(newLegacy bool) {
+ *legacyReplicationLagAlgorithm = newLegacy
+}
+
// testSetMinNumTablets is a test helper function, if this is used by a production code path, something is wrong.
func testSetMinNumTablets(newMin int) {
*minNumTablets = newMin
diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go
index c510a7ee871..a2a70a1f2d0 100644
--- a/go/vt/discovery/topology_watcher.go
+++ b/go/vt/discovery/topology_watcher.go
@@ -66,7 +66,7 @@ type tabletInfo struct {
type TopologyWatcher struct {
// set at construction time
topoServer *topo.Server
- tabletRecorder TabletRecorder
+ healthcheck HealthCheck
tabletFilter TabletFilter
cell string
refreshInterval time.Duration
@@ -94,10 +94,10 @@ type TopologyWatcher struct {
// NewTopologyWatcher returns a TopologyWatcher that monitors all
// the tablets in a cell, and starts refreshing.
-func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, tr TabletRecorder, filter TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error)) *TopologyWatcher {
+func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, filter TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int, getTablets func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error)) *TopologyWatcher {
tw := &TopologyWatcher{
topoServer: topoServer,
- tabletRecorder: tr,
+ healthcheck: hc,
tabletFilter: filter,
cell: cell,
refreshInterval: refreshInterval,
@@ -116,8 +116,8 @@ func NewTopologyWatcher(ctx context.Context, topoServer *topo.Server, tr TabletR
// NewCellTabletsWatcher returns a TopologyWatcher that monitors all
// the tablets in a cell, and starts refreshing.
-func NewCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, tr TabletRecorder, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *TopologyWatcher {
- return NewTopologyWatcher(ctx, topoServer, tr, f, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error) {
+func NewCellTabletsWatcher(ctx context.Context, topoServer *topo.Server, hc HealthCheck, f TabletFilter, cell string, refreshInterval time.Duration, refreshKnownTablets bool, topoReadConcurrency int) *TopologyWatcher {
+ return NewTopologyWatcher(ctx, topoServer, hc, f, cell, refreshInterval, refreshKnownTablets, topoReadConcurrency, func(tw *TopologyWatcher) ([]*topodata.TabletAlias, error) {
return tw.topoServer.GetTabletAliasesByCell(ctx, tw.cell)
})
}
@@ -225,19 +225,19 @@ func (tw *TopologyWatcher) loadTablets() {
if oldKey != newKey {
// This is the case where the same tablet alias is now reporting
// a different address (host:port) key.
- tw.tabletRecorder.ReplaceTablet(val.tablet, newVal.tablet)
+ tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet)
topologyWatcherOperations.Add(topologyWatcherOpReplaceTablet, 1)
}
} else {
// This is a new tablet record, let's add it to the healthcheck
- tw.tabletRecorder.AddTablet(newVal.tablet)
+ tw.healthcheck.AddTablet(newVal.tablet)
topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1)
}
}
for _, val := range tw.tablets {
if _, ok := newTablets[val.alias]; !ok {
- tw.tabletRecorder.RemoveTablet(val.tablet)
+ tw.healthcheck.RemoveTablet(val.tablet)
topologyWatcherOperations.Add(topologyWatcherOpRemoveTablet, 1)
}
}
diff --git a/go/vt/discovery/utils.go b/go/vt/discovery/utils.go
index 32faee8b083..02f3b7132af 100644
--- a/go/vt/discovery/utils.go
+++ b/go/vt/discovery/utils.go
@@ -24,16 +24,23 @@ import (
)
// This file contains helper filter methods to process the unfiltered list of
-// tablets returned by LegacyHealthCheck.GetTabletStatsFrom*.
-// See also legacy_replicationlag.go for a more sophisicated filter used by vtgate.
+// tablets returned by HealthCheckImpl.GetTabletHealth*.
+
+// TabletHealthReferenceListToValue converts a slice of *TabletHealth into a
+// slice of TabletHealth values by dereferencing every element in order.
+// Each value is a shallow copy of the struct the pointer refers to. The
+// returned slice is never nil, even for empty input. Every pointer in thl
+// must be non-nil because each element is dereferenced unconditionally.
+func TabletHealthReferenceListToValue(thl []*TabletHealth) []TabletHealth {
+ // The final length is known up front, so allocate the backing array once.
+ newTh := make([]TabletHealth, 0, len(thl))
+ for _, th := range thl {
+ newTh = append(newTh, *th)
+ }
+ return newTh
+}
// RemoveUnhealthyTablets filters all unhealthy tablets out.
// NOTE: Non-serving tablets are considered healthy.
-func RemoveUnhealthyTablets(tabletStatsList []LegacyTabletStats) []LegacyTabletStats {
- result := make([]LegacyTabletStats, 0, len(tabletStatsList))
+func RemoveUnhealthyTablets(tabletStatsList []TabletHealth) []TabletHealth {
+ result := make([]TabletHealth, 0, len(tabletStatsList))
for _, ts := range tabletStatsList {
// Note we do not check the 'Serving' flag here.
- if ts.Stats == nil || ts.Stats.HealthError != "" || ts.LastError != nil || LegacyIsReplicationLagHigh(&ts) {
+ if ts.LastError != nil || ts.Stats != nil && (ts.Stats.HealthError != "" || IsReplicationLagHigh(&ts)) {
continue
}
result = append(result, ts)
diff --git a/go/vt/discovery/utils_test.go b/go/vt/discovery/utils_test.go
index 501c3a03832..27416da44b0 100644
--- a/go/vt/discovery/utils_test.go
+++ b/go/vt/discovery/utils_test.go
@@ -27,66 +27,68 @@ import (
func TestRemoveUnhealthyTablets(t *testing.T) {
var testcases = []struct {
desc string
- input []LegacyTabletStats
- want []LegacyTabletStats
+ input []TabletHealth
+ want []TabletHealth
}{{
desc: "tablets missing Stats",
- input: []LegacyTabletStats{replica(1), replica(2)},
- want: []LegacyTabletStats{},
+ input: []TabletHealth{replica(1), replica(2)},
+ want: []TabletHealth{replica(1), replica(2)},
}, {
desc: "all tablets healthy",
- input: []LegacyTabletStats{healthy(replica(1)), healthy(replica(2))},
- want: []LegacyTabletStats{healthy(replica(1)), healthy(replica(2))},
+ input: []TabletHealth{healthy(replica(1)), healthy(replica(2))},
+ want: []TabletHealth{healthy(replica(1)), healthy(replica(2))},
}, {
desc: "one unhealthy tablet (error)",
- input: []LegacyTabletStats{healthy(replica(1)), unhealthyError(replica(2))},
- want: []LegacyTabletStats{healthy(replica(1))},
+ input: []TabletHealth{healthy(replica(1)), unhealthyError(replica(2))},
+ want: []TabletHealth{healthy(replica(1))},
}, {
desc: "one error tablet",
- input: []LegacyTabletStats{healthy(replica(1)), unhealthyLastError(replica(2))},
- want: []LegacyTabletStats{healthy(replica(1))},
+ input: []TabletHealth{healthy(replica(1)), unhealthyLastError(replica(2))},
+ want: []TabletHealth{healthy(replica(1))},
}, {
desc: "one unhealthy tablet (lag)",
- input: []LegacyTabletStats{healthy(replica(1)), unhealthyLag(replica(2))},
- want: []LegacyTabletStats{healthy(replica(1))},
+ input: []TabletHealth{healthy(replica(1)), unhealthyLag(replica(2))},
+ want: []TabletHealth{healthy(replica(1))},
}, {
desc: "no filtering by tablet type",
- input: []LegacyTabletStats{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))},
- want: []LegacyTabletStats{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))},
+ input: []TabletHealth{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))},
+ want: []TabletHealth{healthy(primary(1)), healthy(replica(2)), healthy(rdonly(3))},
}, {
desc: "non-serving tablets won't be removed",
- input: []LegacyTabletStats{notServing(healthy(replica(1)))},
- want: []LegacyTabletStats{notServing(healthy(replica(1)))},
+ input: []TabletHealth{notServing(healthy(replica(1)))},
+ want: []TabletHealth{notServing(healthy(replica(1)))},
}}
for _, tc := range testcases {
- got := RemoveUnhealthyTablets(tc.input)
- if len(got) != len(tc.want) {
- t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want)
- } else {
- for i := range tc.want {
- if !got[i].DeepEqual(&tc.want[i]) {
- t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want)
+ t.Run(tc.desc, func(t *testing.T) {
+ got := RemoveUnhealthyTablets(tc.input)
+ if len(got) != len(tc.want) {
+ t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want)
+ } else {
+ for i := range tc.want {
+ if !got[i].DeepEqual(&tc.want[i]) {
+ t.Errorf("test case '%v' failed: RemoveUnhealthyTablets(%v) = %#v, want: %#v", tc.desc, tc.input, got, tc.want)
+ }
}
}
- }
+ })
}
}
-func primary(uid uint32) LegacyTabletStats {
+func primary(uid uint32) TabletHealth {
return minimalTabletStats(uid, topodatapb.TabletType_PRIMARY)
}
-func replica(uid uint32) LegacyTabletStats {
+func replica(uid uint32) TabletHealth {
return minimalTabletStats(uid, topodatapb.TabletType_REPLICA)
}
-func rdonly(uid uint32) LegacyTabletStats {
+func rdonly(uid uint32) TabletHealth {
return minimalTabletStats(uid, topodatapb.TabletType_RDONLY)
}
-func minimalTabletStats(uid uint32, tabletType topodatapb.TabletType) LegacyTabletStats {
- return LegacyTabletStats{
+func minimalTabletStats(uid uint32, tabletType topodatapb.TabletType) TabletHealth {
+ return TabletHealth{
Tablet: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{
Uid: uid},
@@ -100,33 +102,33 @@ func minimalTabletStats(uid uint32, tabletType topodatapb.TabletType) LegacyTabl
}
}
-func healthy(ts LegacyTabletStats) LegacyTabletStats {
+func healthy(ts TabletHealth) TabletHealth {
ts.Stats = &querypb.RealtimeStats{
ReplicationLagSeconds: uint32(1),
}
return ts
}
-func unhealthyLag(ts LegacyTabletStats) LegacyTabletStats {
+func unhealthyLag(ts TabletHealth) TabletHealth {
ts.Stats = &querypb.RealtimeStats{
ReplicationLagSeconds: uint32(3600),
}
return ts
}
-func unhealthyError(ts LegacyTabletStats) LegacyTabletStats {
+func unhealthyError(ts TabletHealth) TabletHealth {
ts.Stats = &querypb.RealtimeStats{
HealthError: "unhealthy",
}
return ts
}
-func unhealthyLastError(ts LegacyTabletStats) LegacyTabletStats {
+func unhealthyLastError(ts TabletHealth) TabletHealth {
ts.LastError = errors.New("err")
return ts
}
-func notServing(ts LegacyTabletStats) LegacyTabletStats {
+func notServing(ts TabletHealth) TabletHealth {
ts.Serving = false
return ts
}
diff --git a/go/vt/throttler/demo/throttler_demo.go b/go/vt/throttler/demo/throttler_demo.go
index 9d6cc7b93ba..a098b032b67 100644
--- a/go/vt/throttler/demo/throttler_demo.go
+++ b/go/vt/throttler/demo/throttler_demo.go
@@ -17,6 +17,7 @@ limitations under the License.
package main
import (
+ "context"
"flag"
"math/rand"
"net/http"
@@ -25,9 +26,11 @@ import (
"time"
"vitess.io/vitess/go/vt/discovery"
+ "vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/logutil"
"vitess.io/vitess/go/vt/servenv"
"vitess.io/vitess/go/vt/throttler"
+ "vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/memorytopo"
"vitess.io/vitess/go/vt/vttablet/grpcqueryservice"
"vitess.io/vitess/go/vt/vttablet/queryservice/fakes"
@@ -37,8 +40,6 @@ import (
querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
-
- "vitess.io/vitess/go/vt/log"
)
// This file contains a demo binary that demonstrates how the resharding
@@ -103,9 +104,8 @@ type replica struct {
wg sync.WaitGroup
}
-func newReplica(lagUpdateInterval, degrationInterval, degrationDuration time.Duration) *replica {
+func newReplica(lagUpdateInterval, degrationInterval, degrationDuration time.Duration, ts *topo.Server) *replica {
t := &testing.T{}
- ts := memorytopo.NewServer("cell1")
wr := wrangler.New(logutil.NewConsoleLogger(), ts, tmclient.NewTabletManagerClient())
fakeTablet := testlib.NewFakeTablet(t, wr, "cell1", 0,
topodatapb.TabletType_REPLICA, nil, testlib.TabletKeyspaceShard(t, "ks", "-80"))
@@ -213,28 +213,30 @@ func (r *replica) stop() {
type client struct {
primary *primary
- healthCheck discovery.LegacyHealthCheck
+ healthCheck discovery.HealthCheck
throttler *throttler.Throttler
- stopChan chan struct{}
- wg sync.WaitGroup
+ stopChan chan struct{}
+ wg sync.WaitGroup
+ healthcheckCh chan *discovery.TabletHealth
}
-func newClient(primary *primary, replica *replica) *client {
+func newClient(primary *primary, replica *replica, ts *topo.Server) *client {
t, err := throttler.NewThrottler("client", "TPS", 1, throttler.MaxRateModuleDisabled, 5 /* seconds */)
if err != nil {
log.Fatal(err)
}
- healthCheck := discovery.NewLegacyHealthCheck(5*time.Second, 1*time.Minute)
+ healthCheck := discovery.NewHealthCheck(context.Background(), 5*time.Second, 1*time.Minute, ts, "cell1", "")
c := &client{
primary: primary,
healthCheck: healthCheck,
throttler: t,
stopChan: make(chan struct{}),
}
- c.healthCheck.SetListener(c, false /* sendDownEvents */)
- c.healthCheck.AddTablet(replica.fakeTablet.Tablet, "name")
+ healthcheckCh := c.healthCheck.Subscribe()
+ c.healthcheckCh = healthcheckCh
+ c.healthCheck.AddTablet(replica.fakeTablet.Tablet)
return c
}
@@ -250,6 +252,8 @@ func (c *client) loop() {
select {
case <-c.stopChan:
return
+ case th := <-c.healthcheckCh:
+ c.StatsUpdate(th)
default:
}
@@ -273,10 +277,9 @@ func (c *client) stop() {
c.throttler.Close()
}
-// StatsUpdate implements discovery.LegacyHealthCheckStatsListener.
-// It gets called by the healthCheck instance every time a tablet broadcasts
+// StatsUpdate gets called by the healthCheck instance every time a tablet broadcasts
// a health update.
-func (c *client) StatsUpdate(ts *discovery.LegacyTabletStats) {
+func (c *client) StatsUpdate(ts *discovery.TabletHealth) {
// Ignore unless REPLICA or RDONLY.
if ts.Target.TabletType != topodatapb.TabletType_REPLICA && ts.Target.TabletType != topodatapb.TabletType_RDONLY {
return
@@ -294,9 +297,10 @@ func main() {
})
log.Infof("start rate set to: %v", *rate)
- replica := newReplica(*lagUpdateInterval, *replicaDegrationInterval, *replicaDegrationDuration)
+ ts := memorytopo.NewServer("cell1")
+ replica := newReplica(*lagUpdateInterval, *replicaDegrationInterval, *replicaDegrationDuration, ts)
primary := &primary{replica: replica}
- client := newClient(primary, replica)
+ client := newClient(primary, replica, ts)
client.run()
time.Sleep(*duration)
diff --git a/go/vt/throttler/manager.go b/go/vt/throttler/manager.go
index 32790a9d601..c2ee9f0a652 100644
--- a/go/vt/throttler/manager.go
+++ b/go/vt/throttler/manager.go
@@ -205,9 +205,9 @@ func (m *managerImpl) throttlerNamesLocked() []string {
return names
}
-// Log returns the most recent changes of the MaxReplicationLag module.
+// log returns the most recent changes of the MaxReplicationLag module.
// There will be one result for each processed replication lag record.
-func (m *managerImpl) Log(throttlerName string) ([]result, error) {
+func (m *managerImpl) log(throttlerName string) ([]result, error) {
m.mu.Lock()
defer m.mu.Unlock()
@@ -216,5 +216,5 @@ func (m *managerImpl) Log(throttlerName string) ([]result, error) {
return nil, fmt.Errorf("throttler: %v does not exist", throttlerName)
}
- return t.Log(), nil
+ return t.log(), nil
}
diff --git a/go/vt/throttler/max_replication_lag_module.go b/go/vt/throttler/max_replication_lag_module.go
index 68b31147965..f8037f7f975 100644
--- a/go/vt/throttler/max_replication_lag_module.go
+++ b/go/vt/throttler/max_replication_lag_module.go
@@ -54,7 +54,7 @@ const (
// i.e. we'll ignore lag records with lower lag from other replicas while we're
// waiting for the next record of this replica under test.
type replicaUnderTest struct {
- // key holds the discovery.LegacyTabletStats.Key value for the replica.
+ // key holds the key value for the replica.
key string
alias string
tabletType topodatapb.TabletType
@@ -114,8 +114,8 @@ type MaxReplicationLagModule struct {
// max rate calculation has changed. The field is immutable (set in Start().)
rateUpdateChan chan<- struct{}
- // lagRecords buffers the replication lag records received by the LegacyHealthCheck
- // listener. ProcessRecords() will process them.
+ // lagRecords buffers the replication lag records received by the HealthCheck
+ // subscriber. ProcessRecords() will process them.
lagRecords chan replicationLagRecord
wg sync.WaitGroup
@@ -238,7 +238,7 @@ func (m *MaxReplicationLagModule) resetConfiguration() {
}
// RecordReplicationLag records the current replication lag for processing.
-func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, ts *discovery.LegacyTabletStats) {
+func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, th *discovery.TabletHealth) {
m.mutableConfigMu.Lock()
if m.mutableConfig.MaxReplicationLagSec == ReplicationLagModuleDisabled {
m.mutableConfigMu.Unlock()
@@ -246,9 +246,9 @@ func (m *MaxReplicationLagModule) RecordReplicationLag(t time.Time, ts *discover
}
m.mutableConfigMu.Unlock()
- // Buffer data point for now to unblock the LegacyHealthCheck listener and process
+ // Buffer data point for now to unblock the HealthCheck subscriber and process
// it asynchronously in ProcessRecords().
- m.lagRecords <- replicationLagRecord{t, *ts}
+ m.lagRecords <- replicationLagRecord{t, *th}
}
// ProcessRecords is the main loop, run in a separate Go routine, which
@@ -331,7 +331,7 @@ func (m *MaxReplicationLagModule) recalculateRate(lagRecordNow replicationLagRec
var clear bool
var clearReason string
- if m.lagCache(lagRecordNow).ignoreSlowReplica(lagRecordNow.Key) {
+ if m.lagCache(lagRecordNow).ignoreSlowReplica(discovery.TabletToMapKey(lagRecordNow.Tablet)) {
r.Reason = fmt.Sprintf("skipping this replica because it's among the %d slowest %v tablets", m.getNSlowestReplicasConfig(lagRecordNow), lagRecordNow.Target.TabletType.String())
goto logResult
}
@@ -394,7 +394,7 @@ func (m *MaxReplicationLagModule) clearReplicaUnderTest(now time.Time, testedSta
// Verify that the current replica under test is not in an error state.
lr := lagRecordNow
- if m.replicaUnderTest.key != lr.Key {
+ if m.replicaUnderTest.key != discovery.TabletToMapKey(lr.Tablet) {
lr = m.lagCacheByType(m.replicaUnderTest.tabletType).latest(m.replicaUnderTest.key)
}
if lr.isZero() {
@@ -402,7 +402,7 @@ func (m *MaxReplicationLagModule) clearReplicaUnderTest(now time.Time, testedSta
return true, "it is no longer actively tracked"
}
if lr.LastError != nil {
- // LastError is set i.e. LegacyHealthCheck module cannot connect and the cached
+ // LastError is set i.e. HealthCheck module cannot connect and the cached
// data for the replica might be outdated.
return true, "it has LastError set i.e. is no longer correctly tracked"
}
@@ -445,7 +445,7 @@ func (m *MaxReplicationLagModule) isReplicaUnderTest(r *result, now time.Time, t
return true
}
- if m.replicaUnderTest.key != lagRecordNow.Key {
+ if m.replicaUnderTest.key != discovery.TabletToMapKey(lagRecordNow.Tablet) {
r.Reason = fmt.Sprintf("skipping this replica because we're waiting for the next lag record from the 'replica under test': %v", m.replicaUnderTest.alias)
return false
}
@@ -557,7 +557,7 @@ func (m *MaxReplicationLagModule) minTestDurationUntilNextIncrease(increase floa
func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time, lagRecordNow replicationLagRecord) {
// Guess replication rate based on the difference in the replication lag of this
// particular replica.
- lagRecordBefore := m.lagCache(lagRecordNow).atOrAfter(lagRecordNow.Key, m.lastRateChange)
+ lagRecordBefore := m.lagCache(lagRecordNow).atOrAfter(discovery.TabletToMapKey(lagRecordNow.Tablet), m.lastRateChange)
if lagRecordBefore.isZero() {
// We should see at least "lagRecordNow" here because we did just insert it
// in processRecord().
@@ -592,7 +592,7 @@ func (m *MaxReplicationLagModule) decreaseAndGuessRate(r *result, now time.Time,
if replicationLagChange == equal {
// The replication lag did not change. Keep going at the current rate.
- r.Reason = fmt.Sprintf("did not decrease the rate because the lag did not change (assuming a 1s error margin)") //nolint
+ r.Reason = "did not decrease the rate because the lag did not change (assuming a 1s error margin)"
return
}
@@ -705,7 +705,7 @@ func (m *MaxReplicationLagModule) updateRate(r *result, newState state, rate int
}
m.lastRateChange = now
- m.replicaUnderTest = &replicaUnderTest{lagRecordNow.Key, topoproto.TabletAliasString(lagRecordNow.Tablet.Alias), lagRecordNow.Target.TabletType, newState, now.Add(testDuration)}
+ m.replicaUnderTest = &replicaUnderTest{discovery.TabletToMapKey(lagRecordNow.Tablet), topoproto.TabletAliasString(lagRecordNow.Tablet.Alias), lagRecordNow.Target.TabletType, newState, now.Add(testDuration)}
if rate == oldRate {
return
diff --git a/go/vt/throttler/max_replication_lag_module_config.go b/go/vt/throttler/max_replication_lag_module_config.go
index e6a8e8a8494..775aa4639a4 100644
--- a/go/vt/throttler/max_replication_lag_module_config.go
+++ b/go/vt/throttler/max_replication_lag_module_config.go
@@ -85,51 +85,51 @@ func NewMaxReplicationLagModuleConfig(maxReplicationLag int64) MaxReplicationLag
// in protobuf.
// Verify returns an error if the config is invalid.
-func (c MaxReplicationLagModuleConfig) Verify() error {
- if c.TargetReplicationLagSec < 1 {
+func (cfg MaxReplicationLagModuleConfig) Verify() error {
+ if cfg.TargetReplicationLagSec < 1 {
return fmt.Errorf("target_replication_lag_sec must be >= 1")
}
- if c.MaxReplicationLagSec < 2 {
+ if cfg.MaxReplicationLagSec < 2 {
return fmt.Errorf("max_replication_lag_sec must be >= 2")
}
- if c.TargetReplicationLagSec > c.MaxReplicationLagSec {
+ if cfg.TargetReplicationLagSec > cfg.MaxReplicationLagSec {
return fmt.Errorf("target_replication_lag_sec must not be higher than max_replication_lag_sec: invalid: %v > %v",
- c.TargetReplicationLagSec, c.MaxReplicationLagSec)
+ cfg.TargetReplicationLagSec, cfg.MaxReplicationLagSec)
}
- if c.InitialRate < 1 {
+ if cfg.InitialRate < 1 {
return fmt.Errorf("initial_rate must be >= 1")
}
- if c.MaxIncrease <= 0 {
+ if cfg.MaxIncrease <= 0 {
return fmt.Errorf("max_increase must be > 0")
}
- if c.EmergencyDecrease <= 0 {
+ if cfg.EmergencyDecrease <= 0 {
return fmt.Errorf("emergency_decrease must be > 0")
}
- if c.MinDurationBetweenIncreasesSec < 1 {
+ if cfg.MinDurationBetweenIncreasesSec < 1 {
return fmt.Errorf("min_duration_between_increases_sec must be >= 1")
}
- if c.MaxDurationBetweenIncreasesSec < 1 {
+ if cfg.MaxDurationBetweenIncreasesSec < 1 {
return fmt.Errorf("max_duration_between_increases_sec must be >= 1")
}
- if c.MinDurationBetweenDecreasesSec < 1 {
+ if cfg.MinDurationBetweenDecreasesSec < 1 {
return fmt.Errorf("min_duration_between_decreases_sec must be >= 1")
}
- if c.SpreadBacklogAcrossSec < 1 {
+ if cfg.SpreadBacklogAcrossSec < 1 {
return fmt.Errorf("spread_backlog_across_sec must be >= 1")
}
- if c.IgnoreNSlowestReplicas < 0 {
+ if cfg.IgnoreNSlowestReplicas < 0 {
return fmt.Errorf("ignore_n_slowest_replicas must be >= 0")
}
- if c.IgnoreNSlowestRdonlys < 0 {
+ if cfg.IgnoreNSlowestRdonlys < 0 {
return fmt.Errorf("ignore_n_slowest_rdonlys must be >= 0")
}
- if c.AgeBadRateAfterSec < 1 {
+ if cfg.AgeBadRateAfterSec < 1 {
return fmt.Errorf("age_bad_rate_after_sec must be >= 1")
}
- if c.MaxRateApproachThreshold < 0 {
+ if cfg.MaxRateApproachThreshold < 0 {
return fmt.Errorf("max_rate_approach_threshold must be >=0")
}
- if c.MaxRateApproachThreshold > 1 {
+ if cfg.MaxRateApproachThreshold > 1 {
return fmt.Errorf("max_rate_approach_threshold must be <=1")
}
return nil
@@ -137,30 +137,30 @@ func (c MaxReplicationLagModuleConfig) Verify() error {
// MinDurationBetweenIncreases is a helper function which returns the respective
// protobuf field as native Go type.
-func (c MaxReplicationLagModuleConfig) MinDurationBetweenIncreases() time.Duration {
- return time.Duration(c.MinDurationBetweenIncreasesSec) * time.Second
+func (cfg MaxReplicationLagModuleConfig) MinDurationBetweenIncreases() time.Duration {
+ return time.Duration(cfg.MinDurationBetweenIncreasesSec) * time.Second
}
// MaxDurationBetweenIncreases is a helper function which returns the respective
// protobuf field as native Go type.
-func (c MaxReplicationLagModuleConfig) MaxDurationBetweenIncreases() time.Duration {
- return time.Duration(c.MaxDurationBetweenIncreasesSec) * time.Second
+func (cfg MaxReplicationLagModuleConfig) MaxDurationBetweenIncreases() time.Duration {
+ return time.Duration(cfg.MaxDurationBetweenIncreasesSec) * time.Second
}
// MinDurationBetweenDecreases is a helper function which returns the respective
// protobuf field as native Go type.
-func (c MaxReplicationLagModuleConfig) MinDurationBetweenDecreases() time.Duration {
- return time.Duration(c.MinDurationBetweenDecreasesSec) * time.Second
+func (cfg MaxReplicationLagModuleConfig) MinDurationBetweenDecreases() time.Duration {
+ return time.Duration(cfg.MinDurationBetweenDecreasesSec) * time.Second
}
// SpreadBacklogAcross is a helper function which returns the respective
// protobuf field as native Go type.
-func (c MaxReplicationLagModuleConfig) SpreadBacklogAcross() time.Duration {
- return time.Duration(c.SpreadBacklogAcrossSec) * time.Second
+func (cfg MaxReplicationLagModuleConfig) SpreadBacklogAcross() time.Duration {
+ return time.Duration(cfg.SpreadBacklogAcrossSec) * time.Second
}
// AgeBadRateAfter is a helper function which returns the respective
// protobuf field as native Go type.
-func (c MaxReplicationLagModuleConfig) AgeBadRateAfter() time.Duration {
- return time.Duration(c.AgeBadRateAfterSec) * time.Second
+func (cfg MaxReplicationLagModuleConfig) AgeBadRateAfter() time.Duration {
+ return time.Duration(cfg.AgeBadRateAfterSec) * time.Second
}
diff --git a/go/vt/throttler/max_replication_lag_module_test.go b/go/vt/throttler/max_replication_lag_module_test.go
index 1cb74c79e65..f0324df192c 100644
--- a/go/vt/throttler/max_replication_lag_module_test.go
+++ b/go/vt/throttler/max_replication_lag_module_test.go
@@ -225,7 +225,7 @@ func TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp(t *testing.T)
// r2 @ 75s, 0s lag, LastError set
rError := lagRecord(sinceZero(75*time.Second), r2, 0)
- rError.LastError = errors.New("LegacyHealthCheck reporting broken")
+ rError.LastError = errors.New("HealthCheck reporting broken")
tf.m.replicaLagCache.add(rError)
// r1 @ 110s, 0s lag
@@ -244,7 +244,7 @@ func TestMaxReplicationLagModule_ReplicaUnderTest_LastErrorOrNotUp(t *testing.T)
tf.ratesHistory.add(sinceZero(110*time.Second), 200)
tf.ratesHistory.add(sinceZero(114*time.Second), 400)
rNotUp := lagRecord(sinceZero(115*time.Second), r1, 0)
- rNotUp.Up = false
+ rNotUp.Serving = false
tf.m.replicaLagCache.add(rNotUp)
// r2 @ 150s, 0s lag (lastError no longer set)
@@ -453,7 +453,7 @@ func TestMaxReplicationLagModule_Increase_BadRateUpperBound(t *testing.T) {
t.Fatal(err)
}
- //Assume that a bad value of 150 was set @ 30s and log error
+ // Assume that a bad value of 150 was set @ 30s and log error
if err := tf.m.memory.markBad(150, sinceZero(30*time.Second)); err != nil {
log.Errorf("tf.m.memory.markBad(150, sinceZero(30*time.Second)) falied : %v", err)
}
@@ -955,7 +955,7 @@ func lagRecord(t time.Time, uid, lag uint32) replicationLagRecord {
}
// tabletStats creates fake tablet health data.
-func tabletStats(uid, lag uint32) discovery.LegacyTabletStats {
+func tabletStats(uid, lag uint32) discovery.TabletHealth {
typ := topodatapb.TabletType_REPLICA
if uid == rdonly1 || uid == rdonly2 {
typ = topodatapb.TabletType_RDONLY
@@ -967,21 +967,19 @@ func tabletStats(uid, lag uint32) discovery.LegacyTabletStats {
Type: typ,
PortMap: map[string]int32{"vt": int32(uid)},
}
- return discovery.LegacyTabletStats{
+ return discovery.TabletHealth{
Tablet: tablet,
- Key: discovery.TabletToMapKey(tablet),
Target: &querypb.Target{
Keyspace: "ks1",
Shard: "-80",
TabletType: typ,
},
- Up: true,
Serving: true,
Stats: &querypb.RealtimeStats{
ReplicationLagSeconds: lag,
},
- TabletExternallyReparentedTimestamp: 22,
- LastError: nil,
+ PrimaryTermStartTime: 22,
+ LastError: nil,
}
}
diff --git a/go/vt/throttler/replication_lag_cache.go b/go/vt/throttler/replication_lag_cache.go
index ee4a5b18377..c9c2e94f113 100644
--- a/go/vt/throttler/replication_lag_cache.go
+++ b/go/vt/throttler/replication_lag_cache.go
@@ -60,17 +60,17 @@ func newReplicationLagCache(historyCapacityPerReplica int) *replicationLagCache
-// add inserts or updates "r" in the cache for the replica with the key "r.Key".
+// add inserts or updates "r" in the cache, keyed by discovery.TabletToMapKey(r.Tablet).
func (c *replicationLagCache) add(r replicationLagRecord) {
- if !r.Up {
+ if !r.Serving {
- // Tablet is down. Do no longer track it.
+ // Tablet is not serving. Stop tracking it.
- delete(c.entries, r.Key)
- delete(c.ignoredSlowReplicasInARow, r.Key)
+ delete(c.entries, discovery.TabletToMapKey(r.Tablet))
+ delete(c.ignoredSlowReplicasInARow, discovery.TabletToMapKey(r.Tablet))
return
}
- entry, ok := c.entries[r.Key]
+ entry, ok := c.entries[discovery.TabletToMapKey(r.Tablet)]
if !ok {
entry = newReplicationLagHistory(c.historyCapacityPerReplica)
- c.entries[r.Key] = entry
+ c.entries[discovery.TabletToMapKey(r.Tablet)] = entry
}
entry.add(r)
@@ -114,7 +114,7 @@ func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumRepli
for _, v := range c.entries {
record := v.latest()
if int64(record.Stats.ReplicationLagSeconds) >= minimumReplicationLag {
- list = append(list, record.LegacyTabletStats)
+ list = append(list, record.TabletHealth)
i++
}
}
@@ -122,13 +122,13 @@ func (c *replicationLagCache) sortByLag(ignoreNSlowestReplicas int, minimumRepli
// Now remember the N slowest replicas.
for i := len(list) - 1; len(list) > 0 && i >= len(list)-ignoreNSlowestReplicas; i-- {
- c.slowReplicas[list[i].Key] = true
+ c.slowReplicas[discovery.TabletToMapKey(list[i].Tablet)] = true
}
}
-// byLagAndTabletUID is a slice of discovery.LegacyTabletStats elements that
+// byLagAndTabletUID is a slice of discovery.TabletHealth elements that
// implements sort.Interface to sort by replication lag and tablet Uid.
-type byLagAndTabletUID []discovery.LegacyTabletStats
+type byLagAndTabletUID []discovery.TabletHealth
func (a byLagAndTabletUID) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byLagAndTabletUID) Len() int { return len(a) }
diff --git a/go/vt/throttler/replication_lag_cache_test.go b/go/vt/throttler/replication_lag_cache_test.go
index c2381f05246..312f97e1999 100644
--- a/go/vt/throttler/replication_lag_cache_test.go
+++ b/go/vt/throttler/replication_lag_cache_test.go
@@ -19,6 +19,8 @@ package throttler
import (
"testing"
"time"
+
+ "vitess.io/vitess/go/vt/discovery"
)
// TestReplicationLagCache tests that the ring buffer in "replicationLagHistory"
@@ -27,7 +29,7 @@ import (
// max_replication_lag_module_test.go.
func TestReplicationLagCache(t *testing.T) {
c := newReplicationLagCache(2)
- r1Key := tabletStats(r1, 1).Key
+ r1Key := discovery.TabletToMapKey(tabletStats(r1, 1).Tablet)
// If there is no entry yet, a zero struct is returned.
zeroEntry := c.atOrAfter(r1Key, sinceZero(0*time.Second))
@@ -73,7 +75,7 @@ func TestReplicationLagCache(t *testing.T) {
func TestReplicationLagCache_SortByLag(t *testing.T) {
c := newReplicationLagCache(2)
- r1Key := tabletStats(r1, 1).Key
+ r1Key := discovery.TabletToMapKey(tabletStats(r1, 1).Tablet)
c.add(lagRecord(sinceZero(1*time.Second), r1, 30))
c.sortByLag(1 /* ignoreNSlowestReplicas */, 30 /* minimumReplicationLag */)
diff --git a/go/vt/throttler/replication_lag_record.go b/go/vt/throttler/replication_lag_record.go
index 734cb3df43a..6fdccbd8810 100644
--- a/go/vt/throttler/replication_lag_record.go
+++ b/go/vt/throttler/replication_lag_record.go
@@ -23,13 +23,13 @@ import (
)
// replicationLagRecord stores the tablet health data for a given point in time.
-// This data is obtained via the LegacyHealthCheck module.
+// This data is obtained via the HealthCheck module.
type replicationLagRecord struct {
// time is the time at which "value" was observed.
time time.Time
- // LegacyTabletStats holds a copy of the current health data of the tablet.
+ // TabletHealth holds a copy of the current health data of the tablet.
- discovery.LegacyTabletStats
+ discovery.TabletHealth
}
func (r replicationLagRecord) isZero() bool {
diff --git a/go/vt/throttler/throttler.go b/go/vt/throttler/throttler.go
index ea0096bc537..b731bcb2fbe 100644
--- a/go/vt/throttler/throttler.go
+++ b/go/vt/throttler/throttler.go
@@ -295,8 +295,8 @@ func (t *Throttler) SetMaxRate(rate int64) {
-// RecordReplicationLag must be called by users to report the "ts" tablet health
+// RecordReplicationLag must be called by users to report the "th" tablet health
// data observed at "time".
// Note: After Close() is called, this method must not be called anymore.
-func (t *Throttler) RecordReplicationLag(time time.Time, ts *discovery.LegacyTabletStats) {
- t.maxReplicationLagModule.RecordReplicationLag(time, ts)
+func (t *Throttler) RecordReplicationLag(time time.Time, th *discovery.TabletHealth) {
+ t.maxReplicationLagModule.RecordReplicationLag(time, th)
}
// GetConfiguration returns the configuration of the MaxReplicationLag module.
@@ -315,7 +315,7 @@ func (t *Throttler) ResetConfiguration() {
t.maxReplicationLagModule.resetConfiguration()
}
-// Log returns the most recent changes of the MaxReplicationLag module.
-func (t *Throttler) Log() []result {
+// log returns the most recent changes of the MaxReplicationLag module.
+func (t *Throttler) log() []result {
return t.maxReplicationLagModule.log()
}
diff --git a/go/vt/throttler/throttlerlogz.go b/go/vt/throttler/throttlerlogz.go
index 23b49bdc070..6952b34feec 100644
--- a/go/vt/throttler/throttlerlogz.go
+++ b/go/vt/throttler/throttlerlogz.go
@@ -125,7 +125,7 @@ func throttlerlogzHandler(w http.ResponseWriter, r *http.Request, m *managerImpl
}
func showThrottlerLog(w http.ResponseWriter, m *managerImpl, name string) {
- results, err := m.Log(name)
+ results, err := m.log(name)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go
index 3279a87f20b..683c51662e8 100644
--- a/go/vt/vtctl/vtctl.go
+++ b/go/vt/vtctl/vtctl.go
@@ -2269,7 +2269,7 @@ func commandVRWorkflow(ctx context.Context, wr *wrangler.Wrangler, subFlags *fla
return err
}
- //TODO: check if invalid parameters were passed in that do not apply to this action
+ // TODO: check if invalid parameters were passed in that do not apply to this action
originalAction := action
action = strings.ToLower(action) // allow users to input action in a case-insensitive manner
if workflowType == wrangler.MigrateWorkflow {
@@ -2515,7 +2515,7 @@ func commandVRWorkflow(ctx context.Context, wr *wrangler.Wrangler, subFlags *fla
func commandCreateLookupVindex(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.FlagSet, args []string) error {
cells := subFlags.String("cells", "", "Source cells to replicate from.")
- //TODO: keep --cell around for backward compatibility and remove it in a future version
+ // TODO: keep --cell around for backward compatibility and remove it in a future version
cell := subFlags.String("cell", "", "Cell to replicate from.")
tabletTypes := subFlags.String("tablet_types", "", "Source tablet types to replicate from.")
continueAfterCopyWithOwner := subFlags.Bool("continue_after_copy_with_owner", false, "Vindex will continue materialization after copy when an owner is provided")
@@ -2587,7 +2587,7 @@ func commandVDiff(ctx context.Context, wr *wrangler.Wrangler, subFlags *flag.Fla
maxRows := subFlags.Int64("limit", math.MaxInt64, "Max rows to stop comparing after")
debugQuery := subFlags.Bool("debug_query", false, "Adds a mysql query to the report that can be used for further debugging")
onlyPks := subFlags.Bool("only_pks", false, "When reporting missing rows, only show primary keys in the report.")
- format := subFlags.String("format", "", "Format of report") //"json" or ""
+ format := subFlags.String("format", "", "Format of report") // "json" or ""
tables := subFlags.String("tables", "", "Only run vdiff for these tables in the workflow")
maxExtraRowsToCompare := subFlags.Int("max_extra_rows_to_compare", 1000, "If there are collation differences between the source and target, you can have rows that are identical but simply returned in a different order from MySQL. We will do a second pass to compare the rows for any actual differences in this case and this flag allows you to control the resources used for this operation.")
if err := subFlags.Parse(args); err != nil {
diff --git a/go/vt/vtctld/api.go b/go/vt/vtctld/api.go
index 80cc988db58..781b4416246 100644
--- a/go/vt/vtctld/api.go
+++ b/go/vt/vtctld/api.go
@@ -62,7 +62,7 @@ const (
jsonContentType = "application/json; charset=utf-8"
)
-// TabletStats represents realtime stats from a discovery.LegacyTabletStats struct.
+// TabletStats represents realtime stats from a discovery.TabletHealth struct.
type TabletStats struct {
LastError string `json:"last_error,omitempty"`
Realtime *querypb.RealtimeStats `json:"realtime,omitempty"`
diff --git a/go/vt/vtgate/buffer/buffer.go b/go/vt/vtgate/buffer/buffer.go
index ab4d5a32000..25fe4181a4a 100644
--- a/go/vt/vtgate/buffer/buffer.go
+++ b/go/vt/vtgate/buffer/buffer.go
@@ -93,7 +93,7 @@ type Buffer struct {
// In particular, it is used to serialize the following Go routines:
// - 1. Requests which may buffer (RLock, can be run in parallel)
// - 2. Request which starts buffering (based on the seen error)
- // - 3. LegacyHealthCheck listener ("StatsUpdate") which stops buffering
+ // - 3. HealthCheck subscriber which stops buffering
// - 4. Timer which may stop buffering after -buffer_max_failover_duration
mu sync.RWMutex
// buffers holds a shardBuffer object per shard, even if no failover is in
@@ -171,29 +171,6 @@ func (b *Buffer) HandleKeyspaceEvent(ksevent *discovery.KeyspaceEvent) {
}
}
-// StatsUpdate keeps track of the "tablet_externally_reparented_timestamp" of
-// each primary. This way we can detect the end of a failover.
-// It is part of the discovery.LegacyHealthCheckStatsListener interface.
-func (b *Buffer) StatsUpdate(ts *discovery.LegacyTabletStats) {
- if ts.Target.TabletType != topodatapb.TabletType_PRIMARY {
- panic(fmt.Sprintf("BUG: non-PRIMARY LegacyTabletStats object must not be forwarded: %#v", ts))
- }
-
- timestamp := ts.TabletExternallyReparentedTimestamp
- if timestamp == 0 {
- // Primarys where TabletExternallyReparented was never called will return 0.
- // Ignore them.
- return
- }
-
- sb := b.getOrCreateBuffer(ts.Target.Keyspace, ts.Target.Shard)
- if sb == nil {
- // Buffer is shut down. Ignore all calls.
- return
- }
- sb.recordExternallyReparentedTimestamp(timestamp, ts.Tablet.Alias)
-}
-
// getOrCreateBuffer returns the ShardBuffer for the given keyspace and shard.
// It returns nil if Buffer is shut down and all calls should be ignored.
func (b *Buffer) getOrCreateBuffer(keyspace, shard string) *shardBuffer {
diff --git a/go/vt/vtgate/buffer/buffer_helper_test.go b/go/vt/vtgate/buffer/buffer_helper_test.go
index 38983426a88..442e78d08f7 100644
--- a/go/vt/vtgate/buffer/buffer_helper_test.go
+++ b/go/vt/vtgate/buffer/buffer_helper_test.go
@@ -31,17 +31,6 @@ func testAllImplementations(t *testing.T, runTest func(t *testing.T, fail failov
})
})
- t.Run("LegacyHealthCheck", func(t *testing.T) {
- t.Helper()
- runTest(t, func(buf *Buffer, tablet *topodatapb.Tablet, keyspace, shard string, now time.Time) {
- buf.StatsUpdate(&discovery.LegacyTabletStats{
- Tablet: tablet,
- Target: &query.Target{Keyspace: keyspace, Shard: shard, TabletType: topodatapb.TabletType_PRIMARY},
- TabletExternallyReparentedTimestamp: now.Unix(),
- })
- })
- })
-
t.Run("KeyspaceEvent", func(t *testing.T) {
t.Helper()
runTest(t, func(buf *Buffer, tablet *topodatapb.Tablet, keyspace, shard string, now time.Time) {
diff --git a/go/vt/vtgate/executor_test.go b/go/vt/vtgate/executor_test.go
index 10c794ab66c..0ee9fce0ed1 100644
--- a/go/vt/vtgate/executor_test.go
+++ b/go/vt/vtgate/executor_test.go
@@ -792,8 +792,6 @@ func TestExecutorShow(t *testing.T) {
}
utils.MustMatch(t, wantqr, qr, query)
- // The FakeLegacyTablets in FakeLegacyHealthCheck don't have support for these columns/values
- // So let's just be sure the statement works and we get the expected results (none)
query = "show vitess_replication_status"
qr, err = executor.Execute(ctx, "TestExecute", session, query, nil)
require.NoError(t, err)
diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go
index cf098a7b3e0..17f21f7690b 100644
--- a/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go
+++ b/go/vt/vttablet/tabletserver/txthrottler/mock_healthcheck_test.go
@@ -1,20 +1,22 @@
// Code generated by MockGen. DO NOT EDIT.
-// Source: vitess.io/vitess/go/vt/discovery (interfaces: LegacyHealthCheck)
+// Source: vitess.io/vitess/go/vt/discovery (interfaces: HealthCheck)
// Package txthrottler is a generated GoMock package.
package txthrottler
import (
+ context "context"
reflect "reflect"
gomock "github.com/golang/mock/gomock"
discovery "vitess.io/vitess/go/vt/discovery"
+ query "vitess.io/vitess/go/vt/proto/query"
topodata "vitess.io/vitess/go/vt/proto/topodata"
queryservice "vitess.io/vitess/go/vt/vttablet/queryservice"
)
-// MockHealthCheck is a mock of LegacyHealthCheck interface.
+// MockHealthCheck is a mock of HealthCheck interface.
type MockHealthCheck struct {
ctrl *gomock.Controller
recorder *MockHealthCheckMockRecorder
@@ -38,22 +40,22 @@ func (m *MockHealthCheck) EXPECT() *MockHealthCheckMockRecorder {
}
// AddTablet mocks base method.
-func (m *MockHealthCheck) AddTablet(arg0 *topodata.Tablet, arg1 string) {
+func (m *MockHealthCheck) AddTablet(arg0 *topodata.Tablet) {
m.ctrl.T.Helper()
- m.ctrl.Call(m, "AddTablet", arg0, arg1)
+ m.ctrl.Call(m, "AddTablet", arg0)
}
// AddTablet indicates an expected call of AddTablet.
-func (mr *MockHealthCheckMockRecorder) AddTablet(arg0, arg1 any) *gomock.Call {
+func (mr *MockHealthCheckMockRecorder) AddTablet(arg0 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddTablet", reflect.TypeOf((*MockHealthCheck)(nil).AddTablet), arg0, arg1)
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddTablet", reflect.TypeOf((*MockHealthCheck)(nil).AddTablet), arg0)
}
// CacheStatus mocks base method.
-func (m *MockHealthCheck) CacheStatus() discovery.LegacyTabletsCacheStatusList {
+func (m *MockHealthCheck) CacheStatus() discovery.TabletsCacheStatusList {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "CacheStatus")
- ret0, _ := ret[0].(discovery.LegacyTabletsCacheStatusList)
+ ret0, _ := ret[0].(discovery.TabletsCacheStatusList)
return ret0
}
@@ -63,6 +65,20 @@ func (mr *MockHealthCheckMockRecorder) CacheStatus() *gomock.Call {
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CacheStatus", reflect.TypeOf((*MockHealthCheck)(nil).CacheStatus))
}
+// CacheStatusMap mocks base method.
+func (m *MockHealthCheck) CacheStatusMap() map[string]*discovery.TabletsCacheStatus {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "CacheStatusMap")
+ ret0, _ := ret[0].(map[string]*discovery.TabletsCacheStatus)
+ return ret0
+}
+
+// CacheStatusMap indicates an expected call of CacheStatusMap.
+func (mr *MockHealthCheckMockRecorder) CacheStatusMap() *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CacheStatusMap", reflect.TypeOf((*MockHealthCheck)(nil).CacheStatusMap))
+}
+
// Close mocks base method.
func (m *MockHealthCheck) Close() error {
m.ctrl.T.Helper()
@@ -77,18 +93,48 @@ func (mr *MockHealthCheckMockRecorder) Close() *gomock.Call {
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockHealthCheck)(nil).Close))
}
-// GetConnection mocks base method.
-func (m *MockHealthCheck) GetConnection(arg0 string) queryservice.QueryService {
+// GetHealthyTabletStats mocks base method.
+func (m *MockHealthCheck) GetHealthyTabletStats(arg0 *query.Target) []*discovery.TabletHealth {
m.ctrl.T.Helper()
- ret := m.ctrl.Call(m, "GetConnection", arg0)
- ret0, _ := ret[0].(queryservice.QueryService)
+ ret := m.ctrl.Call(m, "GetHealthyTabletStats", arg0)
+ ret0, _ := ret[0].([]*discovery.TabletHealth)
return ret0
}
-// GetConnection indicates an expected call of GetConnection.
-func (mr *MockHealthCheckMockRecorder) GetConnection(arg0 any) *gomock.Call {
+// GetHealthyTabletStats indicates an expected call of GetHealthyTabletStats.
+func (mr *MockHealthCheckMockRecorder) GetHealthyTabletStats(arg0 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHealthyTabletStats", reflect.TypeOf((*MockHealthCheck)(nil).GetHealthyTabletStats), arg0)
+}
+
+// GetTabletHealth mocks base method.
+func (m *MockHealthCheck) GetTabletHealth(arg0 discovery.KeyspaceShardTabletType, arg1 *topodata.TabletAlias) (*discovery.TabletHealth, error) {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "GetTabletHealth", arg0, arg1)
+ ret0, _ := ret[0].(*discovery.TabletHealth)
+ ret1, _ := ret[1].(error)
+ return ret0, ret1
+}
+
+// GetTabletHealth indicates an expected call of GetTabletHealth.
+func (mr *MockHealthCheckMockRecorder) GetTabletHealth(arg0, arg1 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTabletHealth", reflect.TypeOf((*MockHealthCheck)(nil).GetTabletHealth), arg0, arg1)
+}
+
+// GetTabletHealthByAlias mocks base method.
+func (m *MockHealthCheck) GetTabletHealthByAlias(arg0 *topodata.TabletAlias) (*discovery.TabletHealth, error) {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "GetTabletHealthByAlias", arg0)
+ ret0, _ := ret[0].(*discovery.TabletHealth)
+ ret1, _ := ret[1].(error)
+ return ret0, ret1
+}
+
+// GetTabletHealthByAlias indicates an expected call of GetTabletHealthByAlias.
+func (mr *MockHealthCheckMockRecorder) GetTabletHealthByAlias(arg0 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetConnection", reflect.TypeOf((*MockHealthCheck)(nil).GetConnection), arg0)
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTabletHealthByAlias", reflect.TypeOf((*MockHealthCheck)(nil).GetTabletHealthByAlias), arg0)
}
// RegisterStats mocks base method.
@@ -110,43 +156,74 @@ func (m *MockHealthCheck) RemoveTablet(arg0 *topodata.Tablet) {
}
// RemoveTablet indicates an expected call of RemoveTablet.
-func (mr *MockHealthCheckMockRecorder) RemoveTablet(arg0 any) *gomock.Call {
+func (mr *MockHealthCheckMockRecorder) RemoveTablet(arg0 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveTablet", reflect.TypeOf((*MockHealthCheck)(nil).RemoveTablet), arg0)
}
// ReplaceTablet mocks base method.
-func (m *MockHealthCheck) ReplaceTablet(arg0, arg1 *topodata.Tablet, arg2 string) {
+func (m *MockHealthCheck) ReplaceTablet(arg0, arg1 *topodata.Tablet) {
m.ctrl.T.Helper()
- m.ctrl.Call(m, "ReplaceTablet", arg0, arg1, arg2)
+ m.ctrl.Call(m, "ReplaceTablet", arg0, arg1)
}
// ReplaceTablet indicates an expected call of ReplaceTablet.
-func (mr *MockHealthCheckMockRecorder) ReplaceTablet(arg0, arg1, arg2 any) *gomock.Call {
+func (mr *MockHealthCheckMockRecorder) ReplaceTablet(arg0, arg1 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReplaceTablet", reflect.TypeOf((*MockHealthCheck)(nil).ReplaceTablet), arg0, arg1)
+}
+
+// Subscribe mocks base method.
+func (m *MockHealthCheck) Subscribe() chan *discovery.TabletHealth {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "Subscribe")
+ ret0, _ := ret[0].(chan *discovery.TabletHealth)
+ return ret0
+}
+
+// Subscribe indicates an expected call of Subscribe.
+func (mr *MockHealthCheckMockRecorder) Subscribe() *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReplaceTablet", reflect.TypeOf((*MockHealthCheck)(nil).ReplaceTablet), arg0, arg1, arg2)
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Subscribe", reflect.TypeOf((*MockHealthCheck)(nil).Subscribe))
}
-// SetListener mocks base method.
-func (m *MockHealthCheck) SetListener(arg0 discovery.LegacyHealthCheckStatsListener, arg1 bool) {
+// TabletConnection mocks base method.
+func (m *MockHealthCheck) TabletConnection(arg0 *topodata.TabletAlias, arg1 *query.Target) (queryservice.QueryService, error) {
m.ctrl.T.Helper()
- m.ctrl.Call(m, "SetListener", arg0, arg1)
+ ret := m.ctrl.Call(m, "TabletConnection", arg0, arg1)
+ ret0, _ := ret[0].(queryservice.QueryService)
+ ret1, _ := ret[1].(error)
+ return ret0, ret1
}
-// SetListener indicates an expected call of SetListener.
-func (mr *MockHealthCheckMockRecorder) SetListener(arg0, arg1 any) *gomock.Call {
+// TabletConnection indicates an expected call of TabletConnection.
+func (mr *MockHealthCheckMockRecorder) TabletConnection(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetListener", reflect.TypeOf((*MockHealthCheck)(nil).SetListener), arg0, arg1)
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TabletConnection", reflect.TypeOf((*MockHealthCheck)(nil).TabletConnection), arg0, arg1)
}
-// WaitForInitialStatsUpdates mocks base method.
-func (m *MockHealthCheck) WaitForInitialStatsUpdates() {
+// Unsubscribe mocks base method.
+func (m *MockHealthCheck) Unsubscribe(arg0 chan *discovery.TabletHealth) {
m.ctrl.T.Helper()
- m.ctrl.Call(m, "WaitForInitialStatsUpdates")
+ m.ctrl.Call(m, "Unsubscribe", arg0)
+}
+
+// Unsubscribe indicates an expected call of Unsubscribe.
+func (mr *MockHealthCheckMockRecorder) Unsubscribe(arg0 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Unsubscribe", reflect.TypeOf((*MockHealthCheck)(nil).Unsubscribe), arg0)
+}
+
+// WaitForAllServingTablets mocks base method.
+func (m *MockHealthCheck) WaitForAllServingTablets(arg0 context.Context, arg1 []*query.Target) error {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "WaitForAllServingTablets", arg0, arg1)
+ ret0, _ := ret[0].(error)
+ return ret0
}
-// WaitForInitialStatsUpdates indicates an expected call of WaitForInitialStatsUpdates.
-func (mr *MockHealthCheckMockRecorder) WaitForInitialStatsUpdates() *gomock.Call {
+// WaitForAllServingTablets indicates an expected call of WaitForAllServingTablets.
+func (mr *MockHealthCheckMockRecorder) WaitForAllServingTablets(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForInitialStatsUpdates", reflect.TypeOf((*MockHealthCheck)(nil).WaitForInitialStatsUpdates))
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForAllServingTablets", reflect.TypeOf((*MockHealthCheck)(nil).WaitForAllServingTablets), arg0, arg1)
}
diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go
index a3da535037e..53b827d591a 100644
--- a/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go
+++ b/go/vt/vttablet/tabletserver/txthrottler/mock_throttler_test.go
@@ -78,13 +78,13 @@ func (mr *MockThrottlerInterfaceMockRecorder) MaxRate() *gomock.Call {
}
// RecordReplicationLag mocks base method.
-func (m *MockThrottlerInterface) RecordReplicationLag(arg0 time.Time, arg1 *discovery.LegacyTabletStats) {
+func (m *MockThrottlerInterface) RecordReplicationLag(arg0 time.Time, arg1 *discovery.TabletHealth) {
m.ctrl.T.Helper()
m.ctrl.Call(m, "RecordReplicationLag", arg0, arg1)
}
// RecordReplicationLag indicates an expected call of RecordReplicationLag.
-func (mr *MockThrottlerInterfaceMockRecorder) RecordReplicationLag(arg0, arg1 any) *gomock.Call {
+func (mr *MockThrottlerInterfaceMockRecorder) RecordReplicationLag(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RecordReplicationLag", reflect.TypeOf((*MockThrottlerInterface)(nil).RecordReplicationLag), arg0, arg1)
}
@@ -108,7 +108,7 @@ func (m *MockThrottlerInterface) SetMaxRate(arg0 int64) {
}
// SetMaxRate indicates an expected call of SetMaxRate.
-func (mr *MockThrottlerInterfaceMockRecorder) SetMaxRate(arg0 any) *gomock.Call {
+func (mr *MockThrottlerInterfaceMockRecorder) SetMaxRate(arg0 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetMaxRate", reflect.TypeOf((*MockThrottlerInterface)(nil).SetMaxRate), arg0)
}
@@ -120,7 +120,7 @@ func (m *MockThrottlerInterface) ThreadFinished(arg0 int) {
}
// ThreadFinished indicates an expected call of ThreadFinished.
-func (mr *MockThrottlerInterfaceMockRecorder) ThreadFinished(arg0 any) *gomock.Call {
+func (mr *MockThrottlerInterfaceMockRecorder) ThreadFinished(arg0 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ThreadFinished", reflect.TypeOf((*MockThrottlerInterface)(nil).ThreadFinished), arg0)
}
@@ -134,7 +134,7 @@ func (m *MockThrottlerInterface) Throttle(arg0 int) time.Duration {
}
// Throttle indicates an expected call of Throttle.
-func (mr *MockThrottlerInterfaceMockRecorder) Throttle(arg0 any) *gomock.Call {
+func (mr *MockThrottlerInterfaceMockRecorder) Throttle(arg0 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Throttle", reflect.TypeOf((*MockThrottlerInterface)(nil).Throttle), arg0)
}
@@ -148,7 +148,7 @@ func (m *MockThrottlerInterface) UpdateConfiguration(arg0 *throttlerdata.Configu
}
// UpdateConfiguration indicates an expected call of UpdateConfiguration.
-func (mr *MockThrottlerInterfaceMockRecorder) UpdateConfiguration(arg0, arg1 any) *gomock.Call {
+func (mr *MockThrottlerInterfaceMockRecorder) UpdateConfiguration(arg0, arg1 interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateConfiguration", reflect.TypeOf((*MockThrottlerInterface)(nil).UpdateConfiguration), arg0, arg1)
}
diff --git a/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go b/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go
index 3e2e6f803c2..5afb16d3473 100644
--- a/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go
+++ b/go/vt/vttablet/tabletserver/txthrottler/mock_topology_watcher_test.go
@@ -33,28 +33,26 @@ func (m *MockTopologyWatcherInterface) EXPECT() *MockTopologyWatcherInterfaceMoc
return m.recorder
}
-// Stop mocks base method.
-func (m *MockTopologyWatcherInterface) Stop() {
+// Start mocks base method.
+func (m *MockTopologyWatcherInterface) Start() {
m.ctrl.T.Helper()
- m.ctrl.Call(m, "Stop")
+ m.ctrl.Call(m, "Start")
}
-// Stop indicates an expected call of Stop.
-func (mr *MockTopologyWatcherInterfaceMockRecorder) Stop() *gomock.Call {
+// Start indicates an expected call of Start.
+func (mr *MockTopologyWatcherInterfaceMockRecorder) Start() *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).Stop))
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Start", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).Start))
}
-// WaitForInitialTopology mocks base method.
-func (m *MockTopologyWatcherInterface) WaitForInitialTopology() error {
+// Stop mocks base method.
+func (m *MockTopologyWatcherInterface) Stop() {
m.ctrl.T.Helper()
- ret := m.ctrl.Call(m, "WaitForInitialTopology")
- ret0, _ := ret[0].(error)
- return ret0
+ m.ctrl.Call(m, "Stop")
}
-// WaitForInitialTopology indicates an expected call of WaitForInitialTopology.
-func (mr *MockTopologyWatcherInterfaceMockRecorder) WaitForInitialTopology() *gomock.Call {
+// Stop indicates an expected call of Stop.
+func (mr *MockTopologyWatcherInterfaceMockRecorder) Stop() *gomock.Call {
mr.mock.ctrl.T.Helper()
- return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForInitialTopology", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).WaitForInitialTopology))
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockTopologyWatcherInterface)(nil).Stop))
}
diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go
index 217c3ac1f1e..6f0a10d1776 100644
--- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go
+++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go
@@ -18,6 +18,7 @@ package txthrottler
import (
"fmt"
+ "strings"
"sync"
"time"
@@ -40,7 +41,7 @@ import (
// TxThrottler throttles transactions based on replication lag.
// It's a thin wrapper around the throttler found in vitess/go/vt/throttler.
-// It uses a discovery.LegacyHealthCheck to send replication-lag updates to the wrapped throttler.
+// It uses a discovery.HealthCheck to send replication-lag updates to the wrapped throttler.
//
// Intended Usage:
// // Assuming topoServer is a topo.Server variable pointing to a Vitess topology server.
@@ -148,7 +149,7 @@ type ThrottlerInterface interface {
Close()
MaxRate() int64
SetMaxRate(rate int64)
- RecordReplicationLag(time time.Time, ts *discovery.LegacyTabletStats)
+ RecordReplicationLag(time time.Time, th *discovery.TabletHealth)
GetConfiguration() *throttlerdatapb.Configuration
UpdateConfiguration(configuration *throttlerdatapb.Configuration, copyZeroValues bool) error
ResetConfiguration()
@@ -158,7 +159,7 @@ type ThrottlerInterface interface {
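-// discovery.LegacyTopologyWatcher. It is only used here to allow mocking out
-// go/vt/discovery.LegacyTopologyWatcher.
+// discovery.TopologyWatcher. It is only used here to allow mocking out
+// go/vt/discovery.TopologyWatcher.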
type TopologyWatcherInterface interface {
- WaitForInitialTopology() error
+ Start()
Stop()
}
@@ -166,18 +167,19 @@ type TopologyWatcherInterface interface {
type txThrottlerState struct {
// throttleMu serializes calls to throttler.Throttler.Throttle(threadId).
// That method is required to be called in serial for each threadId.
- throttleMu sync.Mutex
- throttler ThrottlerInterface
+ throttleMu sync.Mutex
+ throttler ThrottlerInterface
+ stopHealthCheck context.CancelFunc // cancels the goroutine that forwards health check updates to StatsUpdate
- healthCheck discovery.LegacyHealthCheck
+ healthCheck discovery.HealthCheck
topologyWatchers []TopologyWatcherInterface
}
// These vars store the functions used to create the topo server, healthcheck,
// topology watchers and go/vt/throttler. These are provided here so that they can be overridden
// in tests to generate mocks.
-type healthCheckFactoryFunc func() discovery.LegacyHealthCheck
-type topologyWatcherFactoryFunc func(topoServer *topo.Server, tr discovery.LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface
+type healthCheckFactoryFunc func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck
+type topologyWatcherFactoryFunc func(topoServer *topo.Server, hc discovery.HealthCheck, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface
type throttlerFactoryFunc func(name, unit string, threadCount int, maxRate, maxReplicationLag int64) (ThrottlerInterface, error)
var (
@@ -191,9 +193,11 @@ func init() {
}
func resetTxThrottlerFactories() {
- healthCheckFactory = discovery.NewLegacyDefaultHealthCheck
- topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface {
- return discovery.NewLegacyShardReplicationWatcher(context.Background(), topoServer, tr, cell, keyspace, shard, refreshInterval, topoReadConcurrency)
+ healthCheckFactory = func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck {
+ return discovery.NewHealthCheck(context.Background(), discovery.DefaultHealthCheckRetryDelay, discovery.DefaultHealthCheckTimeout, topoServer, cell, strings.Join(cellsToWatch, ","))
+ }
+ topologyWatcherFactory = func(topoServer *topo.Server, hc discovery.HealthCheck, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface {
+ return discovery.NewCellTabletsWatcher(context.Background(), topoServer, hc, discovery.NewFilterByKeyspace([]string{keyspace}), cell, refreshInterval, true, topoReadConcurrency)
}
throttlerFactory = func(name, unit string, threadCount int, maxRate, maxReplicationLag int64) (ThrottlerInterface, error) {
return throttler.NewThrottler(name, unit, threadCount, maxRate, maxReplicationLag)
@@ -230,7 +234,7 @@ func (t *TxThrottler) Open() error {
}
log.Info("TxThrottler: opening")
var err error
- t.state, err = newTxThrottlerState(t.config, t.target.Keyspace, t.target.Shard)
+ t.state, err = newTxThrottlerState(t.config, t.target.Keyspace, t.target.Shard, t.target.Cell)
return err
}
@@ -263,8 +267,7 @@ func (t *TxThrottler) Throttle() (result bool) {
return t.state.throttle()
}
-func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string,
-) (*txThrottlerState, error) {
+func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard, cell string) (*txThrottlerState, error) {
t, err := throttlerFactory(
TxThrottlerName,
"TPS", /* unit */
@@ -281,8 +284,8 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string,
result := &txThrottlerState{
throttler: t,
}
- result.healthCheck = healthCheckFactory()
- result.healthCheck.SetListener(result, false /* sendDownEvents */)
+ createTxThrottlerHealthCheck(config, result, cell)
+
result.topologyWatchers = make(
[]TopologyWatcherInterface, 0, len(config.healthCheckCells))
for _, cell := range config.healthCheckCells {
@@ -290,7 +293,7 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string,
result.topologyWatchers,
topologyWatcherFactory(
config.topoServer,
- result.healthCheck, /* LegacyTabletRecorder */
+ result.healthCheck,
cell,
keyspace,
shard,
@@ -300,6 +303,23 @@ func newTxThrottlerState(config *txThrottlerConfig, keyspace, shard string,
return result, nil
}
+// createTxThrottlerHealthCheck creates the health check for the given cell via
+// healthCheckFactory, stores it on result, and starts a goroutine that forwards
+// every tablet health update received from the health check's subscription to
+// result.StatsUpdate. The goroutine exits when result.stopHealthCheck is called
+// or when the subscription channel is closed.
+func createTxThrottlerHealthCheck(config *txThrottlerConfig, result *txThrottlerState, cell string) {
+	ctx, cancel := context.WithCancel(context.Background())
+	result.stopHealthCheck = cancel
+	result.healthCheck = healthCheckFactory(config.topoServer, cell, config.healthCheckCells)
+	ch := result.healthCheck.Subscribe()
+	go func(ctx context.Context) {
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case th, ok := <-ch:
+				if !ok {
+					// The subscription channel was closed: every further receive
+					// would yield nil immediately, so stop instead of spinning
+					// and handing nil to StatsUpdate.
+					return
+				}
+				result.StatsUpdate(th)
+			}
+		}
+	}(ctx)
+}
+
func (ts *txThrottlerState) throttle() bool {
if ts.throttler == nil {
panic("BUG: throttle called after deallocateResources was called.")
@@ -328,8 +348,8 @@ func (ts *txThrottlerState) deallocateResources() {
ts.throttler = nil
}
-// StatsUpdate is part of the LegacyHealthCheckStatsListener interface.
-func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.LegacyTabletStats) {
+// StatsUpdate handles a tablet health update received from the healthcheck subscription.
+func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) {
// Ignore PRIMARY and RDONLY stats.
// We currently do not monitor RDONLY tablets for replication lag. RDONLY tablets are not
// candidates for becoming primary during failover, and it's acceptable to serve somewhat
diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go
index cefba6746d7..1606fa2cf4c 100644
--- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go
+++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go
@@ -17,7 +17,7 @@ limitations under the License.
package txthrottler
// Commands to generate the mocks for this test.
-//go:generate mockgen -destination mock_healthcheck_test.go -package txthrottler -mock_names "LegacyHealthCheck=MockHealthCheck" vitess.io/vitess/go/vt/discovery LegacyHealthCheck
+//go:generate mockgen -destination mock_healthcheck_test.go -package txthrottler -mock_names "HealthCheck=MockHealthCheck" vitess.io/vitess/go/vt/discovery HealthCheck
//go:generate mockgen -destination mock_throttler_test.go -package txthrottler vitess.io/vitess/go/vt/vttablet/tabletserver/txthrottler ThrottlerInterface
//go:generate mockgen -destination mock_topology_watcher_test.go -package txthrottler vitess.io/vitess/go/vt/vttablet/tabletserver/txthrottler TopologyWatcherInterface
@@ -61,17 +61,15 @@ func TestEnabledThrottler(t *testing.T) {
ts := memorytopo.NewServer("cell1", "cell2")
mockHealthCheck := NewMockHealthCheck(mockCtrl)
- var hcListener discovery.LegacyHealthCheckStatsListener
- hcCall1 := mockHealthCheck.EXPECT().SetListener(gomock.Any(), false /* sendDownEvents */)
- hcCall1.Do(func(listener discovery.LegacyHealthCheckStatsListener, sendDownEvents bool) {
- // Record the listener we're given.
- hcListener = listener
- })
+ hcCall1 := mockHealthCheck.EXPECT().Subscribe()
+ hcCall1.Do(func() {})
hcCall2 := mockHealthCheck.EXPECT().Close()
hcCall2.After(hcCall1)
- healthCheckFactory = func() discovery.LegacyHealthCheck { return mockHealthCheck }
+ healthCheckFactory = func(topoServer *topo.Server, cell string, cellsToWatch []string) discovery.HealthCheck {
+ return mockHealthCheck
+ }
- topologyWatcherFactory = func(topoServer *topo.Server, tr discovery.LegacyTabletRecorder, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface {
+ topologyWatcherFactory = func(topoServer *topo.Server, hc discovery.HealthCheck, cell, keyspace, shard string, refreshInterval time.Duration, topoReadConcurrency int) TopologyWatcherInterface {
if ts != topoServer {
t.Errorf("want: %v, got: %v", ts, topoServer)
}
@@ -100,7 +98,7 @@ func TestEnabledThrottler(t *testing.T) {
call0 := mockThrottler.EXPECT().UpdateConfiguration(gomock.Any(), true /* copyZeroValues */)
call1 := mockThrottler.EXPECT().Throttle(0)
call1.Return(0 * time.Second)
- tabletStats := &discovery.LegacyTabletStats{
+ tabletStats := &discovery.TabletHealth{
Target: &querypb.Target{
TabletType: topodatapb.TabletType_REPLICA,
},
@@ -132,14 +130,14 @@ func TestEnabledThrottler(t *testing.T) {
if result := throttler.Throttle(); result != false {
t.Errorf("want: false, got: %v", result)
}
- hcListener.StatsUpdate(tabletStats)
- rdonlyTabletStats := &discovery.LegacyTabletStats{
+ throttler.state.StatsUpdate(tabletStats)
+ rdonlyTabletStats := &discovery.TabletHealth{
Target: &querypb.Target{
TabletType: topodatapb.TabletType_RDONLY,
},
}
// This call should not be forwarded to the go/vt/throttler.Throttler object.
- hcListener.StatsUpdate(rdonlyTabletStats)
+ throttler.state.StatsUpdate(rdonlyTabletStats)
// The second throttle call should reject.
if result := throttler.Throttle(); result != true {
t.Errorf("want: true, got: %v", result)