diff --git a/go/vt/tabletmanager/agent.go b/go/vt/tabletmanager/agent.go index 968307fd343..765f8202021 100644 --- a/go/vt/tabletmanager/agent.go +++ b/go/vt/tabletmanager/agent.go @@ -91,6 +91,9 @@ type ActionAgent struct { // the reason we're not healthy. _healthy error + // this is the last time health check ran + _healthyTime time.Time + // replication delay the last time we got it _replicationDelay time.Duration @@ -247,10 +250,21 @@ func (agent *ActionAgent) Tablet() *topo.TabletInfo { } // Healthy reads the result of the latest healthcheck, protected by mutex. +// If that status is too old, it means healthcheck hasn't run for a while, +// and is probably stuck, this is not good, we're not healthy. func (agent *ActionAgent) Healthy() (time.Duration, error) { agent.mutex.Lock() defer agent.mutex.Unlock() - return agent._replicationDelay, agent._healthy + + healthy := agent._healthy + if healthy == nil { + timeSinceLastCheck := time.Now().Sub(agent._healthyTime) + if timeSinceLastCheck > *healthCheckInterval*3 { + healthy = fmt.Errorf("last health check is too old: %s > %s", timeSinceLastCheck, *healthCheckInterval*3) + } + } + + return agent._replicationDelay, healthy } // BlacklistedTables reads the list of blacklisted tables from the TabletControl diff --git a/go/vt/tabletmanager/healthcheck.go b/go/vt/tabletmanager/healthcheck.go index 5d8d676b7d3..bb602fcd4b6 100644 --- a/go/vt/tabletmanager/healthcheck.go +++ b/go/vt/tabletmanager/healthcheck.go @@ -228,6 +228,7 @@ func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) { // remember our health status agent.mutex.Lock() agent._healthy = err + agent._healthyTime = time.Now() agent._replicationDelay = replicationDelay agent.mutex.Unlock() diff --git a/go/vt/tabletmanager/healthcheck_test.go b/go/vt/tabletmanager/healthcheck_test.go index c38b67fb175..3bb1c33dd84 100644 --- a/go/vt/tabletmanager/healthcheck_test.go +++ b/go/vt/tabletmanager/healthcheck_test.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "html/template" + "strings" "testing" "time" @@ -153,6 +154,7 @@ func TestHealthCheckControlsQueryService(t *testing.T) { // first health check, should change us to replica, and update the // mysql port to 3306 + before := time.Now() agent.runHealthCheck(targetTabletType) ti, err := agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -167,9 +169,13 @@ func TestHealthCheckControlsQueryService(t *testing.T) { if !agent.QueryServiceControl.IsServing() { t.Errorf("Query service should be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } // now make the tablet unhealthy agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy") + before = time.Now() agent.runHealthCheck(targetTabletType) ti, err = agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -181,6 +187,9 @@ func TestHealthCheckControlsQueryService(t *testing.T) { if agent.QueryServiceControl.IsServing() { t.Errorf("Query service should not be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } } // TestQueryServiceNotStarting verifies that if a tablet cannot start the @@ -190,6 +199,7 @@ func TestQueryServiceNotStarting(t *testing.T) { targetTabletType := topo.TYPE_REPLICA agent.QueryServiceControl.(*tabletserver.TestQueryServiceControl).AllowQueriesError = fmt.Errorf("test cannot start query service") + before := time.Now() agent.runHealthCheck(targetTabletType) ti, err := agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -201,6 +211,9 @@ func TestQueryServiceNotStarting(t *testing.T) { if agent.QueryServiceControl.IsServing() { t.Errorf("Query service should not be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } } // TestQueryServiceStopped verifies that if a healthy tablet's query @@ -210,6 +223,7 @@ func TestQueryServiceStopped(t *testing.T) { targetTabletType := topo.TYPE_REPLICA // first health check, should change us to replica + before := time.Now() agent.runHealthCheck(targetTabletType) ti, err := agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -221,12 +235,16 @@ func TestQueryServiceStopped(t *testing.T) { if !agent.QueryServiceControl.IsServing() { t.Errorf("Query service should be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } // shut down query service and prevent it from starting again agent.QueryServiceControl.DisallowQueries() agent.QueryServiceControl.(*tabletserver.TestQueryServiceControl).AllowQueriesError = fmt.Errorf("test cannot start query service") // health check should now fail + before = time.Now() agent.runHealthCheck(targetTabletType) ti, err = agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -238,6 +256,9 @@ func TestQueryServiceStopped(t *testing.T) { if agent.QueryServiceControl.IsServing() { t.Errorf("Query service should not be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } } // TestTabletControl verifies the shard's TabletControl record can disable @@ -247,6 +268,7 @@ func TestTabletControl(t *testing.T) { targetTabletType := topo.TYPE_REPLICA // first health check, should change us to replica + before := time.Now() agent.runHealthCheck(targetTabletType) ti, err := agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -258,6 +280,9 @@ func TestTabletControl(t *testing.T) { if !agent.QueryServiceControl.IsServing() { t.Errorf("Query service should be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } // now update the shard si, err := agent.TopoServer.GetShard(keyspace, shard) @@ -286,6 +311,7 @@ func TestTabletControl(t *testing.T) { } // check running a health check will not start it again + before = time.Now() agent.runHealthCheck(targetTabletType) ti, err = agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -297,9 +323,13 @@ func TestTabletControl(t *testing.T) { if agent.QueryServiceControl.IsServing() { t.Errorf("Query service should not be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } // go unhealthy, check we go to spare and QS is not running agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy") + before = time.Now() agent.runHealthCheck(targetTabletType) ti, err = agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -311,9 +341,13 @@ func TestTabletControl(t *testing.T) { if agent.QueryServiceControl.IsServing() { t.Errorf("Query service should not be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } // go back healthy, check QS is still not running agent.HealthReporter.(*fakeHealthCheck).reportError = nil + before = time.Now() agent.runHealthCheck(targetTabletType) ti, err = agent.TopoServer.GetTablet(tabletAlias) if err != nil { @@ -325,4 +359,33 @@ func TestTabletControl(t *testing.T) { if agent.QueryServiceControl.IsServing() { t.Errorf("Query service should not be running") } + if agent._healthyTime.Sub(before) < 0 { + t.Errorf("runHealthCheck did not update agent._healthyTime") + } +} + +// TestOldHealthCheck verifies that a healthcheck that is too old will +// return an error +func TestOldHealthCheck(t *testing.T) { + agent := createTestAgent(t) + *healthCheckInterval = 20 * time.Second + agent._healthy = nil + + // last health check time is now, we're good + agent._healthyTime = time.Now() + if _, healthy := agent.Healthy(); healthy != nil { + t.Errorf("Healthy returned unexpected error: %v", healthy) + } + + // last health check time is 2x interval ago, we're good + agent._healthyTime = time.Now().Add(-2 * *healthCheckInterval) + if _, healthy := agent.Healthy(); healthy != nil { + t.Errorf("Healthy returned unexpected error: %v", healthy) + } + + // last health check time is 4x interval ago, we're not good + agent._healthyTime = time.Now().Add(-4 * *healthCheckInterval) + if _, healthy := agent.Healthy(); healthy == nil || !strings.Contains(healthy.Error(), "last health check is too old") { + t.Errorf("Healthy returned wrong error: %v", healthy) + } }