Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion go/vt/tabletmanager/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ type ActionAgent struct {
// the reason we're not healthy.
_healthy error

// this is the last time health check ran
_healthyTime time.Time

// replication delay the last time we got it
_replicationDelay time.Duration

Expand Down Expand Up @@ -247,10 +250,21 @@ func (agent *ActionAgent) Tablet() *topo.TabletInfo {
}

// Healthy reads the result of the latest healthcheck, protected by mutex.
// If that status is too old, it means healthcheck hasn't run for a while,
// and is probably stuck, this is not good, we're not healthy.
func (agent *ActionAgent) Healthy() (time.Duration, error) {
agent.mutex.Lock()
defer agent.mutex.Unlock()
return agent._replicationDelay, agent._healthy

healthy := agent._healthy
if healthy == nil {
timeSinceLastCheck := time.Now().Sub(agent._healthyTime)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we use UTC time instead ? I have some past experience that a local time may be messed up by daylight saving. In this case, it may cause "last health check is too old".

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After checking the code, the internal storage of time is always in UTC, the display only compensates for time zone. so this is already the case.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a fan of time.Since(t) which is a shortcut for time.Now().Sub(t). It reads really nicely.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, fixed that in a different PR.

if timeSinceLastCheck > *healthCheckInterval*3 {
healthy = fmt.Errorf("last health check is too old: %s > %s", timeSinceLastCheck, *healthCheckInterval*3)
}
}

return agent._replicationDelay, healthy
}

// BlacklistedTables reads the list of blacklisted tables from the TabletControl
Expand Down
1 change: 1 addition & 0 deletions go/vt/tabletmanager/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ func (agent *ActionAgent) runHealthCheck(targetTabletType topo.TabletType) {
// remember our health status
agent.mutex.Lock()
agent._healthy = err
agent._healthyTime = time.Now()
agent._replicationDelay = replicationDelay
agent.mutex.Unlock()

Expand Down
63 changes: 63 additions & 0 deletions go/vt/tabletmanager/healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"html/template"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -153,6 +154,7 @@ func TestHealthCheckControlsQueryService(t *testing.T) {

// first health check, should change us to replica, and update the
// mysql port to 3306
before := time.Now()
agent.runHealthCheck(targetTabletType)
ti, err := agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -167,9 +169,13 @@ func TestHealthCheckControlsQueryService(t *testing.T) {
if !agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}

// now make the tablet unhealthy
agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy")
before = time.Now()
agent.runHealthCheck(targetTabletType)
ti, err = agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -181,6 +187,9 @@ func TestHealthCheckControlsQueryService(t *testing.T) {
if agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should not be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}
}

// TestQueryServiceNotStarting verifies that if a tablet cannot start the
Expand All @@ -190,6 +199,7 @@ func TestQueryServiceNotStarting(t *testing.T) {
targetTabletType := topo.TYPE_REPLICA
agent.QueryServiceControl.(*tabletserver.TestQueryServiceControl).AllowQueriesError = fmt.Errorf("test cannot start query service")

before := time.Now()
agent.runHealthCheck(targetTabletType)
ti, err := agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -201,6 +211,9 @@ func TestQueryServiceNotStarting(t *testing.T) {
if agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should not be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}
}

// TestQueryServiceStopped verifies that if a healthy tablet's query
Expand All @@ -210,6 +223,7 @@ func TestQueryServiceStopped(t *testing.T) {
targetTabletType := topo.TYPE_REPLICA

// first health check, should change us to replica
before := time.Now()
agent.runHealthCheck(targetTabletType)
ti, err := agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -221,12 +235,16 @@ func TestQueryServiceStopped(t *testing.T) {
if !agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}

// shut down query service and prevent it from starting again
agent.QueryServiceControl.DisallowQueries()
agent.QueryServiceControl.(*tabletserver.TestQueryServiceControl).AllowQueriesError = fmt.Errorf("test cannot start query service")

// health check should now fail
before = time.Now()
agent.runHealthCheck(targetTabletType)
ti, err = agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -238,6 +256,9 @@ func TestQueryServiceStopped(t *testing.T) {
if agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should not be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}
}

// TestTabletControl verifies the shard's TabletControl record can disable
Expand All @@ -247,6 +268,7 @@ func TestTabletControl(t *testing.T) {
targetTabletType := topo.TYPE_REPLICA

// first health check, should change us to replica
before := time.Now()
agent.runHealthCheck(targetTabletType)
ti, err := agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -258,6 +280,9 @@ func TestTabletControl(t *testing.T) {
if !agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}

// now update the shard
si, err := agent.TopoServer.GetShard(keyspace, shard)
Expand Down Expand Up @@ -286,6 +311,7 @@ func TestTabletControl(t *testing.T) {
}

// check running a health check will not start it again
before = time.Now()
agent.runHealthCheck(targetTabletType)
ti, err = agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -297,9 +323,13 @@ func TestTabletControl(t *testing.T) {
if agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should not be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}

// go unhealthy, check we go to spare and QS is not running
agent.HealthReporter.(*fakeHealthCheck).reportError = fmt.Errorf("tablet is unhealthy")
before = time.Now()
agent.runHealthCheck(targetTabletType)
ti, err = agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -311,9 +341,13 @@ func TestTabletControl(t *testing.T) {
if agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should not be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}

// go back healthy, check QS is still not running
agent.HealthReporter.(*fakeHealthCheck).reportError = nil
before = time.Now()
agent.runHealthCheck(targetTabletType)
ti, err = agent.TopoServer.GetTablet(tabletAlias)
if err != nil {
Expand All @@ -325,4 +359,33 @@ func TestTabletControl(t *testing.T) {
if agent.QueryServiceControl.IsServing() {
t.Errorf("Query service should not be running")
}
if agent._healthyTime.Sub(before) < 0 {
t.Errorf("runHealthCheck did not update agent._healthyTime")
}
}

// TestOldHealthCheck verifies that a healthcheck that is too old will
// return an error
func TestOldHealthCheck(t *testing.T) {
agent := createTestAgent(t)
*healthCheckInterval = 20 * time.Second
agent._healthy = nil

// last health check time is now, we're good
agent._healthyTime = time.Now()
if _, healthy := agent.Healthy(); healthy != nil {
t.Errorf("Healthy returned unexpected error: %v", healthy)
}

// last health check time is 2x interval ago, we're good
agent._healthyTime = time.Now().Add(-2 * *healthCheckInterval)
if _, healthy := agent.Healthy(); healthy != nil {
t.Errorf("Healthy returned unexpected error: %v", healthy)
}

// last health check time is 4x interval ago, we're not good
agent._healthyTime = time.Now().Add(-4 * *healthCheckInterval)
if _, healthy := agent.Healthy(); healthy == nil || !strings.Contains(healthy.Error(), "last health check is too old") {
t.Errorf("Healthy returned wrong error: %v", healthy)
}
}