diff --git a/go.mod b/go.mod
index df82903f..5e0bdee8 100644
--- a/go.mod
+++ b/go.mod
@@ -33,7 +33,7 @@ require (
     go.opencensus.io v0.0.0-20181129005706-8b019f31bc1c // indirect
     golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2
     golang.org/x/oauth2 v0.0.0-20181128211412-28207608b838
-    golang.org/x/sync v0.0.0-20181108010431-42b317875d0f // indirect
+    golang.org/x/sync v0.0.0-20181108010431-42b317875d0f
     google.golang.org/api v0.0.0-20181129220737-af4fc4062c26 // indirect
     google.golang.org/appengine v1.3.0 // indirect
     google.golang.org/genproto v0.0.0-20181202183823-bd91e49a0898 // indirect
diff --git a/infra/gravity/cluster_install.go b/infra/gravity/cluster_install.go
index c44cc85c..a295ffc1 100644
--- a/infra/gravity/cluster_install.go
+++ b/infra/gravity/cluster_install.go
@@ -205,7 +205,7 @@ func (c *TestContext) uploadInstaller(master Gravity, nodes []Gravity, installer
         return trace.Wrap(err)
     }
 
-    err = c.Status(nodes)
+    err = c.WaitForActiveStatus(nodes)
     if err != nil {
         return trace.Wrap(err)
     }
@@ -232,7 +232,7 @@ func (c *TestContext) upgrade(master Gravity, numNodes int) error {
 
 // ExecScript will run and execute a script on all nodes
 func (c *TestContext) ExecScript(nodes []Gravity, scriptUrl string, args []string) error {
-    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.Status)
+    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.ExecScript)
     defer cancel()
 
     errs := make(chan error, len(nodes))
diff --git a/infra/gravity/cluster_resize.go b/infra/gravity/cluster_resize.go
index 25eed157..920a8d8b 100644
--- a/infra/gravity/cluster_resize.go
+++ b/infra/gravity/cluster_resize.go
@@ -18,7 +18,7 @@ func (c *TestContext) Expand(currentCluster, nodesToJoin []Gravity, p InstallPar
     // status is solely used for gathering the join token, can this be replaced
     // with InstallParam.Token -- 2020-05 walt
     peer := currentCluster[0]
-    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.Status)
+    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.NodeStatus)
     defer cancel()
     status, err := peer.Status(ctx)
     if err != nil {
@@ -41,7 +41,7 @@ func (c *TestContext) Expand(currentCluster, nodesToJoin []Gravity, p InstallPar
 
 // JoinNode has one node join a peer already in a cluster
 func (c *TestContext) JoinNode(peer, nodeToJoin Gravity, p InstallParam) error {
-    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.Status)
+    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.NodeStatus)
     defer cancel()
     status, err := peer.Status(ctx)
     if err != nil {
diff --git a/infra/gravity/cluster_status.go b/infra/gravity/cluster_status.go
index e761a4dc..cd3518d8 100644
--- a/infra/gravity/cluster_status.go
+++ b/infra/gravity/cluster_status.go
@@ -2,45 +2,105 @@ package gravity
 
 import (
     "context"
-    "time"
 
+    "github.com/cenkalti/backoff"
+    "golang.org/x/sync/errgroup"
+
+    "github.com/gravitational/robotest/lib/constants"
     sshutils "github.com/gravitational/robotest/lib/ssh"
     "github.com/gravitational/robotest/lib/utils"
     "github.com/gravitational/robotest/lib/wait"
-
     "github.com/gravitational/trace"
 )
 
-// Status walks around all nodes and checks whether they all feel OK
-func (c *TestContext) Status(nodes []Gravity) error {
-    c.Logger().WithField("nodes", Nodes(nodes)).Info("Check status on nodes.")
-    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.Status)
-    defer cancel()
+// statusValidator returns nil if the Gravity status matches the expected status, or an error otherwise.
+type statusValidator func(s GravityStatus) error
 
-    retry := wait.Retryer{
-        Attempts: 100,
-        Delay:    time.Second * 20,
+// checkNotDegraded returns an error if the cluster status is Degraded.
+//
+// This function is a reimplementation of the logic in https://github.com/gravitational/gravity/blob/7.0.0/lib/status/status.go#L180-L185
+func checkNotDegraded(s GravityStatus) error {
+    if s.Cluster.State == constants.ClusterStateDegraded {
+        return trace.CompareFailed("cluster state %q", s.Cluster.State)
     }
+    if s.Cluster.SystemStatus != constants.SystemStatus_Running {
+        return trace.CompareFailed("expected system_status %v, found %v", constants.SystemStatus_Running, s.Cluster.SystemStatus)
+    }
+    return nil
+}
 
-    err := retry.Do(ctx, func() error {
-        errs := make(chan error, len(nodes))
+// checkActive returns an error if the cluster is degraded or state != active.
+func checkActive(s GravityStatus) error {
+    if err := checkNotDegraded(s); err != nil {
+        return trace.Wrap(err)
+    }
+    if s.Cluster.State != constants.ClusterStateActive {
+        return trace.CompareFailed("expected state %q, found %q", constants.ClusterStateActive, s.Cluster.State)
+    }
+    return nil
+}
 
-        for _, node := range nodes {
-            go func(n Gravity) {
-                _, err := n.Status(ctx)
-                errs <- err
-            }(node)
-        }
+// WaitForActiveStatus blocks until all nodes report an active, not-degraded state, or an internal timeout expires.
+func (c *TestContext) WaitForActiveStatus(nodes []Gravity) error {
+    c.Logger().WithField("nodes", Nodes(nodes)).Info("Waiting for active status.")
+    return c.WaitForStatus(nodes, checkActive)
+}
 
-        err := utils.CollectErrors(ctx, errs)
-        if err == nil {
-            return nil
+// WaitForStatus blocks until all nodes satisfy the expected statusValidator or an internal timeout expires.
+func (c *TestContext) WaitForStatus(nodes []Gravity, expected statusValidator) error {
+    b := backoff.NewExponentialBackOff()
+    b.MaxElapsedTime = c.timeouts.ClusterStatus
+
+    expectStatus := func() (err error) {
+        statuses, err := c.Status(nodes)
+        if err != nil {
+            return trace.Wrap(err)
+        }
+        for _, status := range statuses {
+            err = expected(status)
+            if err != nil {
+                c.Logger().WithError(err).WithField("status", status).Warn("Unexpected Status.")
+                return trace.Wrap(err)
+            }
         }
-        c.Logger().Warnf("Status not available on some nodes, will retry: %v.", err)
-        return wait.Continue("status not ready on some nodes")
-    })
+        return nil
+    }
+
+    err := wait.RetryWithInterval(c.ctx, b, expectStatus, c.Logger())
     return trace.Wrap(err)
+}
+
+// Status queries `gravity status` once from each node in nodes.
+func (c *TestContext) Status(nodes []Gravity) (statuses []GravityStatus, err error) {
+    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.NodeStatus)
+    defer cancel()
+
+    valueC := make(chan GravityStatus, len(nodes))
+    g, ctx := errgroup.WithContext(ctx)
+    for _, node := range nodes {
+        node := node
+        g.Go(func() error {
+            status, err := node.Status(ctx)
+            if err != nil {
+                return trace.Wrap(err)
+            }
+            if status != nil {
+                valueC <- *status
+            }
+            return nil
+        })
+    }
+    err = g.Wait()
+    if err != nil {
+        return nil, trace.Wrap(err)
+    }
+    close(valueC)
+    for status := range valueC {
+        statuses = append(statuses, status)
+    }
+    return statuses, nil
 }
 
 // CheckTime walks around all nodes and checks whether their time is within acceptable limits
@@ -53,9 +113,8 @@ func (c *TestContext) CheckTimeSync(nodes []Gravity) error {
         })
     }
 
-    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.Status)
+    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.TimeSync)
     defer cancel()
-
     err := sshutils.CheckTimeSync(ctx, timeNodes)
     return trace.Wrap(err)
 }
@@ -122,7 +181,7 @@ type ClusterNodesByRole struct {
 
 // NodesByRole will conveniently organize nodes according to their roles in cluster
 func (c *TestContext) NodesByRole(nodes []Gravity) (roles *ClusterNodesByRole, err error) {
-    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.Status)
+    ctx, cancel := context.WithTimeout(c.ctx, c.timeouts.ResolveInPlanet)
     defer cancel()
 
     roles = &ClusterNodesByRole{}
@@ -131,6 +190,8 @@ func (c *TestContext) NodesByRole(nodes []Gravity) (roles *ClusterNodesByRole, e
         return nil, trace.Wrap(err)
     }
 
+    ctx, cancel = context.WithTimeout(c.ctx, c.timeouts.GetPods)
+    defer cancel()
     // Run query on the apiserver
     pods, err := KubectlGetPods(ctx, roles.ApiMaster, kubeSystemNS, appGravityLabel)
     if err != nil {
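Note on the new API: WaitForStatus accepts any func(GravityStatus) error as its validator, so suites are not limited to checkActive. A minimal sketch of a custom validator (hypothetical, not part of this patch; the "expanding" state string matches the one exercised by Test1523StatusValidation below):

    // checkExpanding would let a suite wait for an in-progress expand
    // rather than an active cluster.
    func checkExpanding(s GravityStatus) error {
        if s.Cluster.State != "expanding" {
            return trace.CompareFailed("expected state %q, found %q", "expanding", s.Cluster.State)
        }
        return nil
    }

    // usage inside a test: err := c.WaitForStatus(nodes, checkExpanding)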
diff --git a/infra/gravity/defaults.go b/infra/gravity/defaults.go
index 2eb78477..c169d50c 100644
--- a/infra/gravity/defaults.go
+++ b/infra/gravity/defaults.go
@@ -30,9 +30,14 @@ var DefaultTimeouts = OpTimeouts{
     Upgrade:          time.Minute * 30, // upgrade threshold per node
     Uninstall:        time.Minute * 5,  // uninstall threshold per node
     UninstallApp:     time.Minute * 5,  // application uninstall threshold
-    Status:           time.Minute * 30, // sufficient for failover procedures
+    NodeStatus:       time.Minute * 1,  // limit for status to return on a single node
+    ClusterStatus:    time.Minute * 5,  // limit for status to quiesce across the cluster
     Leave:            time.Minute * 15, // threshold to leave cluster
     CollectLogs:      time.Minute * 7,  // to collect logs from node
     WaitForInstaller: time.Minute * 30, // wait for build to complete in parallel
     AutoScaling:      time.Minute * 10, // wait for autoscaling operation
+    TimeSync:         time.Minute * 5,  // wait for ntp to converge
+    ResolveInPlanet:  time.Minute * 1,  // resolve a hostname inside planet with dig
+    GetPods:          time.Minute * 1,  // use kubectl to query pods on the API master
+    ExecScript:       time.Minute * 5,  // user-provided script; ideally this limit would be user-configurable
 }
diff --git a/infra/gravity/gravity_test.go b/infra/gravity/gravity_test.go
index 8c6cafba..c1a9d329 100644
--- a/infra/gravity/gravity_test.go
+++ b/infra/gravity/gravity_test.go
@@ -3,22 +3,23 @@ package gravity
 import (
     "bufio"
     "bytes"
+    "os"
     "testing"
 
     "github.com/stretchr/testify/assert"
 )
 
-var testStatusStr = []byte(`
+func TestGravityOutput(t *testing.T) {
+    var testStatusStr = []byte(`
{"cluster":{"application":{"repository":"gravitational.io","name":"telekube","version":"0.0.1"},"state":"active","domain":"testcluster","token":{"token":"fac3b88014367fe4e98a8664755e2be4","expires":"0001-01-01T00:00:00Z","type":"expand","account_id":"00000000-0000-0000-0000-000000000001","site_domain":"testcluster","operation_id":"","user_email":"agent@testcluster"},"operation":{"type":"operation_install","id":"55298dfd-2094-47a3-a787-8b2a546c0fd1","state":"completed","created":"2008-01-01T12:00:00.0Z","progress":{"message":"Operation has completed","completion":100,"created":"2008-01-01T12:05:00.0Z"}},"system_status":1,"nodes":[{"hostname":"node-0","advertise_ip":"10.40.2.4","role":"master","profile":"node","status":"healthy"},{"hostname":"node-2","advertise_ip":"10.40.2.5","role":"master","profile":"node","status":"healthy"},{"hostname":"node-1","advertise_ip":"10.40.2.7","role":"master","profile":"node","status":"healthy"},{"hostname":"node-5","advertise_ip":"10.40.2.6","role":"node","profile":"node","status":"healthy"},{"hostname":"node-3","advertise_ip":"10.40.2.3","role":"node","profile":"node","status":"healthy"},{"hostname":"node-4","advertise_ip":"10.40.2.2","role":"node","profile":"node","status":"healthy"}]}} `) - -func TestGravityOutput(t *testing.T) { expectedStatus := &GravityStatus{ Cluster: ClusterStatus{ - Cluster: "testcluster", - Application: Application{Name: "telekube"}, - Status: "active", - Token: Token{Token: "fac3b88014367fe4e98a8664755e2be4"}, + Cluster: "testcluster", + Application: Application{Name: "telekube"}, + State: "active", + SystemStatus: 1, + Token: Token{Token: "fac3b88014367fe4e98a8664755e2be4"}, Nodes: []NodeStatus{ NodeStatus{Addr: "10.40.2.4"}, NodeStatus{Addr: "10.40.2.5"}, @@ -35,3 +36,60 @@ func TestGravityOutput(t *testing.T) { assert.NoError(t, err) assert.Equal(t, expectedStatus, &status, "parseStatus") } + +func TestHealthyStatusValidation(t *testing.T) { + healthyStatus := GravityStatus{ + Cluster: ClusterStatus{ + Cluster: "robotest", + Application: Application{Name: "telekube"}, + State: "active", + SystemStatus: 1, + Token: Token{Token: "ROBOTEST"}, + Nodes: []NodeStatus{ + NodeStatus{Addr: "10.1.2.3"}, + NodeStatus{Addr: "10.1.2.4"}, + NodeStatus{Addr: "10.1.2.5"}, + }, + }, + } + err := checkActive(healthyStatus) + assert.NoError(t, err) +} + +// Test1523StatusValidation ensures expanding status is +// identified as "unsafe to proceed" by Robotest. +// +// Expands may be unexpectedly seen after install as discussed +// in https://github.com/gravitational/gravity/issues/1523. +func Test1523StatusValidation(t *testing.T) { + nonActiveStatus := GravityStatus{ + Cluster: ClusterStatus{ + Cluster: "robotest", + Application: Application{Name: "telekube"}, + State: "expanding", + Token: Token{Token: "ROBOTEST"}, + Nodes: []NodeStatus{ + NodeStatus{Addr: "10.1.2.3"}, + }, + }, + } + err := checkActive(nonActiveStatus) + assert.Error(t, err) +} + +// Test1641StatusValidation ensures a particular status type seen +// in the field identified as degraded by Robotest. +// +// See https://github.com/gravitational/gravity/issues/1641 for more info. 
+func Test1641StatusValidation(t *testing.T) {
+    f, err := os.Open("testdata/status-degraded-1641.json")
+    assert.NoError(t, err)
+    defer f.Close()
+
+    var status GravityStatus
+    err = parseStatus(&status)(bufio.NewReader(f))
+    assert.NoError(t, err)
+
+    err = checkNotDegraded(status)
+    assert.Error(t, err)
+}
diff --git a/infra/gravity/node_commands.go b/infra/gravity/node_commands.go
index 4833f8f1..591bc871 100644
--- a/infra/gravity/node_commands.go
+++ b/infra/gravity/node_commands.go
@@ -125,11 +125,6 @@ type JoinCmd struct {
     StateDir string
 }
 
-// IsDegraded determines whether the cluster is in degraded state
-func (r GravityStatus) IsDegraded() bool {
-    return r.Cluster.Status == "degraded"
-}
-
 // GravityStatus describes the status of the Gravity cluster
 type GravityStatus struct {
     // Cluster describes the cluster status
@@ -142,8 +137,10 @@ type ClusterStatus struct {
     Application Application `json:"application"`
     // Cluster is the name of the cluster
     Cluster string `json:"domain"`
-    // Status is the cluster status
-    Status string `json:"state"`
+    // State is the cluster state
+    State string `json:"state"`
+    // SystemStatus is the cluster status, see https://github.com/gravitational/satellite/blob/7.1.0/agent/proto/agentpb/agent.proto#L50-L54
+    SystemStatus int `json:"system_status"`
     // Token is secure token which prevents rogue nodes from joining the cluster during installation
     Token Token `json:"token"`
     // Nodes describes the nodes in the cluster
@@ -284,26 +281,7 @@ var installCmdTemplate = template.Must(
 `))
 
 // Status queries cluster status
-func (g *gravity) Status(ctx context.Context) (status *GravityStatus, err error) {
-    b := backoff.NewExponentialBackOff()
-    b.MaxElapsedTime = defaults.ClusterStatusTimeout
-    err = wait.RetryWithInterval(ctx, b, func() (err error) {
-        status, err = g.status(ctx)
-        if err != nil {
-            return trace.Wrap(err)
-        }
-        if status.IsDegraded() {
-            return trace.BadParameter("degraded")
-        }
-        return nil
-    }, g.log)
-    if err != nil {
-        return nil, trace.Wrap(err)
-    }
-    return status, nil
-}
-
-func (g *gravity) status(ctx context.Context) (*GravityStatus, error) {
+func (g *gravity) Status(ctx context.Context) (*GravityStatus, error) {
     cmd := fmt.Sprintf("sudo gravity status --output=json --system-log-file=%v", defaults.AgentLogPath)
     status := GravityStatus{}
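The json tags above are the crux of the fix: gravity can report state "active" while system_status is 2 (degraded), which is exactly the gravity#1641 shape captured in the testdata below. A self-contained sketch of the decoding these tags drive, using an abbreviated payload (standard library only; types renamed to keep the example standalone):

    package main

    import (
        "encoding/json"
        "fmt"
    )

    type clusterStatus struct {
        State        string `json:"state"`
        SystemStatus int    `json:"system_status"`
    }

    type gravityStatus struct {
        Cluster clusterStatus `json:"cluster"`
    }

    func main() {
        // Abbreviated from testdata/status-degraded-1641.json: state reads
        // "active" even though system_status reports degraded (2).
        raw := []byte(`{"cluster": {"state": "active", "system_status": 2}}`)
        var s gravityStatus
        if err := json.Unmarshal(raw, &s); err != nil {
            panic(err)
        }
        fmt.Println(s.Cluster.State, s.Cluster.SystemStatus) // active 2
    }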
diff --git a/infra/gravity/testcontext.go b/infra/gravity/testcontext.go
index de06032d..03ff7076 100644
--- a/infra/gravity/testcontext.go
+++ b/infra/gravity/testcontext.go
@@ -20,17 +20,22 @@ const (
 
 // OpTimeouts defines per-node, per-operation timeouts which would be used to determine
 // whether test must be failed
-// provisioner has its own timeout / restart logic which is dependant on cloud provider and terraform
+// provisioner has its own timeout / restart logic which is dependent on cloud provider and terraform
 type OpTimeouts struct {
     Install          time.Duration
     Upgrade          time.Duration
-    Status           time.Duration
+    NodeStatus       time.Duration
+    ClusterStatus    time.Duration
     Uninstall        time.Duration
     UninstallApp     time.Duration
     Leave            time.Duration
     CollectLogs      time.Duration
     WaitForInstaller time.Duration
     AutoScaling      time.Duration
+    TimeSync         time.Duration
+    ResolveInPlanet  time.Duration
+    GetPods          time.Duration
+    ExecScript       time.Duration
 }
 
 // TestContext aggregates common parameters for better test suite readability
diff --git a/infra/gravity/testdata/status-degraded-1641.json b/infra/gravity/testdata/status-degraded-1641.json
new file mode 100644
index 00000000..feb516b8
--- /dev/null
+++ b/infra/gravity/testdata/status-degraded-1641.json
@@ -0,0 +1,129 @@
+{
+  "cluster": {
+    "application": {
+      "repository": "gravitational.io",
+      "name": "telekube",
+      "version": "6.1.27"
+    },
+    "state": "active",
+    "domain": "robotest-unit-test",
+    "token": {
+      "token": "ROBOTEST",
+      "expires": "0001-01-01T00:00:00Z",
+      "type": "expand",
+      "account_id": "00000000-0000-0000-0000-000000000001",
+      "site_domain": "robotest-unit-test",
+      "operation_id": "42178dd3-291c-4dd7-a870-87a1ade5d93a",
+      "user_email": "agent@robotest-unit-test"
+    },
+    "operation": {
+      "type": "operation_install",
+      "id": "42178dd3-291c-4dd7-a870-87a1ade5d93a",
+      "state": "completed",
+      "created": "2020-05-27T21:04:41.962836059Z",
+      "description": "3-node install",
+      "progress": {
+        "message": "Operation has completed",
+        "completion": 100,
+        "created": "2020-05-27T21:05:20.934000242Z"
+      }
+    },
+    "endpoints": {
+      "applications": {
+        "Endpoints": [
+          {
+            "application": {
+              "repository": "gravitational.io",
+              "name": "telekube",
+              "version": "6.1.27"
+            },
+            "endpoints": [
+              {
+                "name": "Gravity Control Panel",
+                "description": "Local administrative user interface of this Gravity cluster\n",
+                "addresses": [
+                  "https://10.138.0.23:32009",
+                  "https://10.138.0.53:32009",
+                  "https://10.138.0.56:32009"
+                ]
+              }
+            ]
+          }
+        ]
+      },
+      "cluster": {
+        "auth_gateway": [
+          "10.138.0.56:32009",
+          "10.138.0.23:32009",
+          "10.138.0.53:32009"
+        ],
+        "ui": [
+          "https://10.138.0.56:32009",
+          "https://10.138.0.23:32009",
+          "https://10.138.0.53:32009"
+        ]
+      }
+    },
+    "Extension": {},
+    "server_version": {
+      "edition": "open-source",
+      "version": "6.1.27",
+      "gitCommit": "1787b91072d649e6fcb761c803a8690d432e07d6",
+      "helm": "v2.14"
+    },
+    "client_version": {
+      "edition": "open-source",
+      "version": "6.1.27",
+      "gitCommit": "1787b91072d649e6fcb761c803a8690d432e07d6",
+      "helm": "v2.14"
+    },
+    "system_status": 2,
+    "nodes": [
+      {
+        "hostname": "robotest-unit-test-node-1",
+        "advertise_ip": "10.138.0.56",
+        "role": "master",
+        "profile": "node",
+        "status": "degraded",
+        "failed_probes": [
+          "etcd-healthz (unexpected HTTP status: Service Unavailable)"
+        ],
+        "teleport_node": {
+          "hostname": "robotest-unit-test-node-1",
+          "advertise_ip": "10.138.0.56",
+          "public_ip": "",
+          "profile": "node",
+          "instance_type": ""
+        }
+      },
+      {
+        "hostname": "robotest-unit-test-node-0",
+        "advertise_ip": "10.138.0.23",
+        "role": "master",
+        "profile": "node",
+        "status": "healthy",
+        "teleport_node": {
+          "hostname": "robotest-unit-test-node-0",
+          "advertise_ip": "10.138.0.23",
+          "public_ip": "",
+          "profile": "node",
+          "instance_type": ""
+        }
+      },
+      {
+        "hostname": "robotest-unit-test-node-2",
+        "advertise_ip": "10.138.0.53",
+        "role": "master",
+        "profile": "node",
+        "status": "healthy",
+        "teleport_node": {
+          "hostname": "robotest-unit-test-node-2",
+          "advertise_ip": "10.138.0.53",
+          "public_ip": "",
+          "profile": "node",
+          "instance_type": ""
+        }
+      }
+    ]
+  }
+}
+ ClusterStateActive = "active" + // ClusterStateDegraded is unhealthy. + ClusterStateDegraded = "degraded" + + // SystemStatus_* consts come from https://github.com/gravitational/satellite/blob/7.1.0/agent/proto/agentpb/agent.pb.go#L28-L32 + + SystemStatus_Running = 1 + SystemStatus_Degraded = 2 +) diff --git a/lib/defaults/defaults.go b/lib/defaults/defaults.go index 5b780557..3ce992ae 100644 --- a/lib/defaults/defaults.go +++ b/lib/defaults/defaults.go @@ -40,9 +40,6 @@ const ( // TerraformRetryDelay TerraformRetryDelay = 5 * time.Minute - // ClusterStatusTimeout specifies the maximum amount of time to wait for cluster status - ClusterStatusTimeout = 5 * time.Minute - // TerraformRetries is the maximum number of attempts to reprovision the // infrastructure upon encountering an error from 'terraform apply' TerraformRetries = 2 diff --git a/suite/sanity/autoscale.go b/suite/sanity/autoscale.go index b7a4a690..d08eaaf1 100644 --- a/suite/sanity/autoscale.go +++ b/suite/sanity/autoscale.go @@ -15,19 +15,20 @@ func autoscale(p interface{}) (gravity.TestFunc, error) { g.Maybe("destroy", cluster.Destroy()) }() - g.OK("status", g.Status(cluster.Nodes)) + g.OK("wait for active status on masters", g.WaitForActiveStatus(cluster.Nodes)) g.OK("time sync", g.CheckTimeSync(cluster.Nodes)) // Scale Up workers, err := g.AutoScale(3) g.OK("asg-up", err) - g.OK("status-masters", g.Status(cluster.Nodes)) - g.OK("status-workers", g.Status(workers)) + g.OK("wait for active status on masters", g.WaitForActiveStatus(cluster.Nodes)) + g.OK("wait for active status on asg workers", g.WaitForActiveStatus(workers)) // Scale Down workers, err = g.AutoScale(1) g.OK("asg-down", err) - g.OK("status-masters", g.Status(cluster.Nodes)) - g.OK("status-workers", g.Status(workers)) + g.OK("wait for active status on masters", g.WaitForActiveStatus(cluster.Nodes)) + _, err = g.Status(workers) + g.OK("status on asg workers", err) }, nil } diff --git a/suite/sanity/install.go b/suite/sanity/install.go index f7757e76..643adb6b 100644 --- a/suite/sanity/install.go +++ b/suite/sanity/install.go @@ -62,7 +62,7 @@ func install(p interface{}) (gravity.TestFunc, error) { g.ExecScript(cluster.Nodes, param.Script.Url, param.Script.Args)) } g.OK("application installed", g.OfflineInstall(cluster.Nodes, param.InstallParam)) - g.OK("status", g.Status(cluster.Nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(cluster.Nodes)) }, nil } diff --git a/suite/sanity/loss_recover.go b/suite/sanity/loss_recover.go index c3c6f2a9..7ebd5bf9 100644 --- a/suite/sanity/loss_recover.go +++ b/suite/sanity/loss_recover.go @@ -84,13 +84,13 @@ func lossAndRecovery(p interface{}) (gravity.TestFunc, error) { nodes := cluster.Nodes[0:param.NodeCount] g.OK("install", g.OfflineInstall(nodes, param.InstallParam)) - g.OK("install status", g.Status(nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(nodes)) nodes, removed, err := removeNode(g, nodes, param.ReplaceNodeType, param.PowerOff) g.OK(fmt.Sprintf("node for removal=%v, poweroff=%v", removed, param.PowerOff), err) now := time.Now() - g.OK("wait for cluster to be ready", g.Status(nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(nodes)) g.Logger().WithFields(logrus.Fields{"nodes": nodes, "elapsed": fmt.Sprintf("%v", time.Since(now))}). 
Info("cluster is available") @@ -105,10 +105,10 @@ func lossAndRecovery(p interface{}) (gravity.TestFunc, error) { Info("roles after expand") g.OK("remove node", g.RemoveNode(nodes[0], removed)) - g.OK("remove status", g.Status(nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(nodes)) } else { g.OK("remove lost node", g.RemoveNode(nodes[0], removed)) - g.OK("remove status", g.Status(nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(nodes)) roles, err := g.NodesByRole(nodes) g.OK("node role after remove", err) diff --git a/suite/sanity/resize.go b/suite/sanity/resize.go index 716271c9..79f069b6 100644 --- a/suite/sanity/resize.go +++ b/suite/sanity/resize.go @@ -40,12 +40,12 @@ func resize(p interface{}) (gravity.TestFunc, error) { g.OK("download installer", g.SetInstaller(cluster.Nodes, cfg.InstallerURL, "install")) g.OK(fmt.Sprintf("install on %d node", param.NodeCount), - g.OfflineInstall(cluster.Nodes[0:param.NodeCount], param.InstallParam)) - g.OK("status", g.Status(cluster.Nodes[0:param.NodeCount])) + g.OfflineInstall(cluster.Nodes[:param.NodeCount], param.InstallParam)) + g.OK("wait for active status", g.WaitForActiveStatus(cluster.Nodes[:param.NodeCount])) g.OK("time sync", g.CheckTimeSync(cluster.Nodes)) g.OK(fmt.Sprintf("expand to %d nodes", param.ToNodes), - g.Expand(cluster.Nodes[0:param.NodeCount], cluster.Nodes[param.NodeCount:param.ToNodes], + g.Expand(cluster.Nodes[:param.NodeCount], cluster.Nodes[param.NodeCount:param.ToNodes], param.InstallParam)) - g.OK("status", g.Status(cluster.Nodes[0:param.ToNodes])) + g.OK("wait for active status", g.WaitForActiveStatus(cluster.Nodes[:param.ToNodes])) }, nil } diff --git a/suite/sanity/shrink.go b/suite/sanity/shrink.go index b6899525..0050ab55 100644 --- a/suite/sanity/shrink.go +++ b/suite/sanity/shrink.go @@ -29,19 +29,19 @@ func shrink(p interface{}) (gravity.TestFunc, error) { g.OK("Download installer.", g.SetInstaller(all, cfg.InstallerURL, "install")) g.OK("Install.", g.OfflineInstall(others, param.InstallParam)) - g.OK("Install status.", g.Status(others)) + g.OK("Wait for active status.", g.WaitForActiveStatus(others)) joinParam := param.InstallParam joinParam.Role = "knode" g.OK("Expand.", g.Expand(others, target, joinParam)) - g.OK("Expand status.", g.Status(all)) + g.OK("Wait for active status.", g.WaitForActiveStatus(all)) roles, err := g.NodesByRole(all) g.OK("Query roles.", err) g.Logger().WithFields(logrus.Fields{"roles": roles, "nodes": all}).Info("Node roles after expand.") g.OK("Shrink.", g.Shrink(others, target)) - g.OK("Shrink status.", g.Status(others)) + g.OK("Wait for active status.", g.WaitForActiveStatus(others)) }, nil } diff --git a/suite/sanity/upgrade.go b/suite/sanity/upgrade.go index fc35307b..04cc4927 100644 --- a/suite/sanity/upgrade.go +++ b/suite/sanity/upgrade.go @@ -35,8 +35,8 @@ func upgrade(p interface{}) (gravity.TestFunc, error) { g.OK("base installer", g.SetInstaller(cluster.Nodes, param.BaseInstallerURL, "base")) g.OK("install", g.OfflineInstall(cluster.Nodes, param.InstallParam)) - g.OK("status", g.Status(cluster.Nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(cluster.Nodes)) g.OK("upgrade", g.Upgrade(cluster.Nodes, cfg.InstallerURL, cfg.GravityURL, "upgrade")) - g.OK("status", g.Status(cluster.Nodes)) + g.OK("wait for active status", g.WaitForActiveStatus(cluster.Nodes)) }, nil }