From c89b47786bef94ff286894337d7655ea854d1620 Mon Sep 17 00:00:00 2001 From: Yegor Jbanov Date: Thu, 4 Aug 2016 13:16:12 -0700 Subject: [PATCH] report & show agent health; color-code flaky successes Fixes https://github.com/flutter/cocoon/issues/15 --- agent/bin/agent.dart | 2 +- agent/lib/src/adb.dart | 16 ++++++ agent/lib/src/agent.dart | 45 ++++++++++++++- agent/lib/src/commands/ci.dart | 73 ++++++++++++++++++++---- agent/lib/src/firebase.dart | 14 ++++- agent/lib/src/utils.dart | 25 +++++++++ app/lib/components/status_table.dart | 83 ++++++++++++++++++++++++++-- app/lib/entity.dart | 12 ++++ app/lib/model.dart | 23 ++++++++ app/main.go | 1 + app/web/buildStyles.css | 39 ++++++++++++- commands/get_status.go | 21 +++++-- commands/update_agent_health.go | 49 ++++++++++++++++ db/db.go | 41 ++++++++++++++ db/schema.go | 10 ++++ 15 files changed, 426 insertions(+), 28 deletions(-) create mode 100644 commands/update_agent_health.go diff --git a/agent/bin/agent.dart b/agent/bin/agent.dart index 2ecf78ab72..d69b1c9b41 100644 --- a/agent/bin/agent.dart +++ b/agent/bin/agent.dart @@ -59,7 +59,7 @@ Future main(List rawArgs) async { exit(1); } - print('Agent configuration:'); + section('Agent configuration:'); print(config); await command.run(args.command); diff --git a/agent/lib/src/adb.dart b/agent/lib/src/adb.dart index 308c7af3ba..a255628902 100644 --- a/agent/lib/src/adb.dart +++ b/agent/lib/src/adb.dart @@ -46,6 +46,22 @@ class Adb { // 015d172c98400a03 device usb:340787200X product:nakasi model:Nexus_7 device:grouper static final RegExp _kDeviceRegex = new RegExp(r'^(\S+)\s+(\S+)(.*)'); + static Future> checkDevices() async { + Map results = {}; + for (String deviceId in await deviceIds) { + try { + Adb device = new Adb(deviceId: deviceId); + // Just a smoke test that we can read wakefulness state + // TODO(yjbanov): check battery level + await device._getWakefulness(); + results['android-device-$deviceId'] = new HealthCheckResult.success(); + } catch(e, s) { + 
results['android-device-$deviceId'] = new HealthCheckResult.error(e, s); + } + } + return results; + } + static Future> get deviceIds async { List output = (await eval(config.adbPath, ['devices', '-l'], canFail: false)) .trim().split('\n'); diff --git a/agent/lib/src/agent.dart b/agent/lib/src/agent.dart index 8e154d1db8..61ea1de5c9 100644 --- a/agent/lib/src/agent.dart +++ b/agent/lib/src/agent.dart @@ -127,8 +127,16 @@ class Agent { }); } - Future> getAuthenticationStatus() async { - return await _cocoon('get-authentication-status'); + Future getAuthenticationStatus() async { + return (await _cocoon('get-authentication-status'))['Status']; + } + + Future updateHealthStatus(AgentHealth health) async { + await _cocoon('update-agent-health', { + 'AgentID': agentId, + 'IsHealthy': health.ok, + 'HealthDetails': '$health', + }); } } @@ -162,3 +170,36 @@ abstract class Command { Future run(ArgResults args); } + +/// Overall health of the agent. +class AgentHealth { + /// Check results keyed by parameter. + final Map checks = {}; + + /// Whether all [checks] succeeded. + bool get ok => checks.isNotEmpty && checks.values.every((HealthCheckResult r) => r.succeeded); + + /// Sets a health check [result] for a given [parameter]. + operator []=(String parameter, HealthCheckResult result) { + if (checks.containsKey(parameter)) { + print('WARNING: duplicate health check ${parameter} submitted'); + } + checks[parameter] = result; + } + + void addAll(Map checks) { + checks.forEach((String p, HealthCheckResult r) { + this[p] = r; + }); + } + + /// Human-readable printout of the agent's health status. 
+ @override + String toString() { + StringBuffer buf = new StringBuffer(); + checks.forEach((String parameter, HealthCheckResult result) { + buf.writeln('$parameter: $result'); + }); + return buf.toString(); + } +} diff --git a/agent/lib/src/commands/ci.dart b/agent/lib/src/commands/ci.dart index 5c9be096f7..b0435acc2c 100644 --- a/agent/lib/src/commands/ci.dart +++ b/agent/lib/src/commands/ci.dart @@ -31,10 +31,36 @@ class ContinuousIntegrationCommand extends Command { @override Future run(ArgResults args) async { - await _performPreflightChecks(); + // Perform one pre-flight round of checks and quit immediately if something + // is wrong. + AgentHealth health = await _performHealthChecks(); + section('Pre-flight checks:'); + print(health); + + if (!health.ok) { + print('Some pre-flight checks failed. Quitting.'); + exit(1); + } + + // Start CI mode + section('Started continuous integration:'); _listenToShutdownSignals(); while(!_exiting) { try { + // Check health before requesting a new task. + health = await _performHealthChecks(); + + // Always upload health status whether succeeded or failed. + await agent.updateHealthStatus(health); + + if (!health.ok) { + print('Some health checks failed:'); + print(health); + await new Future.delayed(_sleepBetweenBuilds); + // Don't bother requesting new tasks if health is bad. + continue; + } + CocoonTask task = await agent.reserveTask(); try { if (task != null) { @@ -67,22 +93,45 @@ class ContinuousIntegrationCommand extends Command { await forceQuitRunningProcesses(); } - // TODO(yjbanov): report health status after running the task await new Future.delayed(_sleepBetweenBuilds); } } - Future _performPreflightChecks() async { - print('Pre-flight checks:'); - await pickNextDevice(); - print(' - device connected'); - await checkFirebaseConnection(); - print(' - firebase connected'); - if (!(await agent.getAuthenticationStatus())['Status'] == 'OK') { - throw 'Failed to authenticate to Cocoon. 
Check config.yaml.'; + Future _performHealthChecks() async { + AgentHealth results = new AgentHealth(); + try { + results['firebase-connection'] = await checkFirebaseConnection(); + + Map deviceChecks = await Adb.checkDevices(); + results.addAll(deviceChecks); + + int healthyDeviceCount = deviceChecks.values + .where((HealthCheckResult r) => r.succeeded) + .length; + + results['has-healthy-devices'] = healthyDeviceCount > 0 + ? new HealthCheckResult.success('Found ${healthyDeviceCount} healthy devices') + : new HealthCheckResult.failure('No healthy devices found'); + + try { + String authStatus = await agent.getAuthenticationStatus(); + results['cocoon-connection'] = new HealthCheckResult.success(); + + if (authStatus != 'OK') { + results['cocoon-authentication'] = new HealthCheckResult.failure('Failed to authenticate to Cocoon. Check config.yaml.'); + } else { + results['cocoon-authentication'] = new HealthCheckResult.success(); + } + } catch(e, s) { + results['cocoon-connection'] = new HealthCheckResult.error(e, s); + } + + results['able-to-perform-health-check'] = new HealthCheckResult.success(); + } catch(e, s) { + results['able-to-perform-health-check'] = new HealthCheckResult.error(e, s); } - print(' - Cocoon auth OK'); - print('Pre-flight OK'); + + return results; } /// Listens to standard output and upload logs to Cocoon in semi-realtime. diff --git a/agent/lib/src/firebase.dart b/agent/lib/src/firebase.dart index c9297295dc..4be0a49355 100644 --- a/agent/lib/src/firebase.dart +++ b/agent/lib/src/firebase.dart @@ -16,9 +16,17 @@ Firebase _measurements() { auth: firebaseToken); } -Future checkFirebaseConnection() async { - if (await _measurements().child('dashboard_bot_status').child('current').get() == null) { - throw 'Connection to Firebase is unhealthy. 
Failed to read the current dashboard_bot_status entity.'; +Future checkFirebaseConnection() async { + try { + if ((await _measurements().child('dashboard_bot_status').child('current').get()).val == null) { + return new HealthCheckResult.failure( + 'Connection to Firebase is unhealthy. Failed to read the current dashboard_bot_status entity.' + ); + } else { + return new HealthCheckResult.success(); + } + } catch (e, s) { + return new HealthCheckResult.error(e, s); } } diff --git a/agent/lib/src/utils.dart b/agent/lib/src/utils.dart index 621502b43e..0bafc358a7 100644 --- a/agent/lib/src/utils.dart +++ b/agent/lib/src/utils.dart @@ -38,6 +38,31 @@ class ProcessInfo { } } +/// Result of a health check for a specific parameter. +class HealthCheckResult { + HealthCheckResult.success([this.details]) : succeeded = true; + HealthCheckResult.failure(this.details) : succeeded = false; + HealthCheckResult.error(dynamic error, dynamic stackTrace) + : succeeded = false, + details = 'ERROR: $error\n${stackTrace ?? ''}'; + + final bool succeeded; + final String details; + + @override + String toString() { + StringBuffer buf = new StringBuffer(succeeded ? 'succeeded' : 'failed'); + if (details != null && details.trim().isNotEmpty) { + buf.writeln(); + // Indent details by 4 spaces + for (String line in details.trim().split('\n')) { + buf.writeln(' $line'); + } + } + return '$buf'; + } +} + class BuildFailedError extends Error { BuildFailedError(this.message); diff --git a/app/lib/components/status_table.dart b/app/lib/components/status_table.dart index ce197a4b68..2d38bc8eb4 100644 --- a/app/lib/components/status_table.dart +++ b/app/lib/components/status_table.dart @@ -16,6 +16,34 @@ import 'package:http/http.dart' as http;
Loading...
+ +
+
Agents
+
+ {{agentStatus.agentId}} +
+
+ +
+
+ [X] +
+
+ {{displayedAgentStatus.agentId}} + {{isAgentHealthy(displayedAgentStatus) ? "☺" : "☹"}} +
+
+ Last health check: {{displayedAgentStatus.healthCheckTimestamp}} + {{agentHealthCheckAge(displayedAgentStatus.healthCheckTimestamp)}} +
+
Details:
+
{{displayedAgentStatus.healthDetails}}
+
+ > resultMatrix = >{}; + List agentStatuses; + @override ngOnInit() async { reloadData(); @@ -73,6 +105,7 @@ class StatusTable implements OnInit { GetStatusResult statusResult = GetStatusResult.fromJson(statusJson); isLoading = false; + agentStatuses = statusResult.agentStatuses ?? []; List statuses = statusResult.statuses ?? []; headerCol = []; headerRow = new HeaderRow(); @@ -95,7 +128,7 @@ class StatusTable implements OnInit { return fullSha.length > 7 ? fullSha.substring(0, 7) : fullSha; } - List taskStatusToCssStyle(String taskStatus) { + List taskStatusToCssStyle(String taskStatus, int attempts) { const statusMap = const { 'New': 'task-new', 'In Progress': 'task-in-progress', @@ -104,7 +137,15 @@ class StatusTable implements OnInit { 'Underperformed': 'task-underperformed', 'Skipped': 'task-skipped', }; - return ['task-status-circle', statusMap[taskStatus] ?? 'task-unknown']; + + String cssClass; + if (taskStatus == 'Succeeded' && attempts > 1) { + cssClass = 'task-succeeded-but-flaky'; + } else { + cssClass = statusMap[taskStatus] ?? 'task-unknown'; + } + + return ['task-status-circle', cssClass]; } TaskEntity _findTask(String sha, String taskName) { @@ -120,9 +161,43 @@ class StatusTable implements OnInit { TaskEntity taskEntity = _findTask(sha, taskName); if (taskEntity == null) - return taskStatusToCssStyle('Skipped'); + return taskStatusToCssStyle('Skipped', 0); + + return taskStatusToCssStyle(taskEntity.task.status, taskEntity.task.attempts); + } + + List getAgentStyle(AgentStatus status) { + return [ + 'agent-chip', + isAgentHealthy(status) ? 'agent-healthy' : 'agent-unhealthy', + ]; + } + + /// An agent is considered healthy if the latest health report was OK and is + /// up-to-date. 
+ bool isAgentHealthy(AgentStatus status) { + return status.isHealthy && status.healthCheckTimestamp != null && + new DateTime.now().difference(status.healthCheckTimestamp) < maxHealthCheckAge; + } + + AgentStatus displayedAgentStatus; + + void showAgentHealthDetails(AgentStatus agentStatus) { + displayedAgentStatus = agentStatus; + } + + void hideAgentHealthDetails() { + displayedAgentStatus = null; + } - return taskStatusToCssStyle(taskEntity.task.status); + String agentHealthCheckAge(DateTime dt) { + if (dt == null) + return ''; + Duration age = new DateTime.now().difference(dt); + String ageQualifier = age > maxHealthCheckAge + ? 'out-of-date!!!' + : 'old'; + return '(${age.inMinutes} minutes $ageQualifier)'; } void openLog(String sha, String taskName) { diff --git a/app/lib/entity.dart b/app/lib/entity.dart index 0d2cb0be3b..f07a9067e3 100644 --- a/app/lib/entity.dart +++ b/app/lib/entity.dart @@ -30,6 +30,9 @@ abstract class JsonSerializer { /// Serializes strings. StringSerializer string() => const StringSerializer(); +/// Serializes booleans. +BoolSerializer boolean() => const BoolSerializer(); + /// Serializes ints and doubles. 
NumSerializer number() => const NumSerializer(); @@ -83,6 +86,15 @@ class StringSerializer implements JsonSerializer { dynamic serialize(String value) => value; } +class BoolSerializer implements JsonSerializer { + const BoolSerializer(); + + bool deserialize(dynamic jsonValue) { + return jsonValue as bool; + } + dynamic serialize(bool value) => value; +} + class NumSerializer implements JsonSerializer { const NumSerializer(); diff --git a/app/lib/model.dart b/app/lib/model.dart index d5be569f19..adacde331a 100644 --- a/app/lib/model.dart +++ b/app/lib/model.dart @@ -36,6 +36,7 @@ class GetStatusResult extends Entity { (Map props) => new GetStatusResult(props), { 'Statuses': listOf(BuildStatus._serializer), + 'AgentStatuses': listOf(AgentStatus._serializer), } ); @@ -45,6 +46,7 @@ class GetStatusResult extends Entity { GetStatusResult([Map props]) : super(_serializer, props); List get statuses => this['Statuses']; + List get agentStatuses => this['AgentStatuses']; } class BuildStatus extends Entity { @@ -62,6 +64,25 @@ class BuildStatus extends Entity { List get stages => this['Stages']; } +class AgentStatus extends Entity { + static final _serializer = new EntitySerializer( + (Map props) => new AgentStatus(props), + { + 'AgentID': string(), + 'IsHealthy': boolean(), + 'HealthCheckTimestamp': dateTime(), + 'HealthDetails': string(), + } + ); + + AgentStatus([Map props]) : super(_serializer, props); + + String get agentId => this['AgentID']; + bool get isHealthy => this['IsHealthy']; + DateTime get healthCheckTimestamp => this['HealthCheckTimestamp']; + String get healthDetails => this['HealthDetails']; +} + class CommitInfo extends Entity { static final _serializer = new EntitySerializer( (Map props) => new CommitInfo(props), @@ -164,6 +185,7 @@ class Task extends Entity { 'Status': string(), 'StartTimestamp': dateTime(), 'EndTimestamp': dateTime(), + 'Attempts': number(), } ); @@ -175,4 +197,5 @@ class Task extends Entity { String get status => this['Status']; 
DateTime get startTimestamp => this['StartTimestamp']; DateTime get endTimestamp => this['EndTimestamp']; + int get attempts => this['Attempts']; } diff --git a/app/main.go b/app/main.go index 043f0b8d0f..43a673f952 100644 --- a/app/main.go +++ b/app/main.go @@ -33,6 +33,7 @@ func init() { registerRPC("/api/refresh-travis-status", commands.RefreshTravisStatus) registerRPC("/api/refresh-chromebot-status", commands.RefreshChromebotStatus) registerRPC("/api/reserve-task", commands.ReserveTask) + registerRPC("/api/update-agent-health", commands.UpdateAgentHealth) registerRPC("/api/update-task-status", commands.UpdateTaskStatus) registerRPC("/api/vacuum-clean", commands.VacuumClean) diff --git a/app/web/buildStyles.css b/app/web/buildStyles.css index 4e7682fa57..5dfda47353 100644 --- a/app/web/buildStyles.css +++ b/app/web/buildStyles.css @@ -298,12 +298,16 @@ td.stats-value { background-color: green; cursor: pointer; } +.task-succeeded-but-flaky { + background-color: #cccc00; + cursor: pointer; +} .task-failed { background-color: red; cursor: pointer; } .task-underperformed { - background-color: yellow; + background-color: orange; cursor: pointer; } .task-skipped { @@ -322,3 +326,36 @@ td.stats-value { transform: rotateZ(180deg); } } + +.agent-bar { + display: flex; + margin-bottom: 10px; +} + +.agent-bar > * { + align-self: center; + padding: 7px; +} + +.agent-chip { + border-radius: 2px; + margin-left: 10px; + color: white; + cursor: pointer; +} + +.agent-healthy { + background-color: #33CC33; +} + +.agent-unhealthy { + background-color: #CC3333; +} + +.agent-health-details-card { + position: relative; + background-color: #DDD; + border-radius: 2px; + padding: 15px; + margin-bottom: 10px; +} diff --git a/commands/get_status.go b/commands/get_status.go index 0656c42992..4d53355ea2 100644 --- a/commands/get_status.go +++ b/commands/get_status.go @@ -12,7 +12,8 @@ type GetStatusCommand struct { // GetStatusResult contains dashboard status. 
type GetStatusResult struct { - Statuses []*BuildStatus + Statuses []*BuildStatus + AgentStatuses []*db.AgentStatus } // BuildStatus contains build status information about a particular checklist. @@ -32,10 +33,11 @@ func GetStatus(c *db.Cocoon, inputJSON []byte) (interface{}, error) { var statuses []*BuildStatus for _, checklist := range checklists { - stages, err := c.QueryTasksGroupedByStage(checklist.Key) + // Need to define another error variable to not "shadow" the other one, Go figure! + stages, errr := c.QueryTasksGroupedByStage(checklist.Key) - if err != nil { - return nil, err + if errr != nil { + return nil, errr } statuses = append(statuses, &BuildStatus{ @@ -44,5 +46,14 @@ func GetStatus(c *db.Cocoon, inputJSON []byte) (interface{}, error) { }) } - return &GetStatusResult{statuses}, nil + agentStatuses, err := c.QueryAgentStatuses() + + if err != nil { + return nil, err + } + + return &GetStatusResult{ + Statuses: statuses, + AgentStatuses: agentStatuses, + }, nil } diff --git a/commands/update_agent_health.go b/commands/update_agent_health.go new file mode 100644 index 0000000000..5334858e7f --- /dev/null +++ b/commands/update_agent_health.go @@ -0,0 +1,49 @@ +// Copyright (c) 2016 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package commands + +import ( + "cocoon/db" + "encoding/json" + "fmt" +) + +// UpdateAgentHealthCommand updates health status of an agent. +type UpdateAgentHealthCommand struct { + AgentID string + IsHealthy bool // overall health status + HealthDetails string // a human-readable printout health details +} + +// UpdateAgentHealth updates health status of an agent. 
+func UpdateAgentHealth(cocoon *db.Cocoon, inputJSON []byte) (interface{}, error) { + agent := cocoon.CurrentAgent() + + if agent == nil { + return nil, fmt.Errorf("This command must be executed by an agent") + } + + var command *UpdateAgentHealthCommand + err := json.Unmarshal(inputJSON, &command) + + if err != nil { + return nil, err + } + + if agent.AgentID != command.AgentID { + messageFormat := "Currently signed in agent's ID (%v) does not match agent ID supplied in the request (%v)" + return nil, fmt.Errorf(messageFormat, agent.AgentID, command.AgentID) + } + + agent.IsHealthy = command.IsHealthy + agent.HealthDetails = command.HealthDetails + agent.HealthCheckTimestamp = db.NowMillis() + + if err := cocoon.UpdateAgent(agent); err != nil { + return nil, err + } + + return "OK", nil +} diff --git a/db/db.go b/db/db.go index 75286626f2..ebfe0d00c0 100644 --- a/db/db.go +++ b/db/db.go @@ -373,6 +373,47 @@ func (c *Cocoon) GetAgentByAuthToken(agentID string, authToken string) (*Agent, return agent, nil } +// QueryAgentStatuses fetches statuses for all agents. +func (c *Cocoon) QueryAgentStatuses() ([]*AgentStatus, error) { + query := datastore.NewQuery("Agent").Order("AgentID") + var buffer []*AgentStatus + for iter := query.Run(c.Ctx); ; { + var agent Agent + _, err := iter.Next(&agent) + if err == datastore.Done { + break + } else if err != nil { + return nil, err + } + + buffer = append(buffer, &AgentStatus{ + AgentID: agent.AgentID, + IsHealthy: agent.IsHealthy, + HealthDetails: agent.HealthDetails, + HealthCheckTimestamp: agent.HealthCheckTimestamp, + Capabilities: agent.Capabilities, + }) + } + return buffer, nil +} + +// UpdateAgent updates an agent record. 
+func (c *Cocoon) UpdateAgent(agent *Agent) error { + agentKey := c.newAgentKey(agent.AgentID) + originalAgent, err := c.GetAgent(agent.AgentID) + + if err != nil { + return err + } + + // Do not allow updating the auth token + // TODO(yjbanov): auth token can be moved to a child entity, avoiding this problem. + agent.AuthTokenHash = originalAgent.AuthTokenHash + + _, err = datastore.Put(c.Ctx, agentKey, agent) + return err +} + var urlSafeChars = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") // Generates a token along with its hash for storing in the database. The diff --git a/db/schema.go b/db/schema.go index 4671157bbb..6ab78b650c 100644 --- a/db/schema.go +++ b/db/schema.go @@ -115,10 +115,20 @@ type Agent struct { AgentID string IsHealthy bool HealthCheckTimestamp int64 + HealthDetails string // a human-readable printout of health details AuthTokenHash []byte Capabilities []string } +// AgentStatus contains agent health status. +type AgentStatus struct { + AgentID string + IsHealthy bool + HealthCheckTimestamp int64 + HealthDetails string + Capabilities []string +} + // WhitelistedAccount gives permission to access the dashboard to a specific // Google account. //