diff --git a/app/cron.yaml b/app/cron.yaml index a19bd9ebac..e132418fb1 100644 --- a/app/cron.yaml +++ b/app/cron.yaml @@ -8,3 +8,6 @@ cron: - description: refresh chromebot build status url: /api/refresh-chromebot-status schedule: every 3 minutes +- description: clean up stale datastore records + url: /api/vacuum-clean + schedule: every 10 minutes diff --git a/app/main.go b/app/main.go index c02c70820d..043f0b8d0f 100644 --- a/app/main.go +++ b/app/main.go @@ -34,6 +34,7 @@ func init() { registerRPC("/api/refresh-chromebot-status", commands.RefreshChromebotStatus) registerRPC("/api/reserve-task", commands.ReserveTask) registerRPC("/api/update-task-status", commands.UpdateTaskStatus) + registerRPC("/api/vacuum-clean", commands.VacuumClean) registerRawHandler("/api/append-log", commands.AppendLog) registerRawHandler("/api/get-log", commands.GetLog) diff --git a/commands/refresh_github_commits.go b/commands/refresh_github_commits.go index 698fa110a8..8a4255c5a2 100644 --- a/commands/refresh_github_commits.go +++ b/commands/refresh_github_commits.go @@ -8,7 +8,6 @@ import ( "cocoon/db" "encoding/json" "io/ioutil" - "time" "golang.org/x/net/context" @@ -64,7 +63,7 @@ func RefreshGithubCommits(cocoon *db.Cocoon, inputJSON []byte) (interface{}, err // Sync to datastore var commitResults []CommitSyncResult commitResults = make([]CommitSyncResult, len(commits), len(commits)) - nowMillisSinceEpoch := time.Now().UnixNano() / 1000000 + nowMillisSinceEpoch := db.NowMillis() // To be able to use `CreateTimestamp` field for sorting topologically we have // to save ranges of commits with no gaps. Therefore we save all of them in diff --git a/commands/reserve_task.go b/commands/reserve_task.go index 869a86f9df..e34121eb7f 100644 --- a/commands/reserve_task.go +++ b/commands/reserve_task.go @@ -8,7 +8,6 @@ import ( "cocoon/db" "encoding/json" "fmt" - "time" "golang.org/x/net/context" @@ -152,7 +151,8 @@ func atomicallyReserveTask(cocoon *db.Cocoon, taskKey *datastore.Key, agent *db. } task.Status = "In Progress" - task.StartTimestamp = time.Now().UnixNano() / 1000000 + task.Attempts++ + task.StartTimestamp = db.NowMillis() task.ReservedForAgentID = agent.AgentID taskEntity, err = txc.PutTask(taskEntity.Key, task) return err diff --git a/commands/update_task_status.go b/commands/update_task_status.go index a7ee4a93b8..00b0d9237e 100644 --- a/commands/update_task_status.go +++ b/commands/update_task_status.go @@ -44,7 +44,21 @@ func UpdateTaskStatus(c *db.Cocoon, inputJSON []byte) (interface{}, error) { return nil, err } - task.Task.Status = db.TaskStatusByName(command.NewStatus) + newStatus := db.TaskStatusByName(command.NewStatus) + + if newStatus != db.TaskFailed { + task.Task.Status = newStatus + } else { + // Attempt to deflake the test by giving another chance. + if task.Task.Attempts >= db.MaxAttempts { + task.Task.Status = db.TaskFailed + task.Task.Reason = "Task failed on agent" + } else { + // This will cause this task to be picked up by an agent again. + task.Task.Status = db.TaskNew + task.Task.StartTimestamp = 0 + } + } c.PutTask(task.Key, task.Task) diff --git a/commands/vacuum_clean.go b/commands/vacuum_clean.go new file mode 100644 index 0000000000..fb3952785f --- /dev/null +++ b/commands/vacuum_clean.go @@ -0,0 +1,66 @@ +// Copyright (c) 2016 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package commands + +import "cocoon/db" + +// VacuumClean cleans up stale datastore records. +func VacuumClean(cocoon *db.Cocoon, inputJSON []byte) (interface{}, error) { + entities, err := cocoon.QueryAllPendingTasks() + + if err != nil { + return nil, err + } + + for _, fullTask := range entities { + switch fullTask.TaskEntity.Task.Status { + case db.TaskNew: + err = vacuumNewTask(cocoon, fullTask) + case db.TaskInProgress: + err = vacuumInProgressTask(cocoon, fullTask) + } + } + + if err != nil { + return nil, err + } + + return "OK", nil +} + +var oneHourMillis = int64(3600 * 1000) +var fourDaysMillis = 4 * 24 * oneHourMillis + +// If a task is sitting in "New" status for days, chances are there's no agent +// that's capable of running it, perhaps requirements are too strict for any +// existing agent to satisfy. +func vacuumNewTask(cocoon *db.Cocoon, fullTask *db.FullTask) error { + task := fullTask.TaskEntity.Task + if task.AgeInMillis() > fourDaysMillis { + task.Status = db.TaskFailed + task.Reason = "No agent accepted this task in 4 days" + cocoon.PutTask(fullTask.TaskEntity.Key, task) + } + return nil +} + +// If a task is "In Progress" for too long, chances are the agent is stuck or is +// unable to report the results. Give it another chance, perhaps on another +// build agent. +func vacuumInProgressTask(cocoon *db.Cocoon, fullTask *db.FullTask) error { + task := fullTask.TaskEntity.Task + if db.NowMillis()-task.StartTimestamp > oneHourMillis { + if task.Attempts >= db.MaxAttempts { + task.Status = db.TaskFailed + task.Reason = "Task timed out after 1 hour" + } else { + // This will cause this task to be picked up by an agent again. + task.Status = db.TaskNew + task.StartTimestamp = 0 + } + cocoon.PutTask(fullTask.TaskEntity.Key, task) + } + return nil +} diff --git a/db/db.go b/db/db.go index ba42a8a5c8..75286626f2 100644 --- a/db/db.go +++ b/db/db.go @@ -179,6 +179,15 @@ type FullTask struct { ChecklistEntity *ChecklistEntity } +// QueryAllPendingTasks queries all tasks that are not in a final state. +// +// The query goes back up to 20 checklists. +// +// See also IsFinal. +func (c *Cocoon) QueryAllPendingTasks() ([]*FullTask, error) { + return c.QueryPendingTasks("") +} + // QueryPendingTasks lists the latest tasks with the given name that are not yet // in a final status. // @@ -194,9 +203,13 @@ func (c *Cocoon) QueryPendingTasks(taskName string) ([]*FullTask, error) { for i := len(checklists) - 1; i >= 0; i-- { query := datastore.NewQuery("Task"). Ancestor(checklists[i].Key). - Filter("Name =", taskName). Order("-CreateTimestamp"). Limit(20) + + if taskName != "" { + query = query.Filter("Name =", taskName) + } + candidates, err := c.runTaskQuery(query) if err != nil { @@ -514,3 +527,13 @@ func (c *Cocoon) PutLogChunk(ownerKey *datastore.Key, data []byte) error { _, err := datastore.Put(c.Ctx, key, chunk) return err } + +// NowMillis returns the number of milliseconds since the UNIX epoch. +func NowMillis() int64 { + return time.Now().UnixNano() / 1000000 +} + +// AgeInMillis returns the current age of the task in milliseconds. +func (t *Task) AgeInMillis() int64 { + return NowMillis() - t.CreateTimestamp +} diff --git a/db/schema.go b/db/schema.go index b1aa84523a..4671157bbb 100644 --- a/db/schema.go +++ b/db/schema.go @@ -68,12 +68,23 @@ type Task struct { // Capabilities an agent must have to be able to perform this task. RequiredCapabilities []string Status TaskStatus - ReservedForAgentID string - CreateTimestamp int64 - StartTimestamp int64 - EndTimestamp int64 + + // Explains the value of the current task Status. For example, if Status is + // "Failed", then Reason might be "Timed out". + Reason string + + // The number of times Cocoon attempted to run the Task. + Attempts int64 + ReservedForAgentID string + CreateTimestamp int64 + StartTimestamp int64 + EndTimestamp int64 } +// MaxAttempts is the maximum number of times a single task will be attempted +// before giving up on it. +const MaxAttempts = 3 + // TaskStatus indicates the status of a task. type TaskStatus string