Skip to content

Commit

Permalink
fail fast autostarts (#293)
Browse files Browse the repository at this point in the history
* fail fast autostarts

* fixing commit merge

* refactoring per discussion
  • Loading branch information
autodidaddict committed Jun 25, 2024
1 parent b2539b9 commit 94eb148
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
1 change: 1 addition & 0 deletions internal/models/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ type AutostartDeployRequest struct {
Namespace string `json:"namespace"`
Argv []string `json:"argv,omitempty"`
Description *string `json:"description,omitempty"`
Essential bool `json:"essential,omitempty"`
WorkloadType controlapi.NexWorkload `json:"type"`
Location string `json:"location"`
JsDomain *string `json:"jsdomain,omitempty"`
Expand Down
26 changes: 24 additions & 2 deletions internal/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const (
publicNATSServerStartTimeout = 50 * time.Millisecond
runloopSleepInterval = 100 * time.Millisecond
runloopTickInterval = 2500 * time.Millisecond
agentPoolRetryMax = 3
)

// Nex node process
Expand Down Expand Up @@ -323,23 +324,36 @@ func (n *Node) startPublicNATS() error {
}

func (n *Node) handleAutostarts() {
successCount := 0
for _, autostart := range n.config.AutostartConfiguration.Workloads {
var agentClient *agentapi.AgentClient
var err error

retry := 0
for agentClient == nil {
agentClient, err = n.manager.SelectRandomAgent()
if err != nil {
n.log.Warn("Failed to resolve agent for autostart", slog.String("error", err.Error()))
time.Sleep(25 * time.Millisecond)
time.Sleep(50 * time.Millisecond)
retry += 1
if retry > agentPoolRetryMax {
n.log.Error("Exceeded warm agent retrieval retry count, terminating node",
slog.Int("allowed_retries", agentPoolRetryMax),
)
n.shutdown()
return
}
}
}

// functions cannot be essential
essential := autostart.Essential && autostart.WorkloadType == controlapi.NexWorkloadNative

request, err := controlapi.NewDeployRequest(
controlapi.Argv(autostart.Argv),
controlapi.Location(autostart.Location),
controlapi.Environment(autostart.Environment),
controlapi.Essential(false), // avoid startup flapping, also not supported for funcs
controlapi.Essential(essential),
controlapi.Issuer(n.issuerKeypair),
controlapi.SenderXKey(n.api.xk),
controlapi.TargetNode(n.publicKey),
Expand Down Expand Up @@ -368,6 +382,7 @@ func (n *Node) handleAutostarts() {
continue
}

// TODO: add potential backoff and retry to cacheworkload
numBytes, workloadHash, err := n.api.mgr.CacheWorkload(agentClient.ID(), request)
if err != nil {
n.api.log.Error("Failed to cache auto-start workload bytes",
Expand Down Expand Up @@ -397,6 +412,13 @@ func (n *Node) handleAutostarts() {
slog.String("namespace", autostart.Namespace),
slog.String("workload_id", agentClient.ID()),
)
successCount += 1
}
if successCount < len(n.config.AutostartConfiguration.Workloads) {
n.log.Error("Not all startup workloads suceeded",
slog.Int("expected", len(n.config.AutostartConfiguration.Workloads)),
slog.Int("actual", successCount),
)
}
}

Expand Down

0 comments on commit 94eb148

Please sign in to comment.