Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 160 additions & 36 deletions api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions api/proto/teleport/autoupdate/v1/autoupdate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ message AgentAutoUpdateGroup {
// wait_hours after last group succeeds before this group can run. This can only be used when the strategy is "halt-on-failure".
// This field must be positive.
int32 wait_hours = 5;
// canary_count is the number of canary agents that will be updated before the whole group is updated.
// When set to 0, the group does not enter the canary phase. This number is capped at 5.
// This number must always be lower than the total number of agents in the group, else the rollout will be stuck.
int32 canary_count = 6;
}

// AutoUpdateVersion is a resource singleton with version required for
Expand Down Expand Up @@ -230,6 +234,12 @@ message AutoUpdateAgentRolloutStatusGroup {
// to the done state if:
// - the ratio present_count/initial_count is above 0.9 (no more than 10% of the nodes dropped during update)
uint64 up_to_date_count = 12;
// canary_count represents how many canaries this group should have to leave the AUTO_UPDATE_AGENT_GROUP_STATE_CANARY
// state.
uint64 canary_count = 13;
// canaries is the list of canary agents that should be updated.
// This list is empty until we enter the AUTO_UPDATE_AGENT_GROUP_STATE_CANARY state.
repeated Canary canaries = 14;
}

// AutoUpdateAgentGroupState represents the agent group state. This state controls whether the agents from this group
Expand All @@ -247,6 +257,9 @@ enum AutoUpdateAgentGroupState {
// AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK represents that the group has been rolled back.
// New agents should run v1, existing agents should update to v1.
AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK = 4;
// AUTO_UPDATE_AGENT_GROUP_STATE_CANARY represents that the group is updating a few canary nodes, but that most nodes
// have not started updating yet.
AUTO_UPDATE_AGENT_GROUP_STATE_CANARY = 5;
}

// AutoUpdateAgentRolloutState represents the rollout state. This tells if Teleport started updating agents from the
Expand Down Expand Up @@ -305,3 +318,18 @@ message AutoUpdateAgentReportSpecOmitted {
int64 count = 1;
string reason = 2;
}

// Canary describes a node that is acting as a canary and being updated before other nodes in its group.
// Canary entries are stored in the `canaries` field of AutoUpdateAgentRolloutStatusGroup.
message Canary {
// updater_id is reported by the agent in its control stream Hello. It uniquely identifies an updater so that
// the proxy can modulate its answer when the request comes from this specific updater.
string updater_id = 1;
// host_id is the node Host ID, reported by the agent in its control stream Hello.
string host_id = 2;
// hostname is the server hostname reported by the agent in its control stream Hello.
// This is purely for debugging purposes: if the agent drops, we won't be able to query the inventory to know which
// agent it was.
string hostname = 3;
// success reports whether the agent successfully connected back while running the target version.
bool success = 4;
}
6 changes: 6 additions & 0 deletions api/types/autoupdate/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ func checkAgentSchedules(c *autoupdate.AutoUpdateConfig) error {
if group.StartHour > 23 || group.StartHour < 0 {
return trace.BadParameter("spec.agents.schedules.regular[%d].start_hour must be between 0 and 23", i)
}
if group.CanaryCount < 0 || group.CanaryCount > MaxCanaryCount {
return trace.BadParameter("spec.agents.schedule.regular[%d].canary_count must be between 0 and %d", i, MaxCanaryCount)
}
if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.CanaryCount != 0 {
return trace.BadParameter("spec.agents.schedules.regular[%d].canary_count is not zero but the strategy %q doesn't support canaries", i, AgentsStrategyTimeBased)
}
if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.WaitHours != 0 {
return trace.BadParameter("spec.agents.schedules.regular[%d].wait_hours must be zero when strategy is %s", i, AgentsStrategyTimeBased)
}
Expand Down
22 changes: 22 additions & 0 deletions api/types/autoupdate/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,28 @@ func TestValidateAutoUpdateConfig(t *testing.T) {
},
assertErr: require.Error,
},
{
name: "group with too many canaries",
config: &autoupdate.AutoUpdateConfig{
Kind: types.KindAutoUpdateConfig,
Version: types.V1,
Metadata: &headerv1.Metadata{
Name: types.MetaNameAutoUpdateConfig,
},
Spec: &autoupdate.AutoUpdateConfigSpec{
Agents: &autoupdate.AutoUpdateConfigSpecAgents{
Mode: AgentsUpdateModeEnabled,
Strategy: AgentsStrategyHaltOnError,
Schedules: &autoupdate.AgentAutoUpdateSchedules{
Regular: []*autoupdate.AgentAutoUpdateGroup{
{Name: "g1", Days: []string{"*"}, WaitHours: 0, CanaryCount: 123},
},
},
},
},
},
assertErr: require.Error,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down
5 changes: 5 additions & 0 deletions api/types/autoupdate/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,9 @@ const (
// maintenance window. There is no dependency between groups. Agents won't be instructed to update
// if the window is over.
AgentsStrategyTimeBased = "time-based"

// MaxCanaryCount is the maximum number of canaries allowed for a single group.
// This value is kept arbitrarily low to prevent very large rollouts from growing
// over the max backend item size.
MaxCanaryCount = 5
)
6 changes: 6 additions & 0 deletions api/types/autoupdate/rollout.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ func ValidateAutoUpdateAgentRollout(v *autoupdate.AutoUpdateAgentRollout) error
if v.Spec.Strategy == AgentsStrategyHaltOnError && i == 0 && group.ConfigWaitHours != 0 {
return trace.BadParameter("status.schedules.groups[0].config_wait_hours must be zero as it's the first group")
}
if group.CanaryCount > MaxCanaryCount {
return trace.BadParameter("status.schedules.groups[%d].canary_count must be less than %d", i, MaxCanaryCount)
}
if len(group.Canaries) > MaxCanaryCount {
return trace.BadParameter("status.schedules.groups[%d].canaries must be contain less than %d elements", i, MaxCanaryCount)
}
if conflictingGroup, ok := seenGroups[group.Name]; ok {
return trace.BadParameter("spec.agents.schedules.regular contains groups with the same name %q at indices %d and %d", group.Name, conflictingGroup, i)
}
Expand Down
17 changes: 17 additions & 0 deletions api/types/autoupdate/rollout_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,23 @@ func TestValidateAutoUpdateAgentRollout(t *testing.T) {
},
assertErr: require.Error,
},
{
name: "group with too high canary count",
rollout: &autoupdate.AutoUpdateAgentRollout{
Kind: types.KindAutoUpdateAgentRollout,
Version: types.V1,
Metadata: &headerv1.Metadata{
Name: types.MetaNameAutoUpdateAgentRollout,
},
Spec: &haltOnErrorRolloutSpec,
Status: &autoupdate.AutoUpdateAgentRolloutStatus{
Groups: []*autoupdate.AutoUpdateAgentRolloutStatusGroup{
{Name: "g1", ConfigDays: []string{"*"}, CanaryCount: 15},
},
},
},
assertErr: require.Error,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ resource, which you can apply after installing the Teleport Kubernetes operator.

|Field|Type|Description|
|---|---|---|
|canary_count|integer|canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.|
|days|[]string|days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"|
|name|string|name of the group|
|start_hour|integer|start_hour to initiate update|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Optional:

Optional:

- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.
- `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"
- `name` (String) name of the group
- `start_hour` (Number) start_hour to initiate update
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Optional:

Optional:

- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.
- `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"
- `name` (String) name of the group
- `start_hour` (Number) start_hour to initiate update
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ spec:
description: regular schedules for non-critical versions.
items:
properties:
canary_count:
description: canary_count is the number of canary agents
that will be updated before the whole group is updated.
when set to 0, the group does not enter the canary
phase. This number is capped to 5. This number must
always be lower than the total number of agents in
the group, else the rollout will be stuck.
format: int32
type: integer
days:
description: days when the update can run. Supported
values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ spec:
description: regular schedules for non-critical versions.
items:
properties:
canary_count:
description: canary_count is the number of canary agents
that will be updated before the whole group is updated.
when set to 0, the group does not enter the canary
phase. This number is capped to 5. This number must
always be lower than the total number of agents in
the group, else the rollout will be stuck.
format: int32
type: integer
days:
description: days when the update can run. Supported
values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat",
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading