Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions api/proto/teleport/autoupdate/v1/autoupdate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ message AgentAutoUpdateGroup {
// wait_hours after last group succeeds before this group can run. This can only be used when the strategy is "halt-on-failure".
// This field must be positive.
int32 wait_hours = 5;
// canary_count is the number of canary agents that will be updated before the whole group is updated.
// When set to 0, the group does not enter the canary phase. The maximum allowed value is 5.
// This number must always be lower than the total number of agents in the group, otherwise the rollout will get stuck.
int32 canary_count = 6;
}

// AutoUpdateVersion is a resource singleton with version required for
Expand Down
6 changes: 6 additions & 0 deletions api/types/autoupdate/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ func checkAgentSchedules(c *autoupdate.AutoUpdateConfig) error {
if group.StartHour > 23 || group.StartHour < 0 {
return trace.BadParameter("spec.agents.schedules.regular[%d].start_hour must be between 0 and 23", i)
}
// Reject canary counts outside the inclusive range [0, MaxCanaryCount].
// The reported field path uses "schedules" (plural) so it matches the actual
// config field and the sibling validation errors for start_hour and wait_hours.
if group.CanaryCount < 0 || group.CanaryCount > MaxCanaryCount {
	return trace.BadParameter("spec.agents.schedules.regular[%d].canary_count must be between 0 and %d", i, MaxCanaryCount)
}
if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.CanaryCount != 0 {
return trace.BadParameter("spec.agents.schedules.regular[%d].canary_count is not zero but the strategy %q doesn't support canaries", i, AgentsStrategyTimeBased)
}
if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.WaitHours != 0 {
return trace.BadParameter("spec.agents.schedules.regular[%d].wait_hours must be zero when strategy is %s", i, AgentsStrategyTimeBased)
}
Expand Down
22 changes: 22 additions & 0 deletions api/types/autoupdate/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,28 @@ func TestValidateAutoUpdateConfig(t *testing.T) {
},
assertErr: require.Error,
},
{
name: "group with too many canaries",
config: &autoupdate.AutoUpdateConfig{
Kind: types.KindAutoUpdateConfig,
Version: types.V1,
Metadata: &headerv1.Metadata{
Name: types.MetaNameAutoUpdateConfig,
},
Spec: &autoupdate.AutoUpdateConfigSpec{
Agents: &autoupdate.AutoUpdateConfigSpecAgents{
Mode: AgentsUpdateModeEnabled,
Strategy: AgentsStrategyHaltOnError,
Schedules: &autoupdate.AgentAutoUpdateSchedules{
Regular: []*autoupdate.AgentAutoUpdateGroup{
{Name: "g1", Days: []string{"*"}, WaitHours: 0, CanaryCount: 123},
},
},
},
},
},
assertErr: require.Error,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion api/types/autoupdate/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@ const (
// MaxCanaryCount is the maximum number of canaries allowed for a single group.
// This value is arbitrarily low to prevent very large rollouts from growing
// beyond the max backend item size.
MaxCanaryCount = 10
MaxCanaryCount = 5
)
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ resource, which you can apply after installing the Teleport Kubernetes operator.

|Field|Type|Description|
|---|---|---|
|canary_count|integer|canary_count is the number of canary agents that will be updated before the whole group is updated. When set to 0, the group does not enter the canary phase. The maximum allowed value is 5. This number must always be lower than the total number of agents in the group, otherwise the rollout will get stuck.|
|days|[]string|days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"|
|name|string|name of the group|
|start_hour|integer|start_hour to initiate update|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Optional:

Optional:

- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. When set to 0, the group does not enter the canary phase. The maximum allowed value is 5. This number must always be lower than the total number of agents in the group, otherwise the rollout will get stuck.
- `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"
- `name` (String) name of the group
- `start_hour` (Number) start_hour to initiate update
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Optional:

Optional:

- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. When set to 0, the group does not enter the canary phase. The maximum allowed value is 5. This number must always be lower than the total number of agents in the group, otherwise the rollout will get stuck.
- `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"
- `name` (String) name of the group
- `start_hour` (Number) start_hour to initiate update
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ spec:
description: regular schedules for non-critical versions.
items:
properties:
canary_count:
description: canary_count is the number of canary agents
that will be updated before the whole group is updated.
when set to 0, the group does not enter the canary
phase. This number is capped to 5. This number must
always be lower than the total number of agents in
the group, else the rollout will be stuck.
format: int32
type: integer
days:
description: days when the update can run. Supported
values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,15 @@ spec:
description: regular schedules for non-critical versions.
items:
properties:
canary_count:
description: canary_count is the number of canary agents
that will be updated before the whole group is updated.
when set to 0, the group does not enter the canary
phase. This number is capped to 5. This number must
always be lower than the total number of agents in
the group, else the rollout will be stuck.
format: int32
type: integer
days:
description: days when the update can run. Supported
values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat",
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions lib/autoupdate/rollout/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ func (r *reconciler) makeGroupsStatus(ctx context.Context, schedules *autoupdate
ConfigDays: group.Days,
ConfigStartHour: group.StartHour,
ConfigWaitHours: group.WaitHours,
CanaryCount: uint64(group.CanaryCount),
}
}
return groups, nil
Expand Down
6 changes: 4 additions & 2 deletions lib/autoupdate/rollout/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@ func setGroupState(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, newState
if previousState != newState {
group.State = newState
changed = true
// If we just started the group, also update the start time
if newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE ||
// If we just started the group, also update the start time.
// If we are doing a canary -> active transition, we don't override the start date.
if (newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE &&
previousState != autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY) ||
newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY {
group.StartTime = timestamppb.New(now)
}
Expand Down
14 changes: 3 additions & 11 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,21 +189,16 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, spec *autoupd
return nil
}

const (
canaryCount = 5
canaryThreshold = 20
)

func shouldUseCanaries(currentCount int) bool {
func shouldUseCanaries(group *autoupdate.AutoUpdateAgentRolloutStatusGroup) bool {
// in the future we might change this logic to be a multiple of the required canary count
// and make the canary count dynamic
return currentCount >= canaryThreshold
return group.CanaryCount > 0
}

func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, agentCount int, status *autoupdate.AutoUpdateAgentRolloutStatus) {
group.InitialCount = uint64(agentCount)

if !shouldUseCanaries(agentCount) {
if !shouldUseCanaries(group) {
h.log.DebugContext(ctx, "Skipping canary rollout, transitioning directly to the active state", "group", group.Name)
setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now)
return
Expand All @@ -218,9 +213,6 @@ func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate.
}

func (h *haltOnErrorStrategy) sampleCanaries(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
if group.CanaryCount == 0 {
group.CanaryCount = canaryCount
}
// Check if we need to pick more canaries
if len(group.Canaries) < int(group.CanaryCount) {
previousLength := len(group.Canaries)
Expand Down
8 changes: 5 additions & 3 deletions lib/autoupdate/rollout/strategy_haltonerror_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
ConfigStartHour: matchingStartHour,
PresentCount: 12,
UpToDateCount: 3,
CanaryCount: 5,
},
},
reports: canaryTestReports,
Expand Down Expand Up @@ -872,9 +873,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
InitialCount: 34,
PresentCount: 34,
UpToDateCount: 10,
// Checking that if CanaryCount is not set/null (e.g. we came from a manual transition)
// We still set it instead of jumping to the active state.
CanaryCount: 0,
CanaryCount: 5,
},
},
reports: canaryTestReports,
Expand Down Expand Up @@ -940,6 +939,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
{
Name: group1Name,
State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY,
StartTime: timestamppb.New(clock.Now()),
LastUpdateTime: timestamppb.New(clock.Now()),
LastUpdateReason: updateReasonCanStart,
ConfigDays: canStartToday,
Expand Down Expand Up @@ -978,6 +978,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
{
Name: group1Name,
State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY,
StartTime: timestamppb.New(clock.Now()),
LastUpdateTime: timestamppb.New(clock.Now()),
LastUpdateReason: updateReasonCanStart,
ConfigDays: canStartToday,
Expand Down Expand Up @@ -1058,6 +1059,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) {
Name: group1Name,
State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY,
LastUpdateTime: timestamppb.New(clock.Now()),
StartTime: timestamppb.New(clock.Now()),
LastUpdateReason: updateReasonCanStart,
ConfigDays: canStartToday,
ConfigStartHour: matchingStartHour,
Expand Down
2 changes: 1 addition & 1 deletion lib/autoupdate/rollout/transitions.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au

switch desiredState {
case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED:
if shouldUseCanaries(initialCount) {
if shouldUseCanaries(group) {
// We switch to the canary state but we don't sample canaries now.
// Canary sampling will happen during the next reconciliation.
desiredState = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY
Expand Down
Loading