diff --git a/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go b/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go index acd17cbe9f817..a2aef34867de0 100644 --- a/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go +++ b/api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go @@ -481,7 +481,11 @@ type AgentAutoUpdateGroup struct { StartHour int32 `protobuf:"varint,3,opt,name=start_hour,json=startHour,proto3" json:"start_hour,omitempty"` // wait_hours after last group succeeds before this group can run. This can only be used when the strategy is "halt-on-failure". // This field must be positive. - WaitHours int32 `protobuf:"varint,5,opt,name=wait_hours,json=waitHours,proto3" json:"wait_hours,omitempty"` + WaitHours int32 `protobuf:"varint,5,opt,name=wait_hours,json=waitHours,proto3" json:"wait_hours,omitempty"` + // canary_count is the number of canary agents that will be updated before the whole group is updated. + // when set to 0, the group does not enter the canary phase. This number is capped to 5. + // This number must always be lower than the total number of agents in the group, else the rollout will be stuck. + CanaryCount int32 `protobuf:"varint,6,opt,name=canary_count,json=canaryCount,proto3" json:"canary_count,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -544,6 +548,13 @@ func (x *AgentAutoUpdateGroup) GetWaitHours() int32 { return 0 } +func (x *AgentAutoUpdateGroup) GetCanaryCount() int32 { + if x != nil { + return x.CanaryCount + } + return 0 +} + // AutoUpdateVersion is a resource singleton with version required for // tools autoupdate. 
type AutoUpdateVersion struct { @@ -1627,14 +1638,15 @@ const file_teleport_autoupdate_v1_autoupdate_proto_rawDesc = "" + "\x1bmaintenance_window_duration\x18\x03 \x01(\v2\x19.google.protobuf.DurationR\x19maintenanceWindowDuration\x12N\n" + "\tschedules\x18\x06 \x01(\v20.teleport.autoupdate.v1.AgentAutoUpdateSchedulesR\tschedulesJ\x04\b\x05\x10\x06R\x0fagent_schedules\"b\n" + "\x18AgentAutoUpdateSchedules\x12F\n" + - "\aregular\x18\x01 \x03(\v2,.teleport.autoupdate.v1.AgentAutoUpdateGroupR\aregular\"\x8d\x01\n" + + "\aregular\x18\x01 \x03(\v2,.teleport.autoupdate.v1.AgentAutoUpdateGroupR\aregular\"\xb0\x01\n" + "\x14AgentAutoUpdateGroup\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x12\n" + "\x04days\x18\x02 \x03(\tR\x04days\x12\x1d\n" + "\n" + "start_hour\x18\x03 \x01(\x05R\tstartHour\x12\x1d\n" + "\n" + - "wait_hours\x18\x05 \x01(\x05R\twaitHoursJ\x04\b\x04\x10\x05R\twait_days\"\xd9\x01\n" + + "wait_hours\x18\x05 \x01(\x05R\twaitHours\x12!\n" + + "\fcanary_count\x18\x06 \x01(\x05R\vcanaryCountJ\x04\b\x04\x10\x05R\twait_days\"\xd9\x01\n" + "\x11AutoUpdateVersion\x12\x12\n" + "\x04kind\x18\x01 \x01(\tR\x04kind\x12\x19\n" + "\bsub_kind\x18\x02 \x01(\tR\asubKind\x12\x18\n" + diff --git a/api/proto/teleport/autoupdate/v1/autoupdate.proto b/api/proto/teleport/autoupdate/v1/autoupdate.proto index ff42592c6fb8c..005d29996c8d7 100644 --- a/api/proto/teleport/autoupdate/v1/autoupdate.proto +++ b/api/proto/teleport/autoupdate/v1/autoupdate.proto @@ -83,6 +83,10 @@ message AgentAutoUpdateGroup { // wait_hours after last group succeeds before this group can run. This can only be used when the strategy is "halt-on-failure". // This field must be positive. int32 wait_hours = 5; + // canary_count is the number of canary agents that will be updated before the whole group is updated. + // when set to 0, the group does not enter the canary phase. This number is capped to 5. 
+ // This number must always be lower than the total number of agents in the group, else the rollout will be stuck. + int32 canary_count = 6; } // AutoUpdateVersion is a resource singleton with version required for diff --git a/api/types/autoupdate/config.go b/api/types/autoupdate/config.go index ad79765895c0d..37a949606e470 100644 --- a/api/types/autoupdate/config.go +++ b/api/types/autoupdate/config.go @@ -104,6 +104,12 @@ func checkAgentSchedules(c *autoupdate.AutoUpdateConfig) error { if group.StartHour > 23 || group.StartHour < 0 { return trace.BadParameter("spec.agents.schedules.regular[%d].start_hour must be between 0 and 23", i) } + if group.CanaryCount < 0 || group.CanaryCount > MaxCanaryCount { + return trace.BadParameter("spec.agents.schedules.regular[%d].canary_count must be between 0 and %d", i, MaxCanaryCount) + } + if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.CanaryCount != 0 { + return trace.BadParameter("spec.agents.schedules.regular[%d].canary_count is not zero but the strategy %q doesn't support canaries", i, AgentsStrategyTimeBased) + } if c.Spec.Agents.Strategy == AgentsStrategyTimeBased && group.WaitHours != 0 { return trace.BadParameter("spec.agents.schedules.regular[%d].wait_hours must be zero when strategy is %s", i, AgentsStrategyTimeBased) } diff --git a/api/types/autoupdate/config_test.go b/api/types/autoupdate/config_test.go index 0981dd7e681c1..6a022f11500f9 100644 --- a/api/types/autoupdate/config_test.go +++ b/api/types/autoupdate/config_test.go @@ -465,6 +465,28 @@ func TestValidateAutoUpdateConfig(t *testing.T) { }, assertErr: require.Error, }, + { + name: "group with too many canaries", + config: &autoupdate.AutoUpdateConfig{ + Kind: types.KindAutoUpdateConfig, + Version: types.V1, + Metadata: &headerv1.Metadata{ + Name: types.MetaNameAutoUpdateConfig, + }, + Spec: &autoupdate.AutoUpdateConfigSpec{ + Agents: &autoupdate.AutoUpdateConfigSpecAgents{ + Mode: AgentsUpdateModeEnabled, + Strategy: 
AgentsStrategyHaltOnError, + Schedules: &autoupdate.AgentAutoUpdateSchedules{ + Regular: []*autoupdate.AgentAutoUpdateGroup{ + {Name: "g1", Days: []string{"*"}, WaitHours: 0, CanaryCount: 123}, + }, + }, + }, + }, + }, + assertErr: require.Error, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/api/types/autoupdate/constants.go b/api/types/autoupdate/constants.go index 36f75a82f9f7a..0c38c7b190f42 100644 --- a/api/types/autoupdate/constants.go +++ b/api/types/autoupdate/constants.go @@ -47,5 +47,5 @@ const ( // MaxCanaryCount is the maximum number of canaries allowed for a single group. // This value is arbitrarily low to avoid XXL rollouts to grow over the max backend // item size. - MaxCanaryCount = 10 + MaxCanaryCount = 5 ) diff --git a/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx b/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx index dc61e5925073c..ddf0db9e8d221 100644 --- a/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx +++ b/docs/pages/reference/operator-resources/resources-teleport-dev-autoupdateconfigsv1.mdx @@ -51,6 +51,7 @@ resource, which you can apply after installing the Teleport Kubernetes operator. |Field|Type|Description| |---|---|---| +|canary_count|integer|canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.| |days|[]string|days when the update can run. 
Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*"| |name|string|name of the group| |start_hour|integer|start_hour to initiate update| diff --git a/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx b/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx index 39a01d7553d7e..94336eb4cff48 100644 --- a/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx +++ b/docs/pages/reference/terraform-provider/data-sources/autoupdate_config.mdx @@ -53,6 +53,7 @@ Optional: Optional: +- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck. - `days` (List of String) days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*" - `name` (String) name of the group - `start_hour` (Number) start_hour to initiate update diff --git a/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx b/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx index 57e67d7dba440..414c896f9c5fe 100644 --- a/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx +++ b/docs/pages/reference/terraform-provider/resources/autoupdate_config.mdx @@ -91,6 +91,7 @@ Optional: Optional: +- `canary_count` (Number) canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck. - `days` (List of String) days when the update can run. 
Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" and "*" - `name` (String) name of the group - `start_hour` (Number) start_hour to initiate update diff --git a/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml b/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml index f56da0ee7c415..8aaeff1d116ca 100644 --- a/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml +++ b/examples/chart/teleport-cluster/charts/teleport-operator/operator-crds/resources.teleport.dev_autoupdateconfigsv1.yaml @@ -60,6 +60,15 @@ spec: description: regular schedules for non-critical versions. items: properties: + canary_count: + description: canary_count is the number of canary agents + that will be updated before the whole group is updated. + when set to 0, the group does not enter the canary + phase. This number is capped to 5. This number must + always be lower than the total number of agents in + the group, else the rollout will be stuck. + format: int32 + type: integer days: description: days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", diff --git a/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml b/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml index f56da0ee7c415..8aaeff1d116ca 100644 --- a/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml +++ b/integrations/operator/config/crd/bases/resources.teleport.dev_autoupdateconfigsv1.yaml @@ -60,6 +60,15 @@ spec: description: regular schedules for non-critical versions. items: properties: + canary_count: + description: canary_count is the number of canary agents + that will be updated before the whole group is updated. 
+ when set to 0, the group does not enter the canary + phase. This number is capped to 5. This number must + always be lower than the total number of agents in + the group, else the rollout will be stuck. + format: int32 + type: integer days: description: days when the update can run. Supported values are "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", diff --git a/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go b/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go index fe0cbc478dd30..8ed84a264a197 100644 --- a/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go +++ b/integrations/terraform/tfschema/autoupdate/v1/autoupdate_terraform.go @@ -117,6 +117,11 @@ func GenSchemaAutoUpdateConfig(ctx context.Context) (github_com_hashicorp_terraf "schedules": { Attributes: github_com_hashicorp_terraform_plugin_framework_tfsdk.SingleNestedAttributes(map[string]github_com_hashicorp_terraform_plugin_framework_tfsdk.Attribute{"regular": { Attributes: github_com_hashicorp_terraform_plugin_framework_tfsdk.ListNestedAttributes(map[string]github_com_hashicorp_terraform_plugin_framework_tfsdk.Attribute{ + "canary_count": { + Description: "canary_count is the number of canary agents that will be updated before the whole group is updated. when set to 0, the group does not enter the canary phase. This number is capped to 5. This number must always be lower than the total number of agents in the group, else the rollout will be stuck.", + Optional: true, + Type: github_com_hashicorp_terraform_plugin_framework_types.Int64Type, + }, "days": { Description: "days when the update can run. 
Supported values are \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\" and \"*\"", Optional: true, @@ -683,6 +688,23 @@ func CopyAutoUpdateConfigFromTerraform(_ context.Context, tf github_com_hashicor } } } + { + a, ok := tf.Attrs["canary_count"] + if !ok { + diags.Append(attrReadMissingDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count"}) + } else { + v, ok := a.(github_com_hashicorp_terraform_plugin_framework_types.Int64) + if !ok { + diags.Append(attrReadConversionFailureDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count", "github.com/hashicorp/terraform-plugin-framework/types.Int64"}) + } else { + var t int32 + if !v.Null && !v.Unknown { + t = int32(v.Value) + } + obj.CanaryCount = t + } + } + } } obj.Regular[k] = t } @@ -1308,6 +1330,28 @@ func CopyAutoUpdateConfigToTerraform(ctx context.Context, obj *github_com_gravit tf.Attrs["wait_hours"] = v } } + { + t, ok := tf.AttrTypes["canary_count"] + if !ok { + diags.Append(attrWriteMissingDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count"}) + } else { + v, ok := tf.Attrs["canary_count"].(github_com_hashicorp_terraform_plugin_framework_types.Int64) + if !ok { + i, err := t.ValueFromTerraform(ctx, github_com_hashicorp_terraform_plugin_go_tftypes.NewValue(t.TerraformType(ctx), nil)) + if err != nil { + diags.Append(attrWriteGeneralError{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count", err}) + } + v, ok = i.(github_com_hashicorp_terraform_plugin_framework_types.Int64) + if !ok { + diags.Append(attrWriteConversionFailureDiag{"AutoUpdateConfig.spec.agents.schedules.regular.canary_count", "github.com/hashicorp/terraform-plugin-framework/types.Int64"}) + } + v.Null = int64(obj.CanaryCount) == 0 + } + v.Value = int64(obj.CanaryCount) + v.Unknown = false + tf.Attrs["canary_count"] = v + } + } } v.Unknown = false c.Elems[k] = v diff --git a/lib/autoupdate/rollout/reconciler.go b/lib/autoupdate/rollout/reconciler.go index cddd023f4e9b2..0185893035acb 
100644 --- a/lib/autoupdate/rollout/reconciler.go +++ b/lib/autoupdate/rollout/reconciler.go @@ -388,6 +388,7 @@ func (r *reconciler) makeGroupsStatus(ctx context.Context, schedules *autoupdate ConfigDays: group.Days, ConfigStartHour: group.StartHour, ConfigWaitHours: group.WaitHours, + CanaryCount: uint64(group.CanaryCount), } } return groups, nil diff --git a/lib/autoupdate/rollout/strategy.go b/lib/autoupdate/rollout/strategy.go index 3b3b524e54275..1bcfe90d4a5a6 100644 --- a/lib/autoupdate/rollout/strategy.go +++ b/lib/autoupdate/rollout/strategy.go @@ -97,8 +97,10 @@ func setGroupState(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, newState if previousState != newState { group.State = newState changed = true - // If we just started the group, also update the start time - if newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE || + // If we just started the group, also update the start time. + // If we are doing a canary -> active transition, we don't override the start time. 
+ if (newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE && + previousState != autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY) || newState == autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY { group.StartTime = timestamppb.New(now) } diff --git a/lib/autoupdate/rollout/strategy_haltonerror.go b/lib/autoupdate/rollout/strategy_haltonerror.go index 51bbceb87729f..5741bdfe46136 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror.go +++ b/lib/autoupdate/rollout/strategy_haltonerror.go @@ -189,21 +189,16 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, spec *autoupd return nil } -const ( - canaryCount = 5 - canaryThreshold = 20 -) - -func shouldUseCanaries(currentCount int) bool { +func shouldUseCanaries(group *autoupdate.AutoUpdateAgentRolloutStatusGroup) bool { // in the future we might change this logic to be a multiple of the required canary count // and make the canary count dynamic - return currentCount >= canaryThreshold + return group.CanaryCount > 0 } func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time, agentCount int, status *autoupdate.AutoUpdateAgentRolloutStatus) { group.InitialCount = uint64(agentCount) - if !shouldUseCanaries(agentCount) { + if !shouldUseCanaries(group) { h.log.DebugContext(ctx, "Skipping canary rollout, transitioning directly to the active state", "group", group.Name) setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now) return @@ -218,9 +213,6 @@ func (h *haltOnErrorStrategy) startGroup(ctx context.Context, group *autoupdate. 
} func (h *haltOnErrorStrategy) sampleCanaries(ctx context.Context, group *autoupdate.AutoUpdateAgentRolloutStatusGroup, status *autoupdate.AutoUpdateAgentRolloutStatus) error { - if group.CanaryCount == 0 { - group.CanaryCount = canaryCount - } // Check if we need to pick more canaries if len(group.Canaries) < int(group.CanaryCount) { previousLength := len(group.Canaries) diff --git a/lib/autoupdate/rollout/strategy_haltonerror_test.go b/lib/autoupdate/rollout/strategy_haltonerror_test.go index 15a48c4d68f4c..8b7e9ecf284e4 100644 --- a/lib/autoupdate/rollout/strategy_haltonerror_test.go +++ b/lib/autoupdate/rollout/strategy_haltonerror_test.go @@ -837,6 +837,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { ConfigStartHour: matchingStartHour, PresentCount: 12, UpToDateCount: 3, + CanaryCount: 5, }, }, reports: canaryTestReports, @@ -872,9 +873,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { InitialCount: 34, PresentCount: 34, UpToDateCount: 10, - // Checking that if CanaryCount is not set/null (e.g. we came from a manual transition) - // We still set it instead of jumping to the active state. 
- CanaryCount: 0, + CanaryCount: 5, }, }, reports: canaryTestReports, @@ -940,6 +939,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { { Name: group1Name, State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + StartTime: timestamppb.New(clock.Now()), LastUpdateTime: timestamppb.New(clock.Now()), LastUpdateReason: updateReasonCanStart, ConfigDays: canStartToday, @@ -978,6 +978,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { { Name: group1Name, State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, + StartTime: timestamppb.New(clock.Now()), LastUpdateTime: timestamppb.New(clock.Now()), LastUpdateReason: updateReasonCanStart, ConfigDays: canStartToday, @@ -1058,6 +1059,7 @@ func Test_progressGroupsHaltOnError(t *testing.T) { Name: group1Name, State: autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY, LastUpdateTime: timestamppb.New(clock.Now()), + StartTime: timestamppb.New(clock.Now()), LastUpdateReason: updateReasonCanStart, ConfigDays: canStartToday, ConfigStartHour: matchingStartHour, diff --git a/lib/autoupdate/rollout/transitions.go b/lib/autoupdate/rollout/transitions.go index b9b67d183d6bd..22a15ac37aabc 100644 --- a/lib/autoupdate/rollout/transitions.go +++ b/lib/autoupdate/rollout/transitions.go @@ -118,7 +118,7 @@ func TriggerGroups(rollout *autoupdatev1pb.AutoUpdateAgentRollout, reports []*au switch desiredState { case autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSPECIFIED: - if shouldUseCanaries(initialCount) { + if shouldUseCanaries(group) { // We switch to the canary state but we don't sample canaries now. // Canary sampling will happen during the next reconciliation. desiredState = autoupdatev1pb.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_CANARY